/rust/registry/src/github.com-1ecc6299db9ec823/regex-1.4.3/src/compile.rs
Line | Count | Source (jump to first uncovered line) |
1 | | use std::collections::HashMap; |
2 | | use std::fmt; |
3 | | use std::iter; |
4 | | use std::result; |
5 | | use std::sync::Arc; |
6 | | |
7 | | use syntax::hir::{self, Hir}; |
8 | | use syntax::is_word_byte; |
9 | | use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; |
10 | | |
11 | | use prog::{ |
12 | | EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, |
13 | | InstSave, InstSplit, Program, |
14 | | }; |
15 | | |
16 | | use Error; |
17 | | |
18 | | type Result = result::Result<Patch, Error>; |
19 | | type ResultOrEmpty = result::Result<Option<Patch>, Error>; |
20 | | |
21 | | #[derive(Debug)] |
22 | | struct Patch { |
23 | | hole: Hole, |
24 | | entry: InstPtr, |
25 | | } |
26 | | |
27 | | /// A compiler translates a regular expression AST to a sequence of |
28 | | /// instructions. The sequence of instructions represents an NFA. |
29 | | // `Compiler` is only public via the `internal` module, so avoid deriving |
30 | | // `Debug`. |
31 | | #[allow(missing_debug_implementations)] |
32 | | pub struct Compiler { |
33 | | insts: Vec<MaybeInst>, |
34 | | compiled: Program, |
35 | | capture_name_idx: HashMap<String, usize>, |
36 | | num_exprs: usize, |
37 | | size_limit: usize, |
38 | | suffix_cache: SuffixCache, |
39 | | utf8_seqs: Option<Utf8Sequences>, |
40 | | byte_classes: ByteClassSet, |
41 | | } |
42 | | |
43 | | impl Compiler { |
44 | | /// Create a new regular expression compiler. |
45 | | /// |
46 | | /// Various options can be set before calling `compile` on an expression. |
47 | | pub fn new() -> Self { |
48 | | Compiler { |
49 | | insts: vec![], |
50 | | compiled: Program::new(), |
51 | | capture_name_idx: HashMap::new(), |
52 | | num_exprs: 0, |
53 | | size_limit: 10 * (1 << 20), |
54 | | suffix_cache: SuffixCache::new(1000), |
55 | | utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), |
56 | | byte_classes: ByteClassSet::new(), |
57 | | } |
58 | | } |
59 | | |
60 | | /// The size of the resulting program is limited by size_limit. If |
61 | | /// the program approximately exceeds the given size (in bytes), then |
62 | | /// compilation will stop and return an error. |
63 | | pub fn size_limit(mut self, size_limit: usize) -> Self { |
64 | | self.size_limit = size_limit; |
65 | | self |
66 | | } |
67 | | |
68 | | /// If bytes is true, then the program is compiled as a byte based |
69 | | /// automaton, which incorporates UTF-8 decoding into the machine. If it's |
70 | | /// false, then the automaton is Unicode scalar value based, e.g., an |
71 | | /// engine utilizing such an automaton is responsible for UTF-8 decoding. |
72 | | /// |
73 | | /// The specific invariant is that when returning a byte based machine, |
74 | | /// the neither the `Char` nor `Ranges` instructions are produced. |
75 | | /// Conversely, when producing a Unicode scalar value machine, the `Bytes` |
76 | | /// instruction is never produced. |
77 | | /// |
78 | | /// Note that `dfa(true)` implies `bytes(true)`. |
79 | | pub fn bytes(mut self, yes: bool) -> Self { |
80 | | self.compiled.is_bytes = yes; |
81 | | self |
82 | | } |
83 | | |
84 | | /// When disabled, the program compiled may match arbitrary bytes. |
85 | | /// |
86 | | /// When enabled (the default), all compiled programs exclusively match |
87 | | /// valid UTF-8 bytes. |
88 | | pub fn only_utf8(mut self, yes: bool) -> Self { |
89 | | self.compiled.only_utf8 = yes; |
90 | | self |
91 | | } |
92 | | |
93 | | /// When set, the machine returned is suitable for use in the DFA matching |
94 | | /// engine. |
95 | | /// |
96 | | /// In particular, this ensures that if the regex is not anchored in the |
97 | | /// beginning, then a preceding `.*?` is included in the program. (The NFA |
98 | | /// based engines handle the preceding `.*?` explicitly, which is difficult |
99 | | /// or impossible in the DFA engine.) |
100 | | pub fn dfa(mut self, yes: bool) -> Self { |
101 | | self.compiled.is_dfa = yes; |
102 | | self |
103 | | } |
104 | | |
105 | | /// When set, the machine returned is suitable for matching text in |
106 | | /// reverse. In particular, all concatenations are flipped. |
107 | | pub fn reverse(mut self, yes: bool) -> Self { |
108 | | self.compiled.is_reverse = yes; |
109 | | self |
110 | | } |
111 | | |
112 | | /// Compile a regular expression given its AST. |
113 | | /// |
114 | | /// The compiler is guaranteed to succeed unless the program exceeds the |
115 | | /// specified size limit. If the size limit is exceeded, then compilation |
116 | | /// stops and returns an error. |
117 | 0 | pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> { |
118 | | debug_assert!(!exprs.is_empty()); |
119 | 0 | self.num_exprs = exprs.len(); |
120 | 0 | if exprs.len() == 1 { |
121 | 0 | self.compile_one(&exprs[0]) |
122 | | } else { |
123 | 0 | self.compile_many(exprs) |
124 | | } |
125 | 0 | } |
126 | | |
127 | 0 | fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { |
128 | 0 | // If we're compiling a forward DFA and we aren't anchored, then |
129 | 0 | // add a `.*?` before the first capture group. |
130 | 0 | // Other matching engines handle this by baking the logic into the |
131 | 0 | // matching engine itself. |
132 | 0 | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; |
133 | 0 | self.compiled.is_anchored_start = expr.is_anchored_start(); |
134 | 0 | self.compiled.is_anchored_end = expr.is_anchored_end(); |
135 | 0 | if self.compiled.needs_dotstar() { |
136 | 0 | dotstar_patch = self.c_dotstar()?; |
137 | 0 | self.compiled.start = dotstar_patch.entry; |
138 | 0 | } |
139 | 0 | self.compiled.captures = vec![None]; |
140 | 0 | let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst()); |
141 | 0 | if self.compiled.needs_dotstar() { |
142 | 0 | self.fill(dotstar_patch.hole, patch.entry); |
143 | 0 | } else { |
144 | 0 | self.compiled.start = patch.entry; |
145 | 0 | } |
146 | 0 | self.fill_to_next(patch.hole); |
147 | 0 | self.compiled.matches = vec![self.insts.len()]; |
148 | 0 | self.push_compiled(Inst::Match(0)); |
149 | 0 | self.compile_finish() |
150 | 0 | } |
151 | | |
152 | 0 | fn compile_many( |
153 | 0 | mut self, |
154 | 0 | exprs: &[Hir], |
155 | 0 | ) -> result::Result<Program, Error> { |
156 | | debug_assert!(exprs.len() > 1); |
157 | | |
158 | 0 | self.compiled.is_anchored_start = |
159 | 0 | exprs.iter().all(|e| e.is_anchored_start()); |
160 | 0 | self.compiled.is_anchored_end = |
161 | 0 | exprs.iter().all(|e| e.is_anchored_end()); |
162 | 0 | let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; |
163 | 0 | if self.compiled.needs_dotstar() { |
164 | 0 | dotstar_patch = self.c_dotstar()?; |
165 | 0 | self.compiled.start = dotstar_patch.entry; |
166 | 0 | } else { |
167 | 0 | self.compiled.start = 0; // first instruction is always split |
168 | 0 | } |
169 | 0 | self.fill_to_next(dotstar_patch.hole); |
170 | 0 |
|
171 | 0 | let mut prev_hole = Hole::None; |
172 | 0 | for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { |
173 | 0 | self.fill_to_next(prev_hole); |
174 | 0 | let split = self.push_split_hole(); |
175 | 0 | let Patch { hole, entry } = |
176 | 0 | self.c_capture(0, expr)?.unwrap_or(self.next_inst()); |
177 | 0 | self.fill_to_next(hole); |
178 | 0 | self.compiled.matches.push(self.insts.len()); |
179 | 0 | self.push_compiled(Inst::Match(i)); |
180 | 0 | prev_hole = self.fill_split(split, Some(entry), None); |
181 | | } |
182 | 0 | let i = exprs.len() - 1; |
183 | 0 | let Patch { hole, entry } = |
184 | 0 | self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst()); |
185 | 0 | self.fill(prev_hole, entry); |
186 | 0 | self.fill_to_next(hole); |
187 | 0 | self.compiled.matches.push(self.insts.len()); |
188 | 0 | self.push_compiled(Inst::Match(i)); |
189 | 0 | self.compile_finish() |
190 | 0 | } |
191 | | |
192 | 0 | fn compile_finish(mut self) -> result::Result<Program, Error> { |
193 | 0 | self.compiled.insts = |
194 | 0 | self.insts.into_iter().map(|inst| inst.unwrap()).collect(); |
195 | 0 | self.compiled.byte_classes = self.byte_classes.byte_classes(); |
196 | 0 | self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); |
197 | 0 | Ok(self.compiled) |
198 | 0 | } |
199 | | |
200 | | /// Compile expr into self.insts, returning a patch on success, |
201 | | /// or an error if we run out of memory. |
202 | | /// |
203 | | /// All of the c_* methods of the compiler share the contract outlined |
204 | | /// here. |
205 | | /// |
206 | | /// The main thing that a c_* method does is mutate `self.insts` |
207 | | /// to add a list of mostly compiled instructions required to execute |
208 | | /// the given expression. `self.insts` contains MaybeInsts rather than |
209 | | /// Insts because there is some backpatching required. |
210 | | /// |
211 | | /// The `Patch` value returned by each c_* method provides metadata |
212 | | /// about the compiled instructions emitted to `self.insts`. The |
213 | | /// `entry` member of the patch refers to the first instruction |
214 | | /// (the entry point), while the `hole` member contains zero or |
215 | | /// more offsets to partial instructions that need to be backpatched. |
216 | | /// The c_* routine can't know where its list of instructions are going to |
217 | | /// jump to after execution, so it is up to the caller to patch |
218 | | /// these jumps to point to the right place. So compiling some |
219 | | /// expression, e, we would end up with a situation that looked like: |
220 | | /// |
221 | | /// ```text |
222 | | /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] |
223 | | /// ^ ^ ^ |
224 | | /// | \ / |
225 | | /// entry \ / |
226 | | /// hole |
227 | | /// ``` |
228 | | /// |
229 | | /// To compile two expressions, e1 and e2, concatenated together we |
230 | | /// would do: |
231 | | /// |
232 | | /// ```ignore |
233 | | /// let patch1 = self.c(e1); |
234 | | /// let patch2 = self.c(e2); |
235 | | /// ``` |
236 | | /// |
237 | | /// while leaves us with a situation that looks like |
238 | | /// |
239 | | /// ```text |
240 | | /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] |
241 | | /// ^ ^ ^ ^ |
242 | | /// | | | | |
243 | | /// entry1 hole1 entry2 hole2 |
244 | | /// ``` |
245 | | /// |
246 | | /// Then to merge the two patches together into one we would backpatch |
247 | | /// hole1 with entry2 and return a new patch that enters at entry1 |
248 | | /// and has hole2 for a hole. In fact, if you look at the c_concat |
249 | | /// method you will see that it does exactly this, though it handles |
250 | | /// a list of expressions rather than just the two that we use for |
251 | | /// an example. |
252 | | /// |
253 | | /// Ok(None) is returned when an expression is compiled to no |
254 | | /// instruction, and so no patch.entry value makes sense. |
255 | 0 | fn c(&mut self, expr: &Hir) -> ResultOrEmpty { |
256 | | use prog; |
257 | | use syntax::hir::HirKind::*; |
258 | | |
259 | 0 | self.check_size()?; |
260 | 0 | match *expr.kind() { |
261 | 0 | Empty => Ok(None), |
262 | 0 | Literal(hir::Literal::Unicode(c)) => self.c_char(c), |
263 | 0 | Literal(hir::Literal::Byte(b)) => { |
264 | 0 | assert!(self.compiled.uses_bytes()); |
265 | 0 | self.c_byte(b) |
266 | | } |
267 | 0 | Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), |
268 | 0 | Class(hir::Class::Bytes(ref cls)) => { |
269 | 0 | if self.compiled.uses_bytes() { |
270 | 0 | self.c_class_bytes(cls.ranges()) |
271 | | } else { |
272 | 0 | assert!(cls.is_all_ascii()); |
273 | 0 | let mut char_ranges = vec![]; |
274 | 0 | for r in cls.iter() { |
275 | 0 | let (s, e) = (r.start() as char, r.end() as char); |
276 | 0 | char_ranges.push(hir::ClassUnicodeRange::new(s, e)); |
277 | 0 | } |
278 | 0 | self.c_class(&char_ranges) |
279 | | } |
280 | | } |
281 | 0 | Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { |
282 | 0 | self.byte_classes.set_range(b'\n', b'\n'); |
283 | 0 | self.c_empty_look(prog::EmptyLook::EndLine) |
284 | | } |
285 | | Anchor(hir::Anchor::StartLine) => { |
286 | 0 | self.byte_classes.set_range(b'\n', b'\n'); |
287 | 0 | self.c_empty_look(prog::EmptyLook::StartLine) |
288 | | } |
289 | 0 | Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { |
290 | 0 | self.byte_classes.set_range(b'\n', b'\n'); |
291 | 0 | self.c_empty_look(prog::EmptyLook::StartLine) |
292 | | } |
293 | | Anchor(hir::Anchor::EndLine) => { |
294 | 0 | self.byte_classes.set_range(b'\n', b'\n'); |
295 | 0 | self.c_empty_look(prog::EmptyLook::EndLine) |
296 | | } |
297 | 0 | Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { |
298 | 0 | self.c_empty_look(prog::EmptyLook::EndText) |
299 | | } |
300 | | Anchor(hir::Anchor::StartText) => { |
301 | 0 | self.c_empty_look(prog::EmptyLook::StartText) |
302 | | } |
303 | 0 | Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { |
304 | 0 | self.c_empty_look(prog::EmptyLook::StartText) |
305 | | } |
306 | | Anchor(hir::Anchor::EndText) => { |
307 | 0 | self.c_empty_look(prog::EmptyLook::EndText) |
308 | | } |
309 | 0 | WordBoundary(hir::WordBoundary::Unicode) => { |
310 | 0 | if !cfg!(feature = "unicode-perl") { |
311 | | return Err(Error::Syntax( |
312 | | "Unicode word boundaries are unavailable when \ |
313 | | the unicode-perl feature is disabled" |
314 | | .to_string(), |
315 | | )); |
316 | 0 | } |
317 | 0 | self.compiled.has_unicode_word_boundary = true; |
318 | 0 | self.byte_classes.set_word_boundary(); |
319 | 0 | self.c_empty_look(prog::EmptyLook::WordBoundary) |
320 | | } |
321 | | WordBoundary(hir::WordBoundary::UnicodeNegate) => { |
322 | 0 | if !cfg!(feature = "unicode-perl") { |
323 | | return Err(Error::Syntax( |
324 | | "Unicode word boundaries are unavailable when \ |
325 | | the unicode-perl feature is disabled" |
326 | | .to_string(), |
327 | | )); |
328 | 0 | } |
329 | 0 | self.compiled.has_unicode_word_boundary = true; |
330 | 0 | self.byte_classes.set_word_boundary(); |
331 | 0 | self.c_empty_look(prog::EmptyLook::NotWordBoundary) |
332 | | } |
333 | | WordBoundary(hir::WordBoundary::Ascii) => { |
334 | 0 | self.byte_classes.set_word_boundary(); |
335 | 0 | self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) |
336 | | } |
337 | | WordBoundary(hir::WordBoundary::AsciiNegate) => { |
338 | 0 | self.byte_classes.set_word_boundary(); |
339 | 0 | self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) |
340 | | } |
341 | 0 | Group(ref g) => match g.kind { |
342 | 0 | hir::GroupKind::NonCapturing => self.c(&g.hir), |
343 | 0 | hir::GroupKind::CaptureIndex(index) => { |
344 | 0 | if index as usize >= self.compiled.captures.len() { |
345 | 0 | self.compiled.captures.push(None); |
346 | 0 | } |
347 | 0 | self.c_capture(2 * index as usize, &g.hir) |
348 | | } |
349 | 0 | hir::GroupKind::CaptureName { index, ref name } => { |
350 | 0 | if index as usize >= self.compiled.captures.len() { |
351 | 0 | let n = name.to_string(); |
352 | 0 | self.compiled.captures.push(Some(n.clone())); |
353 | 0 | self.capture_name_idx.insert(n, index as usize); |
354 | 0 | } |
355 | 0 | self.c_capture(2 * index as usize, &g.hir) |
356 | | } |
357 | | }, |
358 | 0 | Concat(ref es) => { |
359 | 0 | if self.compiled.is_reverse { |
360 | 0 | self.c_concat(es.iter().rev()) |
361 | | } else { |
362 | 0 | self.c_concat(es) |
363 | | } |
364 | | } |
365 | 0 | Alternation(ref es) => self.c_alternate(&**es), |
366 | 0 | Repetition(ref rep) => self.c_repeat(rep), |
367 | | } |
368 | 0 | } |
369 | | |
370 | 0 | fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { |
371 | 0 | if self.num_exprs > 1 || self.compiled.is_dfa { |
372 | | // Don't ever compile Save instructions for regex sets because |
373 | | // they are never used. They are also never used in DFA programs |
374 | | // because DFAs can't handle captures. |
375 | 0 | self.c(expr) |
376 | | } else { |
377 | 0 | let entry = self.insts.len(); |
378 | 0 | let hole = self.push_hole(InstHole::Save { slot: first_slot }); |
379 | 0 | let patch = self.c(expr)?.unwrap_or(self.next_inst()); |
380 | 0 | self.fill(hole, patch.entry); |
381 | 0 | self.fill_to_next(patch.hole); |
382 | 0 | let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); |
383 | 0 | Ok(Some(Patch { hole: hole, entry: entry })) |
384 | | } |
385 | 0 | } |
386 | | |
387 | 0 | fn c_dotstar(&mut self) -> Result { |
388 | 0 | Ok(if !self.compiled.only_utf8() { |
389 | 0 | self.c(&Hir::repetition(hir::Repetition { |
390 | 0 | kind: hir::RepetitionKind::ZeroOrMore, |
391 | 0 | greedy: false, |
392 | 0 | hir: Box::new(Hir::any(true)), |
393 | 0 | }))? |
394 | | .unwrap() |
395 | | } else { |
396 | 0 | self.c(&Hir::repetition(hir::Repetition { |
397 | 0 | kind: hir::RepetitionKind::ZeroOrMore, |
398 | 0 | greedy: false, |
399 | 0 | hir: Box::new(Hir::any(false)), |
400 | 0 | }))? |
401 | | .unwrap() |
402 | | }) |
403 | 0 | } |
404 | | |
405 | 0 | fn c_char(&mut self, c: char) -> ResultOrEmpty { |
406 | 0 | if self.compiled.uses_bytes() { |
407 | 0 | if c.is_ascii() { |
408 | 0 | let b = c as u8; |
409 | 0 | let hole = |
410 | 0 | self.push_hole(InstHole::Bytes { start: b, end: b }); |
411 | 0 | self.byte_classes.set_range(b, b); |
412 | 0 | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) |
413 | | } else { |
414 | 0 | self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) |
415 | | } |
416 | | } else { |
417 | 0 | let hole = self.push_hole(InstHole::Char { c: c }); |
418 | 0 | Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) |
419 | | } |
420 | 0 | } |
421 | | |
422 | 0 | fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { |
423 | 0 | assert!(!ranges.is_empty()); |
424 | 0 | if self.compiled.uses_bytes() { |
425 | 0 | Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?)) |
426 | | } else { |
427 | 0 | let ranges: Vec<(char, char)> = |
428 | 0 | ranges.iter().map(|r| (r.start(), r.end())).collect(); |
429 | 0 | let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { |
430 | 0 | self.push_hole(InstHole::Char { c: ranges[0].0 }) |
431 | | } else { |
432 | 0 | self.push_hole(InstHole::Ranges { ranges: ranges }) |
433 | | }; |
434 | 0 | Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) |
435 | | } |
436 | 0 | } |
437 | | |
438 | | fn c_byte(&mut self, b: u8) -> ResultOrEmpty { |
439 | | self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) |
440 | | } |
441 | | |
442 | 0 | fn c_class_bytes( |
443 | 0 | &mut self, |
444 | 0 | ranges: &[hir::ClassBytesRange], |
445 | 0 | ) -> ResultOrEmpty { |
446 | | debug_assert!(!ranges.is_empty()); |
447 | | |
448 | 0 | let first_split_entry = self.insts.len(); |
449 | 0 | let mut holes = vec![]; |
450 | 0 | let mut prev_hole = Hole::None; |
451 | 0 | for r in &ranges[0..ranges.len() - 1] { |
452 | 0 | self.fill_to_next(prev_hole); |
453 | 0 | let split = self.push_split_hole(); |
454 | 0 | let next = self.insts.len(); |
455 | 0 | self.byte_classes.set_range(r.start(), r.end()); |
456 | 0 | holes.push(self.push_hole(InstHole::Bytes { |
457 | 0 | start: r.start(), |
458 | 0 | end: r.end(), |
459 | 0 | })); |
460 | 0 | prev_hole = self.fill_split(split, Some(next), None); |
461 | 0 | } |
462 | 0 | let next = self.insts.len(); |
463 | 0 | let r = &ranges[ranges.len() - 1]; |
464 | 0 | self.byte_classes.set_range(r.start(), r.end()); |
465 | 0 | holes.push( |
466 | 0 | self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), |
467 | 0 | ); |
468 | 0 | self.fill(prev_hole, next); |
469 | 0 | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) |
470 | 0 | } |
471 | | |
472 | | fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { |
473 | | let hole = self.push_hole(InstHole::EmptyLook { look: look }); |
474 | | Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) |
475 | | } |
476 | | |
477 | 0 | fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty |
478 | 0 | where |
479 | 0 | I: IntoIterator<Item = &'a Hir>, |
480 | 0 | { |
481 | 0 | let mut exprs = exprs.into_iter(); |
482 | 0 | let Patch { mut hole, entry } = loop { |
483 | 0 | match exprs.next() { |
484 | 0 | None => return Ok(None), |
485 | 0 | Some(e) => { |
486 | 0 | if let Some(p) = self.c(e)? { |
487 | 0 | break p; |
488 | 0 | } |
489 | | } |
490 | | } |
491 | | }; |
492 | 0 | for e in exprs { |
493 | 0 | if let Some(p) = self.c(e)? { |
494 | 0 | self.fill(hole, p.entry); |
495 | 0 | hole = p.hole; |
496 | 0 | } |
497 | | } |
498 | 0 | Ok(Some(Patch { hole: hole, entry: entry })) |
499 | 0 | } Unexecuted instantiation: <regex::compile::Compiler>::c_concat::<core::iter::adapters::take::Take<core::iter::sources::repeat::Repeat<®ex_syntax::hir::Hir>>> Unexecuted instantiation: <regex::compile::Compiler>::c_concat::<&alloc::vec::Vec<regex_syntax::hir::Hir>> Unexecuted instantiation: <regex::compile::Compiler>::c_concat::<core::iter::adapters::rev::Rev<core::slice::iter::Iter<regex_syntax::hir::Hir>>> |
500 | | |
501 | 0 | fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { |
502 | | debug_assert!( |
503 | | exprs.len() >= 2, |
504 | | "alternates must have at least 2 exprs" |
505 | | ); |
506 | | |
507 | | // Initial entry point is always the first split. |
508 | 0 | let first_split_entry = self.insts.len(); |
509 | 0 |
|
510 | 0 | // Save up all of the holes from each alternate. They will all get |
511 | 0 | // patched to point to the same location. |
512 | 0 | let mut holes = vec![]; |
513 | 0 |
|
514 | 0 | // true indicates that the hole is a split where we want to fill |
515 | 0 | // the second branch. |
516 | 0 | let mut prev_hole = (Hole::None, false); |
517 | 0 | for e in &exprs[0..exprs.len() - 1] { |
518 | 0 | if prev_hole.1 { |
519 | 0 | let next = self.insts.len(); |
520 | 0 | self.fill_split(prev_hole.0, None, Some(next)); |
521 | 0 | } else { |
522 | 0 | self.fill_to_next(prev_hole.0); |
523 | 0 | } |
524 | 0 | let split = self.push_split_hole(); |
525 | 0 | if let Some(Patch { hole, entry }) = self.c(e)? { |
526 | 0 | holes.push(hole); |
527 | 0 | prev_hole = (self.fill_split(split, Some(entry), None), false); |
528 | 0 | } else { |
529 | 0 | let (split1, split2) = split.dup_one(); |
530 | 0 | holes.push(split1); |
531 | 0 | prev_hole = (split2, true); |
532 | 0 | } |
533 | | } |
534 | 0 | if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { |
535 | 0 | holes.push(hole); |
536 | 0 | if prev_hole.1 { |
537 | 0 | self.fill_split(prev_hole.0, None, Some(entry)); |
538 | 0 | } else { |
539 | 0 | self.fill(prev_hole.0, entry); |
540 | 0 | } |
541 | 0 | } else { |
542 | 0 | // We ignore prev_hole.1. When it's true, it means we have two |
543 | 0 | // empty branches both pushing prev_hole.0 into holes, so both |
544 | 0 | // branches will go to the same place anyway. |
545 | 0 | holes.push(prev_hole.0); |
546 | 0 | } |
547 | 0 | Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) |
548 | 0 | } |
549 | | |
550 | 0 | fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { |
551 | 0 | use syntax::hir::RepetitionKind::*; |
552 | 0 | match rep.kind { |
553 | 0 | ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), |
554 | 0 | ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), |
555 | 0 | OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), |
556 | 0 | Range(hir::RepetitionRange::Exactly(min_max)) => { |
557 | 0 | self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) |
558 | | } |
559 | 0 | Range(hir::RepetitionRange::AtLeast(min)) => { |
560 | 0 | self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) |
561 | | } |
562 | 0 | Range(hir::RepetitionRange::Bounded(min, max)) => { |
563 | 0 | self.c_repeat_range(&rep.hir, rep.greedy, min, max) |
564 | | } |
565 | | } |
566 | 0 | } |
567 | | |
568 | 0 | fn c_repeat_zero_or_one( |
569 | 0 | &mut self, |
570 | 0 | expr: &Hir, |
571 | 0 | greedy: bool, |
572 | 0 | ) -> ResultOrEmpty { |
573 | 0 | let split_entry = self.insts.len(); |
574 | 0 | let split = self.push_split_hole(); |
575 | 0 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
576 | 0 | Some(p) => p, |
577 | 0 | None => return self.pop_split_hole(), |
578 | | }; |
579 | 0 | let split_hole = if greedy { |
580 | 0 | self.fill_split(split, Some(entry_rep), None) |
581 | | } else { |
582 | 0 | self.fill_split(split, None, Some(entry_rep)) |
583 | | }; |
584 | 0 | let holes = vec![hole_rep, split_hole]; |
585 | 0 | Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) |
586 | 0 | } |
587 | | |
588 | 0 | fn c_repeat_zero_or_more( |
589 | 0 | &mut self, |
590 | 0 | expr: &Hir, |
591 | 0 | greedy: bool, |
592 | 0 | ) -> ResultOrEmpty { |
593 | 0 | let split_entry = self.insts.len(); |
594 | 0 | let split = self.push_split_hole(); |
595 | 0 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
596 | 0 | Some(p) => p, |
597 | 0 | None => return self.pop_split_hole(), |
598 | | }; |
599 | | |
600 | 0 | self.fill(hole_rep, split_entry); |
601 | 0 | let split_hole = if greedy { |
602 | 0 | self.fill_split(split, Some(entry_rep), None) |
603 | | } else { |
604 | 0 | self.fill_split(split, None, Some(entry_rep)) |
605 | | }; |
606 | 0 | Ok(Some(Patch { hole: split_hole, entry: split_entry })) |
607 | 0 | } |
608 | | |
609 | 0 | fn c_repeat_one_or_more( |
610 | 0 | &mut self, |
611 | 0 | expr: &Hir, |
612 | 0 | greedy: bool, |
613 | 0 | ) -> ResultOrEmpty { |
614 | 0 | let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { |
615 | 0 | Some(p) => p, |
616 | 0 | None => return Ok(None), |
617 | | }; |
618 | 0 | self.fill_to_next(hole_rep); |
619 | 0 | let split = self.push_split_hole(); |
620 | | |
621 | 0 | let split_hole = if greedy { |
622 | 0 | self.fill_split(split, Some(entry_rep), None) |
623 | | } else { |
624 | 0 | self.fill_split(split, None, Some(entry_rep)) |
625 | | }; |
626 | 0 | Ok(Some(Patch { hole: split_hole, entry: entry_rep })) |
627 | 0 | } |
628 | | |
629 | 0 | fn c_repeat_range_min_or_more( |
630 | 0 | &mut self, |
631 | 0 | expr: &Hir, |
632 | 0 | greedy: bool, |
633 | 0 | min: u32, |
634 | 0 | ) -> ResultOrEmpty { |
635 | 0 | let min = u32_to_usize(min); |
636 | | // Using next_inst() is ok, because we can't return it (concat would |
637 | | // have to return Some(_) while c_repeat_range_min_or_more returns |
638 | | // None). |
639 | 0 | let patch_concat = self |
640 | 0 | .c_concat(iter::repeat(expr).take(min))? |
641 | 0 | .unwrap_or(self.next_inst()); |
642 | 0 | if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { |
643 | 0 | self.fill(patch_concat.hole, patch_rep.entry); |
644 | 0 | Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) |
645 | | } else { |
646 | 0 | Ok(None) |
647 | | } |
648 | 0 | } |
649 | | |
650 | 0 | fn c_repeat_range( |
651 | 0 | &mut self, |
652 | 0 | expr: &Hir, |
653 | 0 | greedy: bool, |
654 | 0 | min: u32, |
655 | 0 | max: u32, |
656 | 0 | ) -> ResultOrEmpty { |
657 | 0 | let (min, max) = (u32_to_usize(min), u32_to_usize(max)); |
658 | | debug_assert!(min <= max); |
659 | 0 | let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; |
660 | 0 | if min == max { |
661 | 0 | return Ok(patch_concat); |
662 | 0 | } |
663 | 0 | // Same reasoning as in c_repeat_range_min_or_more (we know that min < |
664 | 0 | // max at this point). |
665 | 0 | let patch_concat = patch_concat.unwrap_or(self.next_inst()); |
666 | 0 | let initial_entry = patch_concat.entry; |
667 | 0 | // It is much simpler to compile, e.g., `a{2,5}` as: |
668 | 0 | // |
669 | 0 | // aaa?a?a? |
670 | 0 | // |
671 | 0 | // But you end up with a sequence of instructions like this: |
672 | 0 | // |
673 | 0 | // 0: 'a' |
674 | 0 | // 1: 'a', |
675 | 0 | // 2: split(3, 4) |
676 | 0 | // 3: 'a' |
677 | 0 | // 4: split(5, 6) |
678 | 0 | // 5: 'a' |
679 | 0 | // 6: split(7, 8) |
680 | 0 | // 7: 'a' |
681 | 0 | // 8: MATCH |
682 | 0 | // |
683 | 0 | // This is *incredibly* inefficient because the splits end |
684 | 0 | // up forming a chain, which has to be resolved everything a |
685 | 0 | // transition is followed. |
686 | 0 | let mut holes = vec![]; |
687 | 0 | let mut prev_hole = patch_concat.hole; |
688 | 0 | for _ in min..max { |
689 | 0 | self.fill_to_next(prev_hole); |
690 | 0 | let split = self.push_split_hole(); |
691 | 0 | let Patch { hole, entry } = match self.c(expr)? { |
692 | 0 | Some(p) => p, |
693 | 0 | None => return self.pop_split_hole(), |
694 | | }; |
695 | 0 | prev_hole = hole; |
696 | 0 | if greedy { |
697 | 0 | holes.push(self.fill_split(split, Some(entry), None)); |
698 | 0 | } else { |
699 | 0 | holes.push(self.fill_split(split, None, Some(entry))); |
700 | 0 | } |
701 | | } |
702 | 0 | holes.push(prev_hole); |
703 | 0 | Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) |
704 | 0 | } |
705 | | |
706 | | /// Can be used as a default value for the c_* functions when the call to |
707 | | /// c_function is followed by inserting at least one instruction that is |
708 | | /// always executed after the ones written by the c* function. |
709 | | fn next_inst(&self) -> Patch { |
710 | | Patch { hole: Hole::None, entry: self.insts.len() } |
711 | | } |
712 | | |
713 | 0 | fn fill(&mut self, hole: Hole, goto: InstPtr) { |
714 | 0 | match hole { |
715 | 0 | Hole::None => {} |
716 | 0 | Hole::One(pc) => { |
717 | 0 | self.insts[pc].fill(goto); |
718 | 0 | } |
719 | 0 | Hole::Many(holes) => { |
720 | 0 | for hole in holes { |
721 | 0 | self.fill(hole, goto); |
722 | 0 | } |
723 | | } |
724 | | } |
725 | 0 | } |
726 | | |
727 | | fn fill_to_next(&mut self, hole: Hole) { |
728 | | let next = self.insts.len(); |
729 | | self.fill(hole, next); |
730 | | } |
731 | | |
732 | 0 | fn fill_split( |
733 | 0 | &mut self, |
734 | 0 | hole: Hole, |
735 | 0 | goto1: Option<InstPtr>, |
736 | 0 | goto2: Option<InstPtr>, |
737 | 0 | ) -> Hole { |
738 | 0 | match hole { |
739 | 0 | Hole::None => Hole::None, |
740 | 0 | Hole::One(pc) => match (goto1, goto2) { |
741 | 0 | (Some(goto1), Some(goto2)) => { |
742 | 0 | self.insts[pc].fill_split(goto1, goto2); |
743 | 0 | Hole::None |
744 | | } |
745 | 0 | (Some(goto1), None) => { |
746 | 0 | self.insts[pc].half_fill_split_goto1(goto1); |
747 | 0 | Hole::One(pc) |
748 | | } |
749 | 0 | (None, Some(goto2)) => { |
750 | 0 | self.insts[pc].half_fill_split_goto2(goto2); |
751 | 0 | Hole::One(pc) |
752 | | } |
753 | 0 | (None, None) => unreachable!( |
754 | 0 | "at least one of the split \ |
755 | 0 | holes must be filled" |
756 | 0 | ), |
757 | | }, |
758 | 0 | Hole::Many(holes) => { |
759 | 0 | let mut new_holes = vec![]; |
760 | 0 | for hole in holes { |
761 | 0 | new_holes.push(self.fill_split(hole, goto1, goto2)); |
762 | 0 | } |
763 | 0 | if new_holes.is_empty() { |
764 | 0 | Hole::None |
765 | 0 | } else if new_holes.len() == 1 { |
766 | 0 | new_holes.pop().unwrap() |
767 | | } else { |
768 | 0 | Hole::Many(new_holes) |
769 | | } |
770 | | } |
771 | | } |
772 | 0 | } |
773 | | |
774 | | fn push_compiled(&mut self, inst: Inst) { |
775 | | self.insts.push(MaybeInst::Compiled(inst)); |
776 | | } |
777 | | |
778 | | fn push_hole(&mut self, inst: InstHole) -> Hole { |
779 | | let hole = self.insts.len(); |
780 | | self.insts.push(MaybeInst::Uncompiled(inst)); |
781 | | Hole::One(hole) |
782 | | } |
783 | | |
784 | | fn push_split_hole(&mut self) -> Hole { |
785 | | let hole = self.insts.len(); |
786 | | self.insts.push(MaybeInst::Split); |
787 | | Hole::One(hole) |
788 | | } |
789 | | |
790 | | fn pop_split_hole(&mut self) -> ResultOrEmpty { |
791 | | self.insts.pop(); |
792 | | Ok(None) |
793 | | } |
794 | | |
795 | 0 | fn check_size(&self) -> result::Result<(), Error> { |
796 | 0 | use std::mem::size_of; |
797 | 0 |
|
798 | 0 | if self.insts.len() * size_of::<Inst>() > self.size_limit { |
799 | 0 | Err(Error::CompiledTooBig(self.size_limit)) |
800 | | } else { |
801 | 0 | Ok(()) |
802 | | } |
803 | 0 | } |
804 | | } |
805 | | |
806 | | #[derive(Debug)] |
807 | | enum Hole { |
808 | | None, |
809 | | One(InstPtr), |
810 | | Many(Vec<Hole>), |
811 | | } |
812 | | |
813 | | impl Hole { |
814 | 0 | fn dup_one(self) -> (Self, Self) { |
815 | 0 | match self { |
816 | 0 | Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), |
817 | | Hole::None | Hole::Many(_) => { |
818 | 0 | unreachable!("must be called on single hole") |
819 | | } |
820 | | } |
821 | 0 | } |
822 | | } |
823 | | |
824 | 0 | #[derive(Clone, Debug)] |
825 | | enum MaybeInst { |
826 | | Compiled(Inst), |
827 | | Uncompiled(InstHole), |
828 | | Split, |
829 | | Split1(InstPtr), |
830 | | Split2(InstPtr), |
831 | | } |
832 | | |
833 | | impl MaybeInst { |
834 | 0 | fn fill(&mut self, goto: InstPtr) { |
835 | 0 | let maybeinst = match *self { |
836 | 0 | MaybeInst::Split => MaybeInst::Split1(goto), |
837 | 0 | MaybeInst::Uncompiled(ref inst) => { |
838 | 0 | MaybeInst::Compiled(inst.fill(goto)) |
839 | | } |
840 | 0 | MaybeInst::Split1(goto1) => { |
841 | 0 | MaybeInst::Compiled(Inst::Split(InstSplit { |
842 | 0 | goto1: goto1, |
843 | 0 | goto2: goto, |
844 | 0 | })) |
845 | | } |
846 | 0 | MaybeInst::Split2(goto2) => { |
847 | 0 | MaybeInst::Compiled(Inst::Split(InstSplit { |
848 | 0 | goto1: goto, |
849 | 0 | goto2: goto2, |
850 | 0 | })) |
851 | | } |
852 | 0 | _ => unreachable!( |
853 | 0 | "not all instructions were compiled! \ |
854 | 0 | found uncompiled instruction: {:?}", |
855 | 0 | self |
856 | 0 | ), |
857 | | }; |
858 | 0 | *self = maybeinst; |
859 | 0 | } |
860 | | |
861 | 0 | fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { |
862 | 0 | let filled = match *self { |
863 | 0 | MaybeInst::Split => { |
864 | 0 | Inst::Split(InstSplit { goto1: goto1, goto2: goto2 }) |
865 | | } |
866 | 0 | _ => unreachable!( |
867 | 0 | "must be called on Split instruction, \ |
868 | 0 | instead it was called on: {:?}", |
869 | 0 | self |
870 | 0 | ), |
871 | | }; |
872 | 0 | *self = MaybeInst::Compiled(filled); |
873 | 0 | } |
874 | | |
875 | 0 | fn half_fill_split_goto1(&mut self, goto1: InstPtr) { |
876 | 0 | let half_filled = match *self { |
877 | 0 | MaybeInst::Split => goto1, |
878 | 0 | _ => unreachable!( |
879 | 0 | "must be called on Split instruction, \ |
880 | 0 | instead it was called on: {:?}", |
881 | 0 | self |
882 | 0 | ), |
883 | | }; |
884 | 0 | *self = MaybeInst::Split1(half_filled); |
885 | 0 | } |
886 | | |
887 | 0 | fn half_fill_split_goto2(&mut self, goto2: InstPtr) { |
888 | 0 | let half_filled = match *self { |
889 | 0 | MaybeInst::Split => goto2, |
890 | 0 | _ => unreachable!( |
891 | 0 | "must be called on Split instruction, \ |
892 | 0 | instead it was called on: {:?}", |
893 | 0 | self |
894 | 0 | ), |
895 | | }; |
896 | 0 | *self = MaybeInst::Split2(half_filled); |
897 | 0 | } |
898 | | |
899 | 0 | fn unwrap(self) -> Inst { |
900 | 0 | match self { |
901 | 0 | MaybeInst::Compiled(inst) => inst, |
902 | 0 | _ => unreachable!( |
903 | 0 | "must be called on a compiled instruction, \ |
904 | 0 | instead it was called on: {:?}", |
905 | 0 | self |
906 | 0 | ), |
907 | | } |
908 | 0 | } |
909 | | } |
910 | | |
911 | 0 | #[derive(Clone, Debug)] |
912 | | enum InstHole { |
913 | | Save { slot: usize }, |
914 | | EmptyLook { look: EmptyLook }, |
915 | | Char { c: char }, |
916 | | Ranges { ranges: Vec<(char, char)> }, |
917 | | Bytes { start: u8, end: u8 }, |
918 | | } |
919 | | |
920 | | impl InstHole { |
921 | 0 | fn fill(&self, goto: InstPtr) -> Inst { |
922 | 0 | match *self { |
923 | 0 | InstHole::Save { slot } => { |
924 | 0 | Inst::Save(InstSave { goto: goto, slot: slot }) |
925 | | } |
926 | 0 | InstHole::EmptyLook { look } => { |
927 | 0 | Inst::EmptyLook(InstEmptyLook { goto: goto, look: look }) |
928 | | } |
929 | 0 | InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }), |
930 | 0 | InstHole::Ranges { ref ranges } => { |
931 | 0 | Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() }) |
932 | | } |
933 | 0 | InstHole::Bytes { start, end } => { |
934 | 0 | Inst::Bytes(InstBytes { goto: goto, start: start, end: end }) |
935 | | } |
936 | | } |
937 | 0 | } |
938 | | } |
939 | | |
940 | | struct CompileClass<'a, 'b> { |
941 | | c: &'a mut Compiler, |
942 | | ranges: &'b [hir::ClassUnicodeRange], |
943 | | } |
944 | | |
945 | | impl<'a, 'b> CompileClass<'a, 'b> { |
946 | 0 | fn compile(mut self) -> Result { |
947 | 0 | let mut holes = vec![]; |
948 | 0 | let mut initial_entry = None; |
949 | 0 | let mut last_split = Hole::None; |
950 | 0 | let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); |
951 | 0 | self.c.suffix_cache.clear(); |
952 | | |
953 | 0 | for (i, range) in self.ranges.iter().enumerate() { |
954 | 0 | let is_last_range = i + 1 == self.ranges.len(); |
955 | 0 | utf8_seqs.reset(range.start(), range.end()); |
956 | 0 | let mut it = (&mut utf8_seqs).peekable(); |
957 | | loop { |
958 | 0 | let utf8_seq = match it.next() { |
959 | 0 | None => break, |
960 | 0 | Some(utf8_seq) => utf8_seq, |
961 | | }; |
962 | 0 | if is_last_range && it.peek().is_none() { |
963 | 0 | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; |
964 | 0 | holes.push(hole); |
965 | 0 | self.c.fill(last_split, entry); |
966 | 0 | last_split = Hole::None; |
967 | 0 | if initial_entry.is_none() { |
968 | 0 | initial_entry = Some(entry); |
969 | 0 | } |
970 | | } else { |
971 | 0 | if initial_entry.is_none() { |
972 | 0 | initial_entry = Some(self.c.insts.len()); |
973 | 0 | } |
974 | 0 | self.c.fill_to_next(last_split); |
975 | 0 | last_split = self.c.push_split_hole(); |
976 | 0 | let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; |
977 | 0 | holes.push(hole); |
978 | 0 | last_split = |
979 | 0 | self.c.fill_split(last_split, Some(entry), None); |
980 | | } |
981 | | } |
982 | | } |
983 | 0 | self.c.utf8_seqs = Some(utf8_seqs); |
984 | 0 | Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) |
985 | 0 | } |
986 | | |
987 | 0 | fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { |
988 | 0 | if self.c.compiled.is_reverse { |
989 | 0 | self.c_utf8_seq_(seq) |
990 | | } else { |
991 | 0 | self.c_utf8_seq_(seq.into_iter().rev()) |
992 | | } |
993 | 0 | } |
994 | | |
995 | 0 | fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result |
996 | 0 | where |
997 | 0 | I: IntoIterator<Item = &'r Utf8Range>, |
998 | 0 | { |
999 | | // The initial instruction for each UTF-8 sequence should be the same. |
1000 | 0 | let mut from_inst = ::std::usize::MAX; |
1001 | 0 | let mut last_hole = Hole::None; |
1002 | 0 | for byte_range in seq { |
1003 | 0 | let key = SuffixCacheKey { |
1004 | 0 | from_inst: from_inst, |
1005 | 0 | start: byte_range.start, |
1006 | 0 | end: byte_range.end, |
1007 | 0 | }; |
1008 | 0 | { |
1009 | 0 | let pc = self.c.insts.len(); |
1010 | 0 | if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { |
1011 | 0 | from_inst = cached_pc; |
1012 | | continue; |
1013 | 0 | } |
1014 | 0 | } |
1015 | 0 | self.c.byte_classes.set_range(byte_range.start, byte_range.end); |
1016 | 0 | if from_inst == ::std::usize::MAX { |
1017 | 0 | last_hole = self.c.push_hole(InstHole::Bytes { |
1018 | 0 | start: byte_range.start, |
1019 | 0 | end: byte_range.end, |
1020 | 0 | }); |
1021 | 0 | } else { |
1022 | 0 | self.c.push_compiled(Inst::Bytes(InstBytes { |
1023 | 0 | goto: from_inst, |
1024 | 0 | start: byte_range.start, |
1025 | 0 | end: byte_range.end, |
1026 | 0 | })); |
1027 | 0 | } |
1028 | 0 | from_inst = self.c.insts.len().checked_sub(1).unwrap(); |
1029 | | debug_assert!(from_inst < ::std::usize::MAX); |
1030 | | } |
1031 | | debug_assert!(from_inst < ::std::usize::MAX); |
1032 | 0 | Ok(Patch { hole: last_hole, entry: from_inst }) |
1033 | 0 | } Unexecuted instantiation: <regex::compile::CompileClass>::c_utf8_seq_::<®ex_syntax::utf8::Utf8Sequence> Unexecuted instantiation: <regex::compile::CompileClass>::c_utf8_seq_::<core::iter::adapters::rev::Rev<core::slice::iter::Iter<regex_syntax::utf8::Utf8Range>>> |
1034 | | } |
1035 | | |
1036 | | /// `SuffixCache` is a simple bounded hash map for caching suffix entries in |
1037 | | /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. |
1038 | | /// The set of byte ranges looks like this: |
1039 | | /// |
1040 | | /// [0-7F] |
1041 | | /// [C2-DF][80-BF] |
1042 | | /// [E0][A0-BF][80-BF] |
1043 | | /// [E1-EC][80-BF][80-BF] |
1044 | | /// [ED][80-9F][80-BF] |
1045 | | /// [EE-EF][80-BF][80-BF] |
1046 | | /// |
1047 | | /// Each line above translates to one alternate in the compiled regex program. |
1048 | | /// However, all but one of the alternates end in the same suffix, which is |
1049 | | /// a waste of an instruction. The suffix cache facilitates reusing them across |
1050 | | /// alternates. |
1051 | | /// |
1052 | | /// Note that a HashMap could be trivially used for this, but we don't need its |
1053 | | /// overhead. Some small bounded space (LRU style) is more than enough. |
1054 | | /// |
1055 | | /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), |
1056 | | /// except it uses hashes as original indices and then compares full keys for |
1057 | | /// validation against `dense` array. |
1058 | | #[derive(Debug)] |
1059 | | struct SuffixCache { |
1060 | | sparse: Box<[usize]>, |
1061 | | dense: Vec<SuffixCacheEntry>, |
1062 | | } |
1063 | | |
1064 | 0 | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] |
1065 | | struct SuffixCacheEntry { |
1066 | | key: SuffixCacheKey, |
1067 | | pc: InstPtr, |
1068 | | } |
1069 | | |
1070 | 0 | #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] |
1071 | | struct SuffixCacheKey { |
1072 | | from_inst: InstPtr, |
1073 | | start: u8, |
1074 | | end: u8, |
1075 | | } |
1076 | | |
1077 | | impl SuffixCache { |
1078 | | fn new(size: usize) -> Self { |
1079 | | SuffixCache { |
1080 | | sparse: vec![0usize; size].into(), |
1081 | | dense: Vec::with_capacity(size), |
1082 | | } |
1083 | | } |
1084 | | |
1085 | 0 | fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { |
1086 | 0 | let hash = self.hash(&key); |
1087 | 0 | let pos = &mut self.sparse[hash]; |
1088 | 0 | if let Some(entry) = self.dense.get(*pos) { |
1089 | 0 | if entry.key == key { |
1090 | 0 | return Some(entry.pc); |
1091 | 0 | } |
1092 | 0 | } |
1093 | 0 | *pos = self.dense.len(); |
1094 | 0 | self.dense.push(SuffixCacheEntry { key: key, pc: pc }); |
1095 | 0 | None |
1096 | 0 | } |
1097 | | |
1098 | | fn clear(&mut self) { |
1099 | | self.dense.clear(); |
1100 | | } |
1101 | | |
1102 | | fn hash(&self, suffix: &SuffixCacheKey) -> usize { |
1103 | | // Basic FNV-1a hash as described: |
1104 | | // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function |
1105 | | const FNV_PRIME: u64 = 1099511628211; |
1106 | | let mut h = 14695981039346656037; |
1107 | | h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); |
1108 | | h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); |
1109 | | h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); |
1110 | | (h as usize) % self.sparse.len() |
1111 | | } |
1112 | | } |
1113 | | |
1114 | | struct ByteClassSet([bool; 256]); |
1115 | | |
1116 | | impl ByteClassSet { |
1117 | | fn new() -> Self { |
1118 | | ByteClassSet([false; 256]) |
1119 | | } |
1120 | | |
1121 | 0 | fn set_range(&mut self, start: u8, end: u8) { |
1122 | | debug_assert!(start <= end); |
1123 | 0 | if start > 0 { |
1124 | 0 | self.0[start as usize - 1] = true; |
1125 | 0 | } |
1126 | 0 | self.0[end as usize] = true; |
1127 | 0 | } |
1128 | | |
1129 | 0 | fn set_word_boundary(&mut self) { |
1130 | 0 | // We need to mark all ranges of bytes whose pairs result in |
1131 | 0 | // evaluating \b differently. |
1132 | 0 | let iswb = is_word_byte; |
1133 | 0 | let mut b1: u16 = 0; |
1134 | | let mut b2: u16; |
1135 | 0 | while b1 <= 255 { |
1136 | 0 | b2 = b1 + 1; |
1137 | 0 | while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { |
1138 | 0 | b2 += 1; |
1139 | 0 | } |
1140 | 0 | self.set_range(b1 as u8, (b2 - 1) as u8); |
1141 | 0 | b1 = b2; |
1142 | | } |
1143 | 0 | } |
1144 | | |
1145 | 0 | fn byte_classes(&self) -> Vec<u8> { |
1146 | 0 | // N.B. If you're debugging the DFA, it's useful to simply return |
1147 | 0 | // `(0..256).collect()`, which effectively removes the byte classes |
1148 | 0 | // and makes the transitions easier to read. |
1149 | 0 | // (0usize..256).map(|x| x as u8).collect() |
1150 | 0 | let mut byte_classes = vec![0; 256]; |
1151 | 0 | let mut class = 0u8; |
1152 | 0 | let mut i = 0; |
1153 | | loop { |
1154 | 0 | byte_classes[i] = class as u8; |
1155 | 0 | if i >= 255 { |
1156 | 0 | break; |
1157 | 0 | } |
1158 | 0 | if self.0[i] { |
1159 | 0 | class = class.checked_add(1).unwrap(); |
1160 | 0 | } |
1161 | 0 | i += 1; |
1162 | | } |
1163 | 0 | byte_classes |
1164 | 0 | } |
1165 | | } |
1166 | | |
1167 | | impl fmt::Debug for ByteClassSet { |
1168 | | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
1169 | | f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() |
1170 | | } |
1171 | | } |
1172 | | |
1173 | 0 | fn u32_to_usize(n: u32) -> usize { |
1174 | | // In case usize is less than 32 bits, we need to guard against overflow. |
1175 | | // On most platforms this compiles to nothing. |
1176 | | // TODO Use `std::convert::TryFrom` once it's stable. |
1177 | 0 | if (n as u64) > (::std::usize::MAX as u64) { |
1178 | 0 | panic!("BUG: {} is too big to be pointer sized", n) |
1179 | 0 | } |
1180 | 0 | n as usize |
1181 | 0 | } |
1182 | | |
1183 | | #[cfg(test)] |
1184 | | mod tests { |
1185 | | use super::ByteClassSet; |
1186 | | |
1187 | | #[test] |
1188 | | fn byte_classes() { |
1189 | | let mut set = ByteClassSet::new(); |
1190 | | set.set_range(b'a', b'z'); |
1191 | | let classes = set.byte_classes(); |
1192 | | assert_eq!(classes[0], 0); |
1193 | | assert_eq!(classes[1], 0); |
1194 | | assert_eq!(classes[2], 0); |
1195 | | assert_eq!(classes[b'a' as usize - 1], 0); |
1196 | | assert_eq!(classes[b'a' as usize], 1); |
1197 | | assert_eq!(classes[b'm' as usize], 1); |
1198 | | assert_eq!(classes[b'z' as usize], 1); |
1199 | | assert_eq!(classes[b'z' as usize + 1], 2); |
1200 | | assert_eq!(classes[254], 2); |
1201 | | assert_eq!(classes[255], 2); |
1202 | | |
1203 | | let mut set = ByteClassSet::new(); |
1204 | | set.set_range(0, 2); |
1205 | | set.set_range(4, 6); |
1206 | | let classes = set.byte_classes(); |
1207 | | assert_eq!(classes[0], 0); |
1208 | | assert_eq!(classes[1], 0); |
1209 | | assert_eq!(classes[2], 0); |
1210 | | assert_eq!(classes[3], 1); |
1211 | | assert_eq!(classes[4], 2); |
1212 | | assert_eq!(classes[5], 2); |
1213 | | assert_eq!(classes[6], 2); |
1214 | | assert_eq!(classes[7], 3); |
1215 | | assert_eq!(classes[255], 3); |
1216 | | } |
1217 | | |
1218 | | #[test] |
1219 | | fn full_byte_classes() { |
1220 | | let mut set = ByteClassSet::new(); |
1221 | | for i in 0..256u16 { |
1222 | | set.set_range(i as u8, i as u8); |
1223 | | } |
1224 | | assert_eq!(set.byte_classes().len(), 256); |
1225 | | } |
1226 | | } |