/src/semver-parser/src/lexer.rs
Line | Count | Source |
1 | | //! Lexer for semver ranges. |
2 | | //! |
3 | | //! Breaks a string of input into an iterator of tokens that can be used with a parser. |
4 | | //! |
5 | | //! This should be used with the [`parser`] module. |
6 | | //! |
7 | | //! [`parser`]: ../parser/index.html |
8 | | //! |
9 | | //! # Examples |
10 | | //! |
11 | | //! Example without errors: |
12 | | //! |
13 | | //! ```rust |
14 | | //! use semver_parser::lexer::{Lexer, Token}; |
15 | | //! |
16 | | //! let mut l = Lexer::new("foo 123 *"); |
17 | | //! |
18 | | //! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next()); |
19 | | //! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next()); |
20 | | //! assert_eq!(Some(Ok(Token::Numeric(123))), l.next()); |
21 | | //! assert_eq!(Some(Ok(Token::Whitespace(7, 8))), l.next()); |
22 | | //! assert_eq!(Some(Ok(Token::Star)), l.next()); |
23 | | //! assert_eq!(None, l.next()); |
24 | | //! ``` |
25 | | //! |
26 | | //! Example with error: |
27 | | //! |
28 | | //! ```rust |
29 | | //! use semver_parser::lexer::{Lexer, Token, Error}; |
30 | | //! |
31 | | //! let mut l = Lexer::new("foo / *"); |
32 | | //! |
33 | | //! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next()); |
34 | | //! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next()); |
35 | | //! assert_eq!(Some(Err(Error::UnexpectedChar('/'))), l.next()); |
36 | | //! ``` |
37 | | |
38 | | use self::Error::*; |
39 | | use self::Token::*; |
40 | | use std::str; |
41 | | |
42 | | macro_rules! scan_while { |
43 | | ($slf:expr, $start:expr, $first:pat $(| $rest:pat)*) => {{ |
44 | | let mut __end = $start; |
45 | | |
46 | | loop { |
47 | | if let Some((idx, c)) = $slf.one() { |
48 | | __end = idx; |
49 | | |
50 | | match c { |
51 | | $first $(| $rest)* => $slf.step(), |
52 | | _ => break, |
53 | | } |
54 | | |
55 | | continue; |
56 | | } else { |
57 | | __end = $slf.input.len(); |
58 | | } |
59 | | |
60 | | break; |
61 | | } |
62 | | |
63 | | __end |
64 | | }} |
65 | | } |
66 | | |
/// Semver tokens.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Token<'input> {
    /// `=`
    Eq,
    /// `>`
    Gt,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,
    /// `^`
    Caret,
    /// `~`
    Tilde,
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `-`
    Hyphen,
    /// `+`
    Plus,
    /// `||`
    Or,
    /// Any number of whitespace characters (`\t\r\n `) and its span.
    Whitespace(usize, usize),
    /// Numeric component, like `0` or `42`.
    Numeric(u64),
    /// Alphanumeric component, like `alpha1` or `79deadbe`.
    AlphaNumeric(&'input str),
}
103 | | |
104 | | impl<'input> Token<'input> { |
105 | | /// Check if the current token is a whitespace token. |
106 | 0 | pub fn is_whitespace(&self) -> bool { |
107 | 0 | match *self { |
108 | 0 | Whitespace(..) => true, |
109 | 0 | _ => false, |
110 | | } |
111 | 0 | } |
112 | | |
113 | | /// Check if the current token is a wildcard token. |
114 | 0 | pub fn is_wildcard(&self) -> bool { |
115 | 0 | match *self { |
116 | 0 | Star | AlphaNumeric("X") | AlphaNumeric("x") => true, |
117 | 0 | _ => false, |
118 | | } |
119 | 0 | } |
120 | | } |
121 | | |
/// Errors that can occur while lexing a semver range.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Error {
    /// Unexpected character.
    UnexpectedChar(char),
}
127 | | |
/// Lexer for semver tokens belonging to a range.
#[derive(Debug)]
pub struct Lexer<'input> {
    // Full input being lexed; `AlphaNumeric` tokens borrow slices of it.
    input: &'input str,
    // Iterator over the characters not yet loaded into the lookahead.
    chars: str::CharIndices<'input>,
    // Two-character lookahead, each as (byte offset, char):
    // `c1` is the next character, `c2` the one after it.
    // `None` once the input is exhausted.
    c1: Option<(usize, char)>,
    c2: Option<(usize, char)>,
}
137 | | |
138 | | impl<'input> Lexer<'input> { |
139 | | /// Construct a new lexer for the given input. |
140 | 854 | pub fn new(input: &str) -> Lexer { |
141 | 854 | let mut chars = input.char_indices(); |
142 | 854 | let c1 = chars.next(); |
143 | 854 | let c2 = chars.next(); |
144 | | |
145 | 854 | Lexer { |
146 | 854 | input, |
147 | 854 | chars, |
148 | 854 | c1, |
149 | 854 | c2, |
150 | 854 | } |
151 | 854 | } |
152 | | |
153 | | /// Shift all lookahead storage by one. |
154 | 44.1M | fn step(&mut self) { |
155 | 44.1M | self.c1 = self.c2; |
156 | 44.1M | self.c2 = self.chars.next(); |
157 | 44.1M | } |
158 | | |
159 | 0 | fn step_n(&mut self, n: usize) { |
160 | 0 | for _ in 0..n { |
161 | 0 | self.step(); |
162 | 0 | } |
163 | 0 | } |
164 | | |
165 | | /// Access the one character, or set it if it is not set. |
166 | 46.8M | fn one(&mut self) -> Option<(usize, char)> { |
167 | 46.8M | self.c1 |
168 | 46.8M | } |
169 | | |
170 | | /// Access two characters. |
171 | 7.05M | fn two(&mut self) -> Option<(usize, char, char)> { |
172 | 7.05M | self.c1 |
173 | 7.05M | .and_then(|(start, c1)| self.c2.map(|(_, c2)| (start, c1, c2))) |
174 | 7.05M | } |
175 | | |
176 | | /// Consume a component. |
177 | | /// |
178 | | /// A component can either be an alphanumeric or numeric. |
179 | | /// Does not permit leading zeroes if numeric. |
180 | 2.73M | fn component(&mut self, start: usize) -> Result<Token<'input>, Error> { |
181 | 34.9M | let end = scan_while!(self, start, '0'..='9' | 'A'..='Z' | 'a'..='z'); |
182 | 2.73M | let input = &self.input[start..end]; |
183 | | |
184 | 2.73M | let mut it = input.chars(); |
185 | 2.73M | let (a, b) = (it.next(), it.next()); |
186 | | |
187 | | // exactly zero |
188 | 2.73M | if a == Some('0') && b.is_none() { |
189 | 26.5k | return Ok(Numeric(0)); |
190 | 2.71M | } |
191 | | |
192 | 2.71M | if a != Some('0') { |
193 | 2.71M | if let Ok(numeric) = input.parse::<u64>() { |
194 | 2.05M | return Ok(Numeric(numeric)); |
195 | 657k | } |
196 | 271 | } |
197 | | |
198 | 658k | Ok(AlphaNumeric(input)) |
199 | 2.73M | } |
200 | | |
201 | | /// Consume whitespace. |
202 | 54 | fn whitespace(&mut self, start: usize) -> Result<Token<'input>, Error> { |
203 | 2.09M | let end = scan_while!(self, start, ' ' | '\t' | '\n' | '\r'); |
204 | 54 | Ok(Whitespace(start, end)) |
205 | 54 | } |
206 | | } |
207 | | |
208 | | impl<'input> Iterator for Lexer<'input> { |
209 | | type Item = Result<Token<'input>, Error>; |
210 | | |
211 | 7.05M | fn next(&mut self) -> Option<Self::Item> { |
212 | | #[allow(clippy::never_loop)] |
213 | | loop { |
214 | | // two subsequent char tokens. |
215 | 7.05M | if let Some((_, a, b)) = self.two() { |
216 | 7.05M | let two = match (a, b) { |
217 | 0 | ('<', '=') => Some(LtEq), |
218 | 0 | ('>', '=') => Some(GtEq), |
219 | 0 | ('|', '|') => Some(Or), |
220 | 7.05M | _ => None, |
221 | | }; |
222 | | |
223 | 7.05M | if let Some(two) = two { |
224 | 0 | self.step_n(2); |
225 | 0 | return Some(Ok(two)); |
226 | 7.05M | } |
227 | 91 | } |
228 | | |
229 | | // single char and start of numeric tokens. |
230 | 7.05M | if let Some((start, c)) = self.one() { |
231 | 7.05M | let tok = match c { |
232 | | ' ' | '\t' | '\n' | '\r' => { |
233 | 54 | self.step(); |
234 | 54 | return Some(self.whitespace(start)); |
235 | | } |
236 | 49 | '=' => Eq, |
237 | 5 | '>' => Gt, |
238 | 8 | '<' => Lt, |
239 | 0 | '^' => Caret, |
240 | 9 | '~' => Tilde, |
241 | 1.57M | '*' => Star, |
242 | 2.49M | '.' => Dot, |
243 | 2 | ',' => Comma, |
244 | 242k | '-' => Hyphen, |
245 | 25 | '+' => Plus, |
246 | 2.74M | '0'..='9' | 'a'..='z' | 'A'..='Z' => { |
247 | 2.73M | self.step(); |
248 | 2.73M | return Some(self.component(start)); |
249 | | } |
250 | 741 | c => return Some(Err(UnexpectedChar(c))), |
251 | | }; |
252 | | |
253 | 4.31M | self.step(); |
254 | 4.31M | return Some(Ok(tok)); |
255 | 59 | }; |
256 | | |
257 | 59 | return None; |
258 | | } |
259 | 7.05M | } |
260 | | } |
261 | | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `input` into tokens, panicking on any lexer error.
    fn lex(input: &str) -> Vec<Token> {
        Lexer::new(input).map(Result::unwrap).collect::<Vec<_>>()
    }

    #[test]
    pub fn simple_tokens() {
        assert_eq!(
            lex("=><<=>=^~*.,-+||"),
            vec![Eq, Gt, Lt, LtEq, GtEq, Caret, Tilde, Star, Dot, Comma, Hyphen, Plus, Or,]
        );
    }

    #[test]
    pub fn whitespace() {
        assert_eq!(
            lex(" foo \t\n\rbar"),
            vec![
                Whitespace(0, 2),
                AlphaNumeric("foo"),
                Whitespace(5, 9),
                AlphaNumeric("bar"),
            ]
        );
    }

    #[test]
    pub fn components() {
        assert_eq!(lex("42"), vec![Numeric(42)]);
        assert_eq!(lex("0"), vec![Numeric(0)]);
        // A leading zero forces the alphanumeric interpretation.
        // (The original repeated this assertion twice; the duplicate is removed.)
        assert_eq!(lex("01"), vec![AlphaNumeric("01")]);
        assert_eq!(lex("5885644aa"), vec![AlphaNumeric("5885644aa")]);
        assert_eq!(lex("beta2"), vec![AlphaNumeric("beta2")]);
        assert_eq!(lex("beta.2"), vec![AlphaNumeric("beta"), Dot, Numeric(2)]);
    }

    #[test]
    pub fn is_wildcard() {
        // assert! over assert_eq!(…, true) (clippy::bool_assert_comparison).
        assert!(Star.is_wildcard());
        assert!(AlphaNumeric("x").is_wildcard());
        assert!(AlphaNumeric("X").is_wildcard());
        assert!(!AlphaNumeric("other").is_wildcard());
    }

    #[test]
    pub fn empty() {
        assert_eq!(lex(""), vec![]);
    }

    #[test]
    pub fn numeric_all_numbers() {
        let expected: Vec<Token> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            .into_iter()
            .map(Numeric)
            .collect::<Vec<_>>();

        let actual: Vec<_> = lex("0 1 2 3 4 5 6 7 8 9")
            .into_iter()
            .filter(|t| !t.is_whitespace())
            .collect();

        assert_eq!(actual, expected);
    }
}