Coverage Report

Created: 2025-11-09 06:19

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/semver-parser/src/lexer.rs
Line
Count
Source
1
//! Lexer for semver ranges.
2
//!
3
//! Breaks a string of input into an iterator of tokens that can be used with a parser.
4
//!
5
//! This should be used with the [`parser`] module.
6
//!
7
//! [`parser`]: ../parser/index.html
8
//!
9
//! # Examples
10
//!
11
//! Example without errors:
12
//!
13
//! ```rust
14
//! use semver_parser::lexer::{Lexer, Token};
15
//!
16
//! let mut l = Lexer::new("foo 123 *");
17
//!
18
//! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next());
19
//! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next());
20
//! assert_eq!(Some(Ok(Token::Numeric(123))), l.next());
21
//! assert_eq!(Some(Ok(Token::Whitespace(7, 8))), l.next());
22
//! assert_eq!(Some(Ok(Token::Star)), l.next());
23
//! assert_eq!(None, l.next());
24
//! ```
25
//!
26
//! Example with error:
27
//!
28
//! ```rust
29
//! use semver_parser::lexer::{Lexer, Token, Error};
30
//!
31
//! let mut l = Lexer::new("foo / *");
32
//!
33
//! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next());
34
//! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next());
35
//! assert_eq!(Some(Err(Error::UnexpectedChar('/'))), l.next());
36
//! ```
37
38
use self::Error::*;
39
use self::Token::*;
40
use std::str;
41
42
macro_rules! scan_while {
43
    ($slf:expr, $start:expr, $first:pat $(| $rest:pat)*) => {{
44
        let mut __end = $start;
45
46
        loop {
47
            if let Some((idx, c)) = $slf.one() {
48
                __end = idx;
49
50
                match c {
51
                    $first $(| $rest)* => $slf.step(),
52
                    _ => break,
53
                }
54
55
                continue;
56
            } else {
57
                __end = $slf.input.len();
58
            }
59
60
            break;
61
        }
62
63
        __end
64
    }}
65
}
66
67
/// Semver tokens.
68
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
69
pub enum Token<'input> {
70
    /// `=`
71
    Eq,
72
    /// `>`
73
    Gt,
74
    /// `<`
75
    Lt,
76
    /// `<=`
77
    LtEq,
78
    /// `>=`
79
    GtEq,
80
    /// `^`
81
    Caret,
82
    /// `~`
83
    Tilde,
84
    /// `*`
85
    Star,
86
    /// `.`
87
    Dot,
88
    /// `,`
89
    Comma,
90
    /// `-`
91
    Hyphen,
92
    /// `+`
93
    Plus,
94
    /// `||`
95
    Or,
96
    /// Any run of whitespace characters (`\t\r\n `) and its span.
97
    Whitespace(usize, usize),
98
    /// Numeric component, like `0` or `42`.
99
    Numeric(u64),
100
    /// Alphanumeric component, like `alpha1` or `79deadbe`.
101
    AlphaNumeric(&'input str),
102
}
103
104
impl<'input> Token<'input> {
105
    /// Check if the current token is a whitespace token.
106
0
    pub fn is_whitespace(&self) -> bool {
107
0
        match *self {
108
0
            Whitespace(..) => true,
109
0
            _ => false,
110
        }
111
0
    }
112
113
    /// Check if the current token is a wildcard token.
114
0
    pub fn is_wildcard(&self) -> bool {
115
0
        match *self {
116
0
            Star | AlphaNumeric("X") | AlphaNumeric("x") => true,
117
0
            _ => false,
118
        }
119
0
    }
120
}
121
122
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
123
pub enum Error {
124
    /// Unexpected character.
125
    UnexpectedChar(char),
126
}
127
128
/// Lexer for semver tokens belonging to a range.
129
#[derive(Debug)]
130
pub struct Lexer<'input> {
131
    input: &'input str,
132
    chars: str::CharIndices<'input>,
133
    // lookahead
134
    c1: Option<(usize, char)>,
135
    c2: Option<(usize, char)>,
136
}
137
138
impl<'input> Lexer<'input> {
139
    /// Construct a new lexer for the given input.
140
854
    pub fn new(input: &str) -> Lexer {
141
854
        let mut chars = input.char_indices();
142
854
        let c1 = chars.next();
143
854
        let c2 = chars.next();
144
145
854
        Lexer {
146
854
            input,
147
854
            chars,
148
854
            c1,
149
854
            c2,
150
854
        }
151
854
    }
152
153
    /// Shift all lookahead storage by one.
154
44.1M
    fn step(&mut self) {
155
44.1M
        self.c1 = self.c2;
156
44.1M
        self.c2 = self.chars.next();
157
44.1M
    }
158
159
0
    fn step_n(&mut self, n: usize) {
160
0
        for _ in 0..n {
161
0
            self.step();
162
0
        }
163
0
    }
164
165
    /// Peek at the current lookahead character without consuming it.
166
46.8M
    fn one(&mut self) -> Option<(usize, char)> {
167
46.8M
        self.c1
168
46.8M
    }
169
170
    /// Access two characters.
171
7.05M
    fn two(&mut self) -> Option<(usize, char, char)> {
172
7.05M
        self.c1
173
7.05M
            .and_then(|(start, c1)| self.c2.map(|(_, c2)| (start, c1, c2)))
174
7.05M
    }
175
176
    /// Consume a component.
177
    ///
178
    /// A component can either be an alphanumeric or numeric.
179
    /// Does not permit leading zeroes if numeric.
180
2.73M
    fn component(&mut self, start: usize) -> Result<Token<'input>, Error> {
181
34.9M
        let end = scan_while!(self, start, '0'..='9' | 'A'..='Z' | 'a'..='z');
182
2.73M
        let input = &self.input[start..end];
183
184
2.73M
        let mut it = input.chars();
185
2.73M
        let (a, b) = (it.next(), it.next());
186
187
        // exactly zero
188
2.73M
        if a == Some('0') && b.is_none() {
189
26.5k
            return Ok(Numeric(0));
190
2.71M
        }
191
192
2.71M
        if a != Some('0') {
193
2.71M
            if let Ok(numeric) = input.parse::<u64>() {
194
2.05M
                return Ok(Numeric(numeric));
195
657k
            }
196
271
        }
197
198
658k
        Ok(AlphaNumeric(input))
199
2.73M
    }
200
201
    /// Consume whitespace.
202
54
    fn whitespace(&mut self, start: usize) -> Result<Token<'input>, Error> {
203
2.09M
        let end = scan_while!(self, start, ' ' | '\t' | '\n' | '\r');
204
54
        Ok(Whitespace(start, end))
205
54
    }
206
}
207
208
impl<'input> Iterator for Lexer<'input> {
209
    type Item = Result<Token<'input>, Error>;
210
211
7.05M
    fn next(&mut self) -> Option<Self::Item> {
212
        #[allow(clippy::never_loop)]
213
        loop {
214
            // two subsequent char tokens.
215
7.05M
            if let Some((_, a, b)) = self.two() {
216
7.05M
                let two = match (a, b) {
217
0
                    ('<', '=') => Some(LtEq),
218
0
                    ('>', '=') => Some(GtEq),
219
0
                    ('|', '|') => Some(Or),
220
7.05M
                    _ => None,
221
                };
222
223
7.05M
                if let Some(two) = two {
224
0
                    self.step_n(2);
225
0
                    return Some(Ok(two));
226
7.05M
                }
227
91
            }
228
229
            // single char and start of numeric tokens.
230
7.05M
            if let Some((start, c)) = self.one() {
231
7.05M
                let tok = match c {
232
                    ' ' | '\t' | '\n' | '\r' => {
233
54
                        self.step();
234
54
                        return Some(self.whitespace(start));
235
                    }
236
49
                    '=' => Eq,
237
5
                    '>' => Gt,
238
8
                    '<' => Lt,
239
0
                    '^' => Caret,
240
9
                    '~' => Tilde,
241
1.57M
                    '*' => Star,
242
2.49M
                    '.' => Dot,
243
2
                    ',' => Comma,
244
242k
                    '-' => Hyphen,
245
25
                    '+' => Plus,
246
2.74M
                    '0'..='9' | 'a'..='z' | 'A'..='Z' => {
247
2.73M
                        self.step();
248
2.73M
                        return Some(self.component(start));
249
                    }
250
741
                    c => return Some(Err(UnexpectedChar(c))),
251
                };
252
253
4.31M
                self.step();
254
4.31M
                return Some(Ok(tok));
255
59
            };
256
257
59
            return None;
258
        }
259
7.05M
    }
260
}
261
262
#[cfg(test)]
263
mod tests {
264
    use super::*;
265
266
    fn lex(input: &str) -> Vec<Token> {
267
        Lexer::new(input).map(Result::unwrap).collect::<Vec<_>>()
268
    }
269
270
    #[test]
271
    pub fn simple_tokens() {
272
        assert_eq!(
273
            lex("=><<=>=^~*.,-+||"),
274
            vec![Eq, Gt, Lt, LtEq, GtEq, Caret, Tilde, Star, Dot, Comma, Hyphen, Plus, Or,]
275
        );
276
    }
277
278
    #[test]
279
    pub fn whitespace() {
280
        assert_eq!(
281
            lex("  foo \t\n\rbar"),
282
            vec![
283
                Whitespace(0, 2),
284
                AlphaNumeric("foo"),
285
                Whitespace(5, 9),
286
                AlphaNumeric("bar"),
287
            ]
288
        );
289
    }
290
291
    #[test]
292
    pub fn components() {
293
        assert_eq!(lex("42"), vec![Numeric(42)]);
294
        assert_eq!(lex("0"), vec![Numeric(0)]);
295
        assert_eq!(lex("01"), vec![AlphaNumeric("01")]);
296
        assert_eq!(lex("01"), vec![AlphaNumeric("01")]);
297
        assert_eq!(lex("5885644aa"), vec![AlphaNumeric("5885644aa")]);
298
        assert_eq!(lex("beta2"), vec![AlphaNumeric("beta2")]);
299
        assert_eq!(lex("beta.2"), vec![AlphaNumeric("beta"), Dot, Numeric(2)]);
300
    }
301
302
    #[test]
303
    pub fn is_wildcard() {
304
        assert_eq!(Star.is_wildcard(), true);
305
        assert_eq!(AlphaNumeric("x").is_wildcard(), true);
306
        assert_eq!(AlphaNumeric("X").is_wildcard(), true);
307
        assert_eq!(AlphaNumeric("other").is_wildcard(), false);
308
    }
309
310
    #[test]
311
    pub fn empty() {
312
        assert_eq!(lex(""), vec![]);
313
    }
314
315
    #[test]
316
    pub fn numeric_all_numbers() {
317
        let expected: Vec<Token> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
318
            .into_iter()
319
            .map(Numeric)
320
            .collect::<Vec<_>>();
321
322
        let actual: Vec<_> = lex("0 1 2 3 4 5 6 7 8 9")
323
            .into_iter()
324
            .filter(|t| !t.is_whitespace())
325
            .collect();
326
327
        assert_eq!(actual, expected);
328
    }
329
}