/src/semver-parser/src/lexer.rs
Line | Count | Source |
1 | | //! Lexer for semver ranges. |
2 | | //! |
3 | | //! Breaks a string of input into an iterator of tokens that can be used with a parser. |
4 | | //! |
5 | | //! This should be used with the [`parser`] module. |
6 | | //! |
7 | | //! [`parser`]: ../parser/index.html |
8 | | //! |
9 | | //! # Examples |
10 | | //! |
11 | | //! Example without errors: |
12 | | //! |
13 | | //! ```rust |
14 | | //! use semver_parser::lexer::{Lexer, Token}; |
15 | | //! |
16 | | //! let mut l = Lexer::new("foo 123 *"); |
17 | | //! |
18 | | //! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next()); |
19 | | //! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next()); |
20 | | //! assert_eq!(Some(Ok(Token::Numeric(123))), l.next()); |
21 | | //! assert_eq!(Some(Ok(Token::Whitespace(7, 8))), l.next()); |
22 | | //! assert_eq!(Some(Ok(Token::Star)), l.next()); |
23 | | //! assert_eq!(None, l.next()); |
24 | | //! ``` |
25 | | //! |
26 | | //! Example with error: |
27 | | //! |
28 | | //! ```rust |
29 | | //! use semver_parser::lexer::{Lexer, Token, Error}; |
30 | | //! |
31 | | //! let mut l = Lexer::new("foo / *"); |
32 | | //! |
33 | | //! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next()); |
34 | | //! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next()); |
35 | | //! assert_eq!(Some(Err(Error::UnexpectedChar('/'))), l.next()); |
36 | | //! ``` |
37 | | |
38 | | use self::Error::*; |
39 | | use self::Token::*; |
40 | | use std::str; |
41 | | |
42 | | macro_rules! scan_while { |
43 | | ($slf:expr, $start:expr, $first:pat $(| $rest:pat)*) => {{ |
44 | | let mut __end = $start; |
45 | | |
46 | | loop { |
47 | | if let Some((idx, c)) = $slf.one() { |
48 | | __end = idx; |
49 | | |
50 | | match c { |
51 | | $first $(| $rest)* => $slf.step(), |
52 | | _ => break, |
53 | | } |
54 | | |
55 | | continue; |
56 | | } else { |
57 | | __end = $slf.input.len(); |
58 | | } |
59 | | |
60 | | break; |
61 | | } |
62 | | |
63 | | __end |
64 | | }} |
65 | | } |
66 | | |
/// Semver tokens.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Token<'input> {
    /// `=`
    Eq,
    /// `>`
    Gt,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,
    /// `^`
    Caret,
    /// `~`
    Tilde,
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `-`
    Hyphen,
    /// `+`
    Plus,
    /// `||`
    Or,
    /// Any number of whitespace characters (`\t\r\n `) and its span.
    Whitespace(usize, usize),
    /// Numeric component, like `0` or `42`.
    Numeric(u64),
    /// Alphanumeric component, like `alpha1` or `79deadbe`.
    AlphaNumeric(&'input str),
}
103 | | |
104 | | impl<'input> Token<'input> { |
105 | | /// Check if the current token is a whitespace token. |
106 | 0 | pub fn is_whitespace(&self) -> bool { |
107 | 0 | match *self { |
108 | 0 | Whitespace(..) => true, |
109 | 0 | _ => false, |
110 | | } |
111 | 0 | } |
112 | | |
113 | | /// Check if the current token is a wildcard token. |
114 | 0 | pub fn is_wildcard(&self) -> bool { |
115 | 0 | match *self { |
116 | 0 | Star | AlphaNumeric("X") | AlphaNumeric("x") => true, |
117 | 0 | _ => false, |
118 | | } |
119 | 0 | } |
120 | | } |
121 | | |
/// Errors that can occur while lexing a semver range.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Error {
    /// Unexpected character.
    UnexpectedChar(char),
}
127 | | |
/// Lexer for semver tokens belonging to a range.
#[derive(Debug)]
pub struct Lexer<'input> {
    // Full input being lexed; `AlphaNumeric` tokens borrow slices of it.
    input: &'input str,
    // Iterator over the characters not yet loaded into the lookahead.
    chars: str::CharIndices<'input>,
    // Two-character lookahead, each as (byte offset, char):
    // `c1` is the next character, `c2` the one after it.
    // `None` once the input is exhausted.
    c1: Option<(usize, char)>,
    c2: Option<(usize, char)>,
}
137 | | |
138 | | impl<'input> Lexer<'input> { |
139 | | /// Construct a new lexer for the given input. |
140 | 854 | pub fn new(input: &str) -> Lexer { |
141 | 854 | let mut chars = input.char_indices(); |
142 | 854 | let c1 = chars.next(); |
143 | 854 | let c2 = chars.next(); |
144 | | |
145 | 854 | Lexer { |
146 | 854 | input, |
147 | 854 | chars, |
148 | 854 | c1, |
149 | 854 | c2, |
150 | 854 | } |
151 | 854 | } |
152 | | |
153 | | /// Shift all lookahead storage by one. |
154 | 44.1M | fn step(&mut self) { |
155 | 44.1M | self.c1 = self.c2; |
156 | 44.1M | self.c2 = self.chars.next(); |
157 | 44.1M | } |
158 | | |
159 | 0 | fn step_n(&mut self, n: usize) { |
160 | 0 | for _ in 0..n { |
161 | 0 | self.step(); |
162 | 0 | } |
163 | 0 | } |
164 | | |
165 | | /// Access the one character, or set it if it is not set. |
166 | 46.8M | fn one(&mut self) -> Option<(usize, char)> { |
167 | 46.8M | self.c1 |
168 | 46.8M | } |
169 | | |
170 | | /// Access two characters. |
171 | 7.05M | fn two(&mut self) -> Option<(usize, char, char)> { |
172 | 7.05M | self.c1 |
173 | 7.05M | .and_then(|(start, c1)| self.c2.map(|(_, c2)| (start, c1, c2))) |
174 | 7.05M | } |
175 | | |
176 | | /// Consume a component. |
177 | | /// |
178 | | /// A component can either be an alphanumeric or numeric. |
179 | | /// Does not permit leading zeroes if numeric. |
180 | 2.73M | fn component(&mut self, start: usize) -> Result<Token<'input>, Error> { |
181 | 34.9M | let end = scan_while!(self, start, '0'..='9' | 'A'..='Z' | 'a'..='z'); |
182 | 2.73M | let input = &self.input[start..end]; |
183 | | |
184 | 2.73M | let mut it = input.chars(); |
185 | 2.73M | let (a, b) = (it.next(), it.next()); |
186 | | |
187 | | // exactly zero |
188 | 2.73M | if a == Some('0') && b.is_none() { |
189 | 26.5k | return Ok(Numeric(0)); |
190 | 2.71M | } |
191 | | |
192 | 2.71M | if a != Some('0') { |
193 | 2.71M | if let Ok(numeric) = input.parse::<u64>() { |
194 | 2.05M | return Ok(Numeric(numeric)); |
195 | 657k | } |
196 | 271 | } |
197 | | |
198 | 658k | Ok(AlphaNumeric(input)) |
199 | 2.73M | } |
200 | | |
201 | | /// Consume whitespace. |
202 | 54 | fn whitespace(&mut self, start: usize) -> Result<Token<'input>, Error> { |
203 | 2.09M | let end = scan_while!(self, start, ' ' | '\t' | '\n' | '\r'); |
204 | 54 | Ok(Whitespace(start, end)) |
205 | 54 | } |
206 | | } |
207 | | |
208 | | impl<'input> Iterator for Lexer<'input> { |
209 | | type Item = Result<Token<'input>, Error>; |
210 | | |
211 | 7.05M | fn next(&mut self) -> Option<Self::Item> { |
212 | | #[allow(clippy::never_loop)] |
213 | | loop { |
214 | | // two subsequent char tokens. |
215 | 7.05M | if let Some((_, a, b)) = self.two() { |
216 | 7.05M | let two = match (a, b) { |
217 | 0 | ('<', '=') => Some(LtEq), |
218 | 0 | ('>', '=') => Some(GtEq), |
219 | 0 | ('|', '|') => Some(Or), |
220 | 7.05M | _ => None, |
221 | | }; |
222 | | |
223 | 7.05M | if let Some(two) = two { |
224 | 0 | self.step_n(2); |
225 | 0 | return Some(Ok(two)); |
226 | 7.05M | } |
227 | 91 | } |
228 | | |
229 | | // single char and start of numeric tokens. |
230 | 7.05M | if let Some((start, c)) = self.one() { |
231 | 7.05M | let tok = match c { |
232 | | ' ' | '\t' | '\n' | '\r' => { |
233 | 54 | self.step(); |
234 | 54 | return Some(self.whitespace(start)); |
235 | | } |
236 | 49 | '=' => Eq, |
237 | 5 | '>' => Gt, |
238 | 8 | '<' => Lt, |
239 | 0 | '^' => Caret, |
240 | 9 | '~' => Tilde, |
241 | 1.57M | '*' => Star, |
242 | 2.49M | '.' => Dot, |
243 | 2 | ',' => Comma, |
244 | 242k | '-' => Hyphen, |
245 | 25 | '+' => Plus, |
246 | 2.74M | '0'..='9' | 'a'..='z' | 'A'..='Z' => { |
247 | 2.73M | self.step(); |
248 | 2.73M | return Some(self.component(start)); |
249 | | } |
250 | 741 | c => return Some(Err(UnexpectedChar(c))), |
251 | | }; |
252 | | |
253 | 4.31M | self.step(); |
254 | 4.31M | return Some(Ok(tok)); |
255 | 59 | }; |
256 | | |
257 | 59 | return None; |
258 | | } |
259 | 7.05M | } |
260 | | } |
261 | | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `input` into tokens, panicking on any lexer error.
    fn lex(input: &str) -> Vec<Token> {
        Lexer::new(input).map(Result::unwrap).collect::<Vec<_>>()
    }

    #[test]
    pub fn simple_tokens() {
        assert_eq!(
            lex("=><<=>=^~*.,-+||"),
            vec![Eq, Gt, Lt, LtEq, GtEq, Caret, Tilde, Star, Dot, Comma, Hyphen, Plus, Or,]
        );
    }

    #[test]
    pub fn whitespace() {
        assert_eq!(
            lex(" foo \t\n\rbar"),
            vec![
                Whitespace(0, 2),
                AlphaNumeric("foo"),
                Whitespace(5, 9),
                AlphaNumeric("bar"),
            ]
        );
    }

    #[test]
    pub fn components() {
        assert_eq!(lex("42"), vec![Numeric(42)]);
        assert_eq!(lex("0"), vec![Numeric(0)]);
        // A leading zero forces the alphanumeric interpretation.
        // (The original repeated this assertion twice; the duplicate is removed.)
        assert_eq!(lex("01"), vec![AlphaNumeric("01")]);
        assert_eq!(lex("5885644aa"), vec![AlphaNumeric("5885644aa")]);
        assert_eq!(lex("beta2"), vec![AlphaNumeric("beta2")]);
        assert_eq!(lex("beta.2"), vec![AlphaNumeric("beta"), Dot, Numeric(2)]);
    }

    #[test]
    pub fn is_wildcard() {
        // assert! over assert_eq!(…, true) (clippy::bool_assert_comparison).
        assert!(Star.is_wildcard());
        assert!(AlphaNumeric("x").is_wildcard());
        assert!(AlphaNumeric("X").is_wildcard());
        assert!(!AlphaNumeric("other").is_wildcard());
    }

    #[test]
    pub fn empty() {
        assert_eq!(lex(""), vec![]);
    }

    #[test]
    pub fn numeric_all_numbers() {
        let expected: Vec<Token> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            .into_iter()
            .map(Numeric)
            .collect::<Vec<_>>();

        let actual: Vec<_> = lex("0 1 2 3 4 5 6 7 8 9")
            .into_iter()
            .filter(|t| !t.is_whitespace())
            .collect();

        assert_eq!(actual, expected);
    }
}