/src/wasm-tools/crates/wit-parser/src/ast/lex.rs
Line | Count | Source |
1 | | use anyhow::{Result, bail}; |
2 | | use std::char; |
3 | | use std::fmt; |
4 | | use std::str; |
5 | | use unicode_xid::UnicodeXID; |
6 | | |
7 | | use self::Token::*; |
8 | | |
9 | | #[derive(Clone)] |
10 | | pub struct Tokenizer<'a> { |
11 | | input: &'a str, |
12 | | span_offset: u32, |
13 | | chars: CrlfFold<'a>, |
14 | | require_f32_f64: bool, |
15 | | } |
16 | | |
/// Character cursor whose `Iterator` implementation collapses a `\r\n` pair
/// into one `\n` item.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte index, char)` iterator over the source text.
    chars: str::CharIndices<'a>,
}
21 | | |
/// A half-open byte range `[start, end)` identifying where a token is located.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Position of the first byte of the range.
    pub start: u32,
    /// Position one past the last byte of the range (exclusive).
    pub end: u32,
}
30 | | |
/// The kind of a lexed token. The token's text is not stored here; it is
/// recovered from the accompanying `Span`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token {
    // Trivia: produced by `next_raw` and skipped by `next`.
    Whitespace,
    Comment,

    // Punctuation.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    ErrorContext,
    List,
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    // Identifiers: a plain word, or a word introduced by `%`.
    Id,
    ExplicitId,

    // A run of ASCII digits.
    Integer,

    // More keywords.
    Include,
    With,
}
104 | | |
/// Errors reported by the tokenizer. The `u32` carried by each variant is a
/// position in the same coordinate space as `Span`.
// `dead_code` is allowed because not every variant is constructed in this
// file (for example `InvalidEscape`).
#[derive(Debug, PartialEq, Eq)]
#[allow(dead_code)]
pub enum Error {
    /// An identifier contains a character it may not contain.
    InvalidCharInId(u32, char),
    /// An identifier, or one of its `-`-separated parts, is empty.
    IdPartEmpty(u32),
    /// An invalid escape was found in a string.
    InvalidEscape(u32, char),
    /// A character that cannot begin any token.
    Unexpected(u32, char),
    /// A `/* ...` comment that is never closed.
    UnterminatedComment(u32),
    /// A specific token was required but something else was found.
    Wanted {
        at: u32,
        expected: &'static str,
        found: &'static str,
    },
}
119 | | |
/// Value used for the tokenizer's `require_f32_f64` setting when the caller
/// passes `None` and the `WIT_REQUIRE_F32_F64` environment variable cannot be
/// read.
// NB: keep in sync with `crates/wit-component/src/printing.rs`.
const REQUIRE_F32_F64_BY_DEFAULT: bool = true;
122 | | |
123 | | impl<'a> Tokenizer<'a> { |
124 | 20.5k | pub fn new( |
125 | 20.5k | input: &'a str, |
126 | 20.5k | span_offset: u32, |
127 | 20.5k | require_f32_f64: Option<bool>, |
128 | 20.5k | ) -> Result<Tokenizer<'a>> { |
129 | 20.5k | detect_invalid_input(input)?; |
130 | | |
131 | 20.5k | let mut t = Tokenizer { |
132 | 20.5k | input, |
133 | 20.5k | span_offset, |
134 | 20.5k | chars: CrlfFold { |
135 | 20.5k | chars: input.char_indices(), |
136 | 20.5k | }, |
137 | 20.5k | require_f32_f64: require_f32_f64.unwrap_or_else(|| { |
138 | 20.5k | match std::env::var("WIT_REQUIRE_F32_F64") { |
139 | 0 | Ok(s) => s == "1", |
140 | 20.5k | Err(_) => REQUIRE_F32_F64_BY_DEFAULT, |
141 | | } |
142 | 20.5k | }), |
143 | | }; |
144 | | // Eat utf-8 BOM |
145 | 20.5k | t.eatc('\u{feff}'); |
146 | 20.5k | Ok(t) |
147 | 20.5k | } |
148 | | |
149 | 38.1k | pub fn expect_semicolon(&mut self) -> Result<()> { |
150 | 38.1k | self.expect(Token::Semicolon)?; |
151 | 38.1k | Ok(()) |
152 | 38.1k | } |
153 | | |
154 | 536k | pub fn get_span(&self, span: Span) -> &'a str { |
155 | 536k | let start = usize::try_from(span.start - self.span_offset).unwrap(); |
156 | 536k | let end = usize::try_from(span.end - self.span_offset).unwrap(); |
157 | 536k | &self.input[start..end] |
158 | 536k | } |
159 | | |
160 | 172k | pub fn parse_id(&self, span: Span) -> Result<&'a str> { |
161 | 172k | let ret = self.get_span(span); |
162 | 172k | validate_id(span.start, &ret)?; |
163 | 172k | Ok(ret) |
164 | 172k | } |
165 | | |
166 | 234k | pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> { |
167 | 234k | let token = self.get_span(span); |
168 | 234k | let id_part = token.strip_prefix('%').unwrap(); |
169 | 234k | validate_id(span.start, id_part)?; |
170 | 234k | Ok(id_part) |
171 | 234k | } |
172 | | |
173 | 3.56M | pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> { |
174 | | loop { |
175 | 4.15M | match self.next_raw()? { |
176 | 591k | Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {} |
177 | 3.56M | other => break Ok(other), |
178 | | } |
179 | | } |
180 | 3.56M | } |
181 | | |
182 | | /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an |
183 | | /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more |
184 | | /// tokens available. |
185 | 5.17M | pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> { |
186 | 5.17M | let (str_start, ch) = match self.chars.next() { |
187 | 5.13M | Some(pair) => pair, |
188 | 42.2k | None => return Ok(None), |
189 | | }; |
190 | 5.13M | let start = self.span_offset + u32::try_from(str_start).unwrap(); |
191 | 5.13M | let token = match ch { |
192 | | '\n' | '\t' | ' ' => { |
193 | | // Eat all contiguous whitespace tokens |
194 | 1.68M | while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {} |
195 | 1.06M | Whitespace |
196 | | } |
197 | | '/' => { |
198 | | // Eat a line comment if it's `//...` |
199 | 7.95k | if self.eatc('/') { |
200 | 0 | for (_, ch) in &mut self.chars { |
201 | 0 | if ch == '\n' { |
202 | 0 | break; |
203 | 0 | } |
204 | | } |
205 | 0 | Comment |
206 | | // eat a block comment if it's `/*...` |
207 | 7.95k | } else if self.eatc('*') { |
208 | 0 | let mut depth = 1; |
209 | 0 | while depth > 0 { |
210 | 0 | let (_, ch) = match self.chars.next() { |
211 | 0 | Some(pair) => pair, |
212 | 0 | None => return Err(Error::UnterminatedComment(start)), |
213 | | }; |
214 | 0 | match ch { |
215 | 0 | '/' if self.eatc('*') => depth += 1, |
216 | 0 | '*' if self.eatc('/') => depth -= 1, |
217 | 0 | _ => {} |
218 | | } |
219 | | } |
220 | 0 | Comment |
221 | | } else { |
222 | 7.95k | Slash |
223 | | } |
224 | | } |
225 | 6.86k | '=' => Equals, |
226 | 358k | ',' => Comma, |
227 | 108k | ':' => Colon, |
228 | 116k | '.' => Period, |
229 | 81.3k | ';' => Semicolon, |
230 | 40.2k | '(' => LeftParen, |
231 | 68.1k | ')' => RightParen, |
232 | 120k | '{' => LeftBrace, |
233 | 228k | '}' => RightBrace, |
234 | 202k | '<' => LessThan, |
235 | 234k | '>' => GreaterThan, |
236 | 0 | '*' => Star, |
237 | 31.4k | '@' => At, |
238 | | '-' => { |
239 | 33.7k | if self.eatc('>') { |
240 | 15.7k | RArrow |
241 | | } else { |
242 | 18.0k | Minus |
243 | | } |
244 | | } |
245 | 34.0k | '+' => Plus, |
246 | | '%' => { |
247 | 482k | let mut iter = self.chars.clone(); |
248 | 482k | if let Some((_, ch)) = iter.next() { |
249 | 482k | if is_keylike_start(ch) { |
250 | 482k | self.chars = iter.clone(); |
251 | 3.47M | while let Some((_, ch)) = iter.next() { |
252 | 3.47M | if !is_keylike_continue(ch) { |
253 | 482k | break; |
254 | 2.99M | } |
255 | 2.99M | self.chars = iter.clone(); |
256 | | } |
257 | 0 | } |
258 | 0 | } |
259 | 482k | ExplicitId |
260 | | } |
261 | 1.91M | ch if is_keylike_start(ch) => { |
262 | 1.66M | let remaining = self.chars.chars.as_str().len(); |
263 | 1.66M | let mut iter = self.chars.clone(); |
264 | 9.42M | while let Some((_, ch)) = iter.next() { |
265 | 9.42M | if !is_keylike_continue(ch) { |
266 | 1.66M | break; |
267 | 7.75M | } |
268 | 7.75M | self.chars = iter.clone(); |
269 | | } |
270 | 1.66M | let str_end = |
271 | 1.66M | str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len()); |
272 | 1.66M | match &self.input[str_start..str_end] { |
273 | 1.66M | "use" => Use, |
274 | 1.60M | "type" => Type, |
275 | 1.59M | "func" => Func, |
276 | 1.56M | "u8" => U8, |
277 | 1.55M | "u16" => U16, |
278 | 1.55M | "u32" => U32, |
279 | 1.54M | "u64" => U64, |
280 | 1.54M | "s8" => S8, |
281 | 1.53M | "s16" => S16, |
282 | 1.53M | "s32" => S32, |
283 | 1.53M | "s64" => S64, |
284 | 1.52M | "f32" => F32, |
285 | 1.51M | "f64" => F64, |
286 | 1.50M | "float32" if !self.require_f32_f64 => F32, |
287 | 1.50M | "float64" if !self.require_f32_f64 => F64, |
288 | 1.50M | "char" => Char, |
289 | 1.49M | "resource" => Resource, |
290 | 1.48M | "own" => Own, |
291 | 1.47M | "borrow" => Borrow, |
292 | 1.47M | "record" => Record, |
293 | 1.46M | "flags" => Flags, |
294 | 1.45M | "variant" => Variant, |
295 | 1.43M | "enum" => Enum, |
296 | 1.32M | "bool" => Bool, |
297 | 1.14M | "string" => String_, |
298 | 1.14M | "option" => Option_, |
299 | 1.11M | "result" => Result_, |
300 | 1.05M | "future" => Future, |
301 | 1.04M | "stream" => Stream, |
302 | 1.03M | "error-context" => ErrorContext, |
303 | 1.01M | "list" => List, |
304 | 898k | "_" => Underscore, |
305 | 896k | "as" => As, |
306 | 882k | "from" => From_, |
307 | 882k | "static" => Static, |
308 | 880k | "interface" => Interface, |
309 | 613k | "tuple" => Tuple, |
310 | 561k | "world" => World, |
311 | 472k | "import" => Import, |
312 | 453k | "export" => Export, |
313 | 440k | "package" => Package, |
314 | 408k | "constructor" => Constructor, |
315 | 405k | "include" => Include, |
316 | 405k | "with" => With, |
317 | 405k | "async" => Async, |
318 | 388k | _ => Id, |
319 | | } |
320 | | } |
321 | | |
322 | 242k | ch if ch.is_ascii_digit() => { |
323 | 242k | let mut iter = self.chars.clone(); |
324 | 247k | while let Some((_, ch)) = iter.next() { |
325 | 247k | if !ch.is_ascii_digit() { |
326 | 242k | break; |
327 | 4.82k | } |
328 | 4.82k | self.chars = iter.clone(); |
329 | | } |
330 | | |
331 | 242k | Integer |
332 | | } |
333 | | |
334 | 0 | ch => return Err(Error::Unexpected(start, ch)), |
335 | | }; |
336 | 5.13M | let end = match self.chars.clone().next() { |
337 | 5.11M | Some((i, _)) => i, |
338 | 19.9k | None => self.input.len(), |
339 | | }; |
340 | | |
341 | 5.13M | let end = self.span_offset + u32::try_from(end).unwrap(); |
342 | 5.13M | Ok(Some((Span { start, end }, token))) |
343 | 5.17M | } |
344 | | |
345 | 1.29M | pub fn eat(&mut self, expected: Token) -> Result<bool, Error> { |
346 | 1.29M | let mut other = self.clone(); |
347 | 1.29M | match other.next()? { |
348 | 1.29M | Some((_span, found)) if expected == found => { |
349 | 666k | *self = other; |
350 | 666k | Ok(true) |
351 | | } |
352 | 629k | Some(_) => Ok(false), |
353 | 591 | None => Ok(false), |
354 | | } |
355 | 1.29M | } |
356 | | |
357 | 967k | pub fn expect(&mut self, expected: Token) -> Result<Span, Error> { |
358 | 967k | match self.next()? { |
359 | 967k | Some((span, found)) => { |
360 | 967k | if expected == found { |
361 | 967k | Ok(span) |
362 | | } else { |
363 | 0 | Err(Error::Wanted { |
364 | 0 | at: span.start, |
365 | 0 | expected: expected.describe(), |
366 | 0 | found: found.describe(), |
367 | 0 | }) |
368 | | } |
369 | | } |
370 | 0 | None => Err(Error::Wanted { |
371 | 0 | at: self.span_offset + u32::try_from(self.input.len()).unwrap(), |
372 | 0 | expected: expected.describe(), |
373 | 0 | found: "eof", |
374 | 0 | }), |
375 | | } |
376 | 967k | } |
377 | | |
378 | 4.13M | fn eatc(&mut self, ch: char) -> bool { |
379 | 4.13M | let mut iter = self.chars.clone(); |
380 | 4.13M | match iter.next() { |
381 | 4.07M | Some((_, ch2)) if ch == ch2 => { |
382 | 635k | self.chars = iter; |
383 | 635k | true |
384 | | } |
385 | 3.49M | _ => false, |
386 | | } |
387 | 4.13M | } |
388 | | |
389 | 0 | pub fn eof_span(&self) -> Span { |
390 | 0 | let end = self.span_offset + u32::try_from(self.input.len()).unwrap(); |
391 | 0 | Span { start: end, end } |
392 | 0 | } |
393 | | } |
394 | | |
395 | | impl<'a> Iterator for CrlfFold<'a> { |
396 | | type Item = (usize, char); |
397 | | |
398 | 28.0M | fn next(&mut self) -> Option<(usize, char)> { |
399 | 28.0M | self.chars.next().map(|(i, c)| { |
400 | 27.9M | if c == '\r' { |
401 | 0 | let mut attempt = self.chars.clone(); |
402 | 0 | if let Some((_, '\n')) = attempt.next() { |
403 | 0 | self.chars = attempt; |
404 | 0 | return (i, '\n'); |
405 | 0 | } |
406 | 27.9M | } |
407 | 27.9M | (i, c) |
408 | 27.9M | }) |
409 | 28.0M | } |
410 | | } |
411 | | |
412 | 20.5k | fn detect_invalid_input(input: &str) -> Result<()> { |
413 | | // Disallow specific codepoints. |
414 | 20.5k | let mut line = 1; |
415 | 9.27M | for ch in input.chars() { |
416 | 8.76M | match ch { |
417 | 513k | '\n' => line += 1, |
418 | 0 | '\r' | '\t' => {} |
419 | | |
420 | | // Bidirectional override codepoints can be used to craft source code that |
421 | | // appears to have a different meaning than its actual meaning. See |
422 | | // [CVE-2021-42574] for background and motivation. |
423 | | // |
424 | | // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574 |
425 | | '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}' |
426 | | | '\u{2067}' | '\u{2068}' | '\u{2069}' => { |
427 | 0 | bail!( |
428 | 0 | "Input contains bidirectional override codepoint {:?} at line {}", |
429 | 0 | ch.escape_unicode(), |
430 | | line |
431 | | ); |
432 | | } |
433 | | |
434 | | // Disallow several characters which are deprecated or discouraged in Unicode. |
435 | | // |
436 | | // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs. |
437 | | // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks. |
438 | | // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels. |
439 | | // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see |
440 | | // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged. |
441 | | '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}' |
442 | | | '\u{17b4}' | '\u{17b5}' => { |
443 | 0 | bail!( |
444 | 0 | "Codepoint {:?} at line {} is discouraged by Unicode", |
445 | 0 | ch.escape_unicode(), |
446 | | line |
447 | | ); |
448 | | } |
449 | | |
450 | | // Disallow control codes other than the ones explicitly recognized above, |
451 | | // so that viewing a wit file on a terminal doesn't have surprising side |
452 | | // effects or appear to have a different meaning than its actual meaning. |
453 | 8.76M | ch if ch.is_control() => { |
454 | 0 | bail!("Control code '{}' at line {}", ch.escape_unicode(), line); |
455 | | } |
456 | | |
457 | 8.76M | _ => {} |
458 | | } |
459 | | } |
460 | | |
461 | 20.5k | Ok(()) |
462 | 20.5k | } |
463 | | |
464 | 2.39M | fn is_keylike_start(ch: char) -> bool { |
465 | | // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars, |
466 | | // but we'll diagnose that after we've lexed the full string. |
467 | 2.39M | UnicodeXID::is_xid_start(ch) || ch == '_' || ch == '-' |
468 | 2.39M | } |
469 | | |
470 | 12.8M | fn is_keylike_continue(ch: char) -> bool { |
471 | | // Lex any XID continue (which includes `_`) or '-'. |
472 | 12.8M | UnicodeXID::is_xid_continue(ch) || ch == '-' |
473 | 12.8M | } |
474 | | |
475 | 407k | pub fn validate_id(start: u32, id: &str) -> Result<(), Error> { |
476 | | // IDs must have at least one part. |
477 | 407k | if id.is_empty() { |
478 | 0 | return Err(Error::IdPartEmpty(start)); |
479 | 407k | } |
480 | | |
481 | | // Ids consist of parts separated by '-'s. |
482 | 407k | for part in id.split('-') { |
483 | | // Parts must be non-empty and contain either all ASCII lowercase or |
484 | | // all ASCII uppercase. |
485 | 407k | let upper = match part.chars().next() { |
486 | 0 | None => return Err(Error::IdPartEmpty(start)), |
487 | 407k | Some(first) => { |
488 | 407k | if first.is_ascii_lowercase() { |
489 | 407k | false |
490 | 0 | } else if first.is_ascii_uppercase() { |
491 | 0 | true |
492 | | } else { |
493 | 0 | return Err(Error::InvalidCharInId(start, first)); |
494 | | } |
495 | | } |
496 | | }; |
497 | | |
498 | 2.76M | for ch in part.chars() { |
499 | 2.76M | if ch.is_ascii_digit() { |
500 | 385k | // Digits are accepted in both uppercase and lowercase segments. |
501 | 2.38M | } else if upper { |
502 | 0 | if !ch.is_ascii_uppercase() { |
503 | 0 | return Err(Error::InvalidCharInId(start, ch)); |
504 | 0 | } |
505 | 2.38M | } else if !ch.is_ascii_lowercase() { |
506 | 0 | return Err(Error::InvalidCharInId(start, ch)); |
507 | 2.38M | } |
508 | | } |
509 | | } |
510 | | |
511 | 407k | Ok(()) |
512 | 407k | } |
513 | | |
514 | | impl Token { |
515 | 0 | pub fn describe(&self) -> &'static str { |
516 | 0 | match self { |
517 | 0 | Whitespace => "whitespace", |
518 | 0 | Comment => "a comment", |
519 | 0 | Equals => "'='", |
520 | 0 | Comma => "','", |
521 | 0 | Colon => "':'", |
522 | 0 | Period => "'.'", |
523 | 0 | Semicolon => "';'", |
524 | 0 | LeftParen => "'('", |
525 | 0 | RightParen => "')'", |
526 | 0 | LeftBrace => "'{'", |
527 | 0 | RightBrace => "'}'", |
528 | 0 | LessThan => "'<'", |
529 | 0 | GreaterThan => "'>'", |
530 | 0 | Use => "keyword `use`", |
531 | 0 | Type => "keyword `type`", |
532 | 0 | Func => "keyword `func`", |
533 | 0 | U8 => "keyword `u8`", |
534 | 0 | U16 => "keyword `u16`", |
535 | 0 | U32 => "keyword `u32`", |
536 | 0 | U64 => "keyword `u64`", |
537 | 0 | S8 => "keyword `s8`", |
538 | 0 | S16 => "keyword `s16`", |
539 | 0 | S32 => "keyword `s32`", |
540 | 0 | S64 => "keyword `s64`", |
541 | 0 | F32 => "keyword `f32`", |
542 | 0 | F64 => "keyword `f64`", |
543 | 0 | Char => "keyword `char`", |
544 | 0 | Own => "keyword `own`", |
545 | 0 | Borrow => "keyword `borrow`", |
546 | 0 | Resource => "keyword `resource`", |
547 | 0 | Record => "keyword `record`", |
548 | 0 | Flags => "keyword `flags`", |
549 | 0 | Variant => "keyword `variant`", |
550 | 0 | Enum => "keyword `enum`", |
551 | 0 | Bool => "keyword `bool`", |
552 | 0 | String_ => "keyword `string`", |
553 | 0 | Option_ => "keyword `option`", |
554 | 0 | Result_ => "keyword `result`", |
555 | 0 | Future => "keyword `future`", |
556 | 0 | Stream => "keyword `stream`", |
557 | 0 | ErrorContext => "keyword `error-context`", |
558 | 0 | List => "keyword `list`", |
559 | 0 | Underscore => "keyword `_`", |
560 | 0 | Id => "an identifier", |
561 | 0 | ExplicitId => "an '%' identifier", |
562 | 0 | RArrow => "`->`", |
563 | 0 | Star => "`*`", |
564 | 0 | At => "`@`", |
565 | 0 | Slash => "`/`", |
566 | 0 | Plus => "`+`", |
567 | 0 | Minus => "`-`", |
568 | 0 | As => "keyword `as`", |
569 | 0 | From_ => "keyword `from`", |
570 | 0 | Static => "keyword `static`", |
571 | 0 | Interface => "keyword `interface`", |
572 | 0 | Tuple => "keyword `tuple`", |
573 | 0 | Import => "keyword `import`", |
574 | 0 | Export => "keyword `export`", |
575 | 0 | World => "keyword `world`", |
576 | 0 | Package => "keyword `package`", |
577 | 0 | Constructor => "keyword `constructor`", |
578 | 0 | Integer => "an integer", |
579 | 0 | Include => "keyword `include`", |
580 | 0 | With => "keyword `with`", |
581 | 0 | Async => "keyword `async`", |
582 | | } |
583 | 0 | } |
584 | | } |
585 | | |
586 | | impl std::error::Error for Error {} |
587 | | |
588 | | impl fmt::Display for Error { |
589 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
590 | 0 | match self { |
591 | 0 | Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"), |
592 | 0 | Error::UnterminatedComment(_) => write!(f, "unterminated block comment"), |
593 | | Error::Wanted { |
594 | 0 | expected, found, .. |
595 | 0 | } => write!(f, "expected {expected}, found {found}"), |
596 | 0 | Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"), |
597 | 0 | Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"), |
598 | 0 | Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"), |
599 | | } |
600 | 0 | } |
601 | | } |
602 | | |
/// Table-driven check of which strings `validate_id` accepts and rejects.
#[test]
fn test_validate_id() {
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    for id in accepted {
        validate_id(0, id).unwrap();
    }

    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "apple-0",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
    ];
    for id in rejected {
        assert!(validate_id(0, id).is_err(), "{id:?} should be rejected");
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
660 | | |
/// Checks the token kinds produced for representative inputs, and that
/// forbidden codepoints are rejected.
#[test]
fn test_tokenizer() {
    // Lexes `src` to completion, keeping only the token kinds.
    fn collect(src: &str) -> Result<Vec<Token>> {
        let mut lexer = Tokenizer::new(src, 0, None)?;
        let mut kinds = Vec::new();
        while let Some((_span, kind)) = lexer.next()? {
            kinds.push(kind);
        }
        Ok(kinds)
    }
    let lex = |src: &str| collect(src).unwrap();

    assert_eq!(lex(""), vec![]);
    assert_eq!(lex("_"), vec![Token::Underscore]);
    assert_eq!(lex("apple pear"), vec![Token::Id, Token::Id]);

    // Inputs lexed as one plain identifier. Validity is not checked by the lexer.
    let single_ids = [
        "apple",
        "apple-pear",
        "apple--pear",
        "apple-Pear",
        "apple-pear-grape",
        "_a_p_p_l_e_",
        "garçon",
        "hühnervögel",
        "москва",
        "東京",
        "garçon-hühnervögel-москва-東京",
        "a0",
        "a",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    for src in single_ids {
        assert_eq!(lex(src), vec![Token::Id], "input {src:?}");
    }

    // `%`-prefixed words, including a bare `%`, are explicit identifiers.
    for src in ["%a", "%a-a", "%bool", "%"] {
        assert_eq!(lex(src), vec![Token::ExplicitId], "input {src:?}");
    }

    // Keywords, alone and in context.
    assert_eq!(lex("func"), vec![Token::Func]);
    assert_eq!(
        lex("a: func()"),
        vec![
            Token::Id,
            Token::Colon,
            Token::Func,
            Token::LeftParen,
            Token::RightParen
        ]
    );

    assert_eq!(lex("resource"), vec![Token::Resource]);

    assert_eq!(lex("own"), vec![Token::Own]);
    assert_eq!(
        lex("own<some-id>"),
        vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan]
    );

    assert_eq!(lex("borrow"), vec![Token::Borrow]);
    assert_eq!(
        lex("borrow<some-id>"),
        vec![
            Token::Borrow,
            Token::LessThan,
            Token::Id,
            Token::GreaterThan
        ]
    );

    // Inputs that must be rejected, each with the reason it is forbidden.
    let forbidden = [
        ("\u{149}", "strongly discouraged"),
        ("\u{673}", "strongly discouraged"),
        ("\u{17a3}", "strongly discouraged"),
        ("\u{17a4}", "strongly discouraged"),
        ("\u{202a}", "bidirectional override"),
        ("\u{2068}", "bidirectional override"),
        ("\u{0}", "control code"),
        ("\u{b}", "control code"),
        ("\u{c}", "control code"),
        ("\u{85}", "control code"),
    ];
    for (src, why) in forbidden {
        assert!(collect(src).is_err(), "{}", why);
    }
}
743 | | assert!(collect("\u{85}").is_err(), "control code"); |
744 | | } |