/src/wasm-tools/crates/wit-parser/src/ast/lex.rs
Line | Count | Source |
1 | | #[cfg(test)] |
2 | | use alloc::{vec, vec::Vec}; |
3 | | use anyhow::{Result, bail}; |
4 | | use core::char; |
5 | | use core::fmt; |
6 | | use core::str; |
7 | | use unicode_xid::UnicodeXID; |
8 | | |
9 | | use self::Token::*; |
10 | | |
11 | | #[derive(Clone)] |
12 | | pub struct Tokenizer<'a> { |
13 | | input: &'a str, |
14 | | span_offset: u32, |
15 | | chars: CrlfFold<'a>, |
16 | | } |
17 | | |
/// Character cursor wrapping `str::CharIndices`.
///
/// The wrapper exists so that an `Iterator` implementation can present a
/// `\r\n` pair as a single `\n` item.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte index, char)` iterator over the source text.
    chars: str::CharIndices<'a>,
}
22 | | |
/// A span, designating a range of bytes where a token is located.
///
/// `u32::MAX` is reserved as a sentinel: a span whose `start` or `end` equals
/// it is treated as "unknown" (e.g., decoded from binary).
#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)]
pub struct Span {
    start: u32,
    end: u32,
}

impl Default for Span {
    /// The default span is the unknown span (both offsets are the sentinel).
    fn default() -> Self {
        Self {
            start: Self::UNKNOWN,
            end: Self::UNKNOWN,
        }
    }
}

impl Span {
    /// Sentinel offset marking a span without a known source location.
    const UNKNOWN: u32 = u32::MAX;

    /// Builds a span covering `start..end`.
    ///
    /// # Panics
    ///
    /// Panics if either offset is `u32::MAX`, which is reserved for unknown
    /// spans.
    pub fn new(start: u32, end: u32) -> Span {
        let candidate = Span { start, end };
        assert!(candidate.is_known(), "cannot create a span with u32::MAX");
        candidate
    }

    /// Shifts both ends of a known span by `offset` bytes; unknown spans are
    /// left untouched.
    pub fn adjust(&mut self, offset: u32) {
        if !self.is_known() {
            return;
        }
        self.start += offset;
        self.end += offset;
    }

    /// Returns the start offset.
    ///
    /// # Panics
    ///
    /// Panics if this is an unknown span.
    pub fn start(&self) -> u32 {
        assert!(self.is_known(), "cannot get start of unknown span");
        self.start
    }

    /// Returns the end offset.
    ///
    /// # Panics
    ///
    /// Panics if this is an unknown span.
    pub fn end(&self) -> u32 {
        assert!(self.is_known(), "cannot get end of unknown span");
        self.end
    }

    /// Sets the end offset. An unknown span becomes a zero-width span located
    /// at `new_end`.
    pub fn set_end(&mut self, new_end: u32) {
        let was_known = self.is_known();
        self.end = new_end;
        if !was_known {
            self.start = new_end;
        }
    }

    /// Sets the start offset. An unknown span becomes a zero-width span
    /// located at `new_start`.
    pub fn set_start(&mut self, new_start: u32) {
        let was_known = self.is_known();
        self.start = new_start;
        if !was_known {
            self.end = new_start;
        }
    }

    /// Returns true if this span has a known source location, i.e. neither
    /// offset is the sentinel.
    pub fn is_known(&self) -> bool {
        self.start != Self::UNKNOWN && self.end != Self::UNKNOWN
    }
}
90 | | |
/// The kind of a lexed token. The text of a token is not stored here; it is
/// recovered from the source through the token's span.
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
pub enum Token {
    // Trivia.
    Whitespace,
    Comment,

    // Punctuation and operators.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords. A trailing underscore avoids clashing with a Rust name.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    ErrorContext,
    List,
    Map,
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    /// A keyword-like word that is not a keyword.
    Id,
    /// A `%`-prefixed identifier.
    ExplicitId,

    /// A run of ASCII digits.
    Integer,

    // Further keywords.
    Include,
    With,
}
165 | | |
/// Errors produced while lexing or validating identifiers.
///
/// The `u32` carried by each variant is a source offset in the same
/// coordinate space as span offsets.
#[derive(Eq, PartialEq, Debug)]
// NOTE(review): presumably needed because some payloads/variants (e.g.
// `InvalidEscape`) are never read or constructed in this file -- confirm.
#[allow(dead_code)]
pub enum Error {
    /// An identifier contains a character that is not allowed at its position.
    InvalidCharInId(u32, char),
    /// An identifier, or one of its `-`-separated parts, is empty.
    IdPartEmpty(u32),
    /// An invalid escape was found in a string.
    InvalidEscape(u32, char),
    /// A character that cannot start any token.
    Unexpected(u32, char),
    /// A `/*` block comment was still open at end of input.
    UnterminatedComment(u32),
    /// A specific token was required but something else was found.
    Wanted {
        /// Offset at which the mismatch was detected.
        at: u32,
        /// Description of the token that was required.
        expected: &'static str,
        /// Description of what was actually found.
        found: &'static str,
    },
}
180 | | |
181 | | impl<'a> Tokenizer<'a> { |
182 | 22.0k | pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>> { |
183 | 22.0k | detect_invalid_input(input)?; |
184 | | |
185 | 22.0k | let mut t = Tokenizer { |
186 | 22.0k | input, |
187 | 22.0k | span_offset, |
188 | 22.0k | chars: CrlfFold { |
189 | 22.0k | chars: input.char_indices(), |
190 | 22.0k | }, |
191 | 22.0k | }; |
192 | | // Eat utf-8 BOM |
193 | 22.0k | t.eatc('\u{feff}'); |
194 | 22.0k | Ok(t) |
195 | 22.0k | } |
196 | | |
197 | 43.3k | pub fn expect_semicolon(&mut self) -> Result<()> { |
198 | 43.3k | self.expect(Token::Semicolon)?; |
199 | 43.3k | Ok(()) |
200 | 43.3k | } |
201 | | |
202 | 566k | pub fn get_span(&self, span: Span) -> &'a str { |
203 | 566k | let start = usize::try_from(span.start() - self.span_offset).unwrap(); |
204 | 566k | let end = usize::try_from(span.end() - self.span_offset).unwrap(); |
205 | 566k | &self.input[start..end] |
206 | 566k | } |
207 | | |
208 | 170k | pub fn parse_id(&self, span: Span) -> Result<&'a str> { |
209 | 170k | let ret = self.get_span(span); |
210 | 170k | validate_id(span.start(), &ret)?; |
211 | 170k | Ok(ret) |
212 | 170k | } |
213 | | |
214 | 263k | pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> { |
215 | 263k | let token = self.get_span(span); |
216 | 263k | let id_part = token.strip_prefix('%').unwrap(); |
217 | 263k | validate_id(span.start(), id_part)?; |
218 | 263k | Ok(id_part) |
219 | 263k | } |
220 | | |
221 | 3.75M | pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> { |
222 | | loop { |
223 | 4.36M | match self.next_raw()? { |
224 | 616k | Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {} |
225 | 3.75M | other => break Ok(other), |
226 | | } |
227 | | } |
228 | 3.75M | } |
229 | | |
230 | | /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an |
231 | | /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more |
232 | | /// tokens available. |
233 | 5.42M | pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> { |
234 | 5.42M | let (str_start, ch) = match self.chars.next() { |
235 | 5.37M | Some(pair) => pair, |
236 | 45.5k | None => return Ok(None), |
237 | | }; |
238 | 5.37M | let start = self.span_offset + u32::try_from(str_start).unwrap(); |
239 | 5.37M | let token = match ch { |
240 | | '\n' | '\t' | ' ' => { |
241 | | // Eat all contiguous whitespace tokens |
242 | 1.74M | while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {} |
243 | 1.10M | Whitespace |
244 | | } |
245 | | '/' => { |
246 | | // Eat a line comment if it's `//...` |
247 | 10.9k | if self.eatc('/') { |
248 | 0 | for (_, ch) in &mut self.chars { |
249 | 0 | if ch == '\n' { |
250 | 0 | break; |
251 | 0 | } |
252 | | } |
253 | 0 | Comment |
254 | | // eat a block comment if it's `/*...` |
255 | 10.9k | } else if self.eatc('*') { |
256 | 0 | let mut depth = 1; |
257 | 0 | while depth > 0 { |
258 | 0 | let (_, ch) = match self.chars.next() { |
259 | 0 | Some(pair) => pair, |
260 | 0 | None => return Err(Error::UnterminatedComment(start)), |
261 | | }; |
262 | 0 | match ch { |
263 | 0 | '/' if self.eatc('*') => depth += 1, |
264 | 0 | '*' if self.eatc('/') => depth -= 1, |
265 | 0 | _ => {} |
266 | | } |
267 | | } |
268 | 0 | Comment |
269 | | } else { |
270 | 10.9k | Slash |
271 | | } |
272 | | } |
273 | 6.98k | '=' => Equals, |
274 | 361k | ',' => Comma, |
275 | 116k | ':' => Colon, |
276 | 138k | '.' => Period, |
277 | 89.4k | ';' => Semicolon, |
278 | 48.6k | '(' => LeftParen, |
279 | 77.8k | ')' => RightParen, |
280 | 126k | '{' => LeftBrace, |
281 | 238k | '}' => RightBrace, |
282 | 202k | '<' => LessThan, |
283 | 234k | '>' => GreaterThan, |
284 | 0 | '*' => Star, |
285 | 35.0k | '@' => At, |
286 | | '-' => { |
287 | 37.2k | if self.eatc('>') { |
288 | 16.4k | RArrow |
289 | | } else { |
290 | 20.7k | Minus |
291 | | } |
292 | | } |
293 | 39.5k | '+' => Plus, |
294 | | '%' => { |
295 | 529k | let mut iter = self.chars.clone(); |
296 | 529k | if let Some((_, ch)) = iter.next() { |
297 | 529k | if is_keylike_start(ch) { |
298 | 529k | self.chars = iter.clone(); |
299 | 4.28M | while let Some((_, ch)) = iter.next() { |
300 | 4.28M | if !is_keylike_continue(ch) { |
301 | 529k | break; |
302 | 3.75M | } |
303 | 3.75M | self.chars = iter.clone(); |
304 | | } |
305 | 0 | } |
306 | 0 | } |
307 | 529k | ExplicitId |
308 | | } |
309 | 1.97M | ch if is_keylike_start(ch) => { |
310 | 1.71M | let remaining = self.chars.chars.as_str().len(); |
311 | 1.71M | let mut iter = self.chars.clone(); |
312 | 9.80M | while let Some((_, ch)) = iter.next() { |
313 | 9.80M | if !is_keylike_continue(ch) { |
314 | 1.71M | break; |
315 | 8.08M | } |
316 | 8.08M | self.chars = iter.clone(); |
317 | | } |
318 | 1.71M | let str_end = |
319 | 1.71M | str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len()); |
320 | 1.71M | match &self.input[str_start..str_end] { |
321 | 1.71M | "use" => Use, |
322 | 1.63M | "type" => Type, |
323 | 1.62M | "func" => Func, |
324 | 1.59M | "u8" => U8, |
325 | 1.57M | "u16" => U16, |
326 | 1.57M | "u32" => U32, |
327 | 1.57M | "u64" => U64, |
328 | 1.56M | "s8" => S8, |
329 | 1.56M | "s16" => S16, |
330 | 1.55M | "s32" => S32, |
331 | 1.55M | "s64" => S64, |
332 | 1.54M | "f32" => F32, |
333 | 1.53M | "f64" => F64, |
334 | 1.52M | "char" => Char, |
335 | 1.50M | "resource" => Resource, |
336 | 1.49M | "own" => Own, |
337 | 1.49M | "borrow" => Borrow, |
338 | 1.49M | "record" => Record, |
339 | 1.47M | "flags" => Flags, |
340 | 1.47M | "variant" => Variant, |
341 | 1.43M | "enum" => Enum, |
342 | 1.33M | "bool" => Bool, |
343 | 1.15M | "string" => String_, |
344 | 1.15M | "option" => Option_, |
345 | 1.12M | "result" => Result_, |
346 | 1.07M | "future" => Future, |
347 | 1.06M | "stream" => Stream, |
348 | 1.05M | "error-context" => ErrorContext, |
349 | 1.02M | "list" => List, |
350 | 909k | "map" => Map, |
351 | 909k | "_" => Underscore, |
352 | 907k | "as" => As, |
353 | 888k | "from" => From_, |
354 | 888k | "static" => Static, |
355 | 886k | "interface" => Interface, |
356 | 616k | "tuple" => Tuple, |
357 | 564k | "world" => World, |
358 | 476k | "import" => Import, |
359 | 454k | "export" => Export, |
360 | 441k | "package" => Package, |
361 | 406k | "constructor" => Constructor, |
362 | 403k | "include" => Include, |
363 | 403k | "with" => With, |
364 | 403k | "async" => Async, |
365 | 386k | _ => Id, |
366 | | } |
367 | | } |
368 | | |
369 | 263k | ch if ch.is_ascii_digit() => { |
370 | 263k | let mut iter = self.chars.clone(); |
371 | 268k | while let Some((_, ch)) = iter.next() { |
372 | 268k | if !ch.is_ascii_digit() { |
373 | 263k | break; |
374 | 5.29k | } |
375 | 5.29k | self.chars = iter.clone(); |
376 | | } |
377 | | |
378 | 263k | Integer |
379 | | } |
380 | | |
381 | 0 | ch => return Err(Error::Unexpected(start, ch)), |
382 | | }; |
383 | 5.37M | let end = match self.chars.clone().next() { |
384 | 5.35M | Some((i, _)) => i, |
385 | 21.3k | None => self.input.len(), |
386 | | }; |
387 | | |
388 | 5.37M | let end = self.span_offset + u32::try_from(end).unwrap(); |
389 | 5.37M | Ok(Some((Span::new(start, end), token))) |
390 | 5.42M | } |
391 | | |
392 | 1.35M | pub fn eat(&mut self, expected: Token) -> Result<bool, Error> { |
393 | 1.35M | let mut other = self.clone(); |
394 | 1.35M | match other.next()? { |
395 | 1.35M | Some((_span, found)) if expected == found => { |
396 | 699k | *self = other; |
397 | 699k | Ok(true) |
398 | | } |
399 | 659k | Some(_) => Ok(false), |
400 | 713 | None => Ok(false), |
401 | | } |
402 | 1.35M | } |
403 | | |
404 | 1.02M | pub fn expect(&mut self, expected: Token) -> Result<Span, Error> { |
405 | 1.02M | match self.next()? { |
406 | 1.02M | Some((span, found)) => { |
407 | 1.02M | if expected == found { |
408 | 1.02M | Ok(span) |
409 | | } else { |
410 | 0 | Err(Error::Wanted { |
411 | 0 | at: span.start(), |
412 | 0 | expected: expected.describe(), |
413 | 0 | found: found.describe(), |
414 | 0 | }) |
415 | | } |
416 | | } |
417 | 0 | None => Err(Error::Wanted { |
418 | 0 | at: self.span_offset + u32::try_from(self.input.len()).unwrap(), |
419 | 0 | expected: expected.describe(), |
420 | 0 | found: "eof", |
421 | 0 | }), |
422 | | } |
423 | 1.02M | } |
424 | | |
425 | 4.29M | fn eatc(&mut self, ch: char) -> bool { |
426 | 4.29M | let mut iter = self.chars.clone(); |
427 | 4.29M | match iter.next() { |
428 | 4.22M | Some((_, ch2)) if ch == ch2 => { |
429 | 651k | self.chars = iter; |
430 | 651k | true |
431 | | } |
432 | 3.63M | _ => false, |
433 | | } |
434 | 4.29M | } |
435 | | |
436 | 0 | pub fn eof_span(&self) -> Span { |
437 | 0 | let end = self.span_offset + u32::try_from(self.input.len()).unwrap(); |
438 | 0 | Span::new(end, end) |
439 | 0 | } |
440 | | } |
441 | | |
442 | | impl<'a> Iterator for CrlfFold<'a> { |
443 | | type Item = (usize, char); |
444 | | |
445 | 29.9M | fn next(&mut self) -> Option<(usize, char)> { |
446 | 29.9M | self.chars.next().map(|(i, c)| { |
447 | 29.8M | if c == '\r' { |
448 | 0 | let mut attempt = self.chars.clone(); |
449 | 0 | if let Some((_, '\n')) = attempt.next() { |
450 | 0 | self.chars = attempt; |
451 | 0 | return (i, '\n'); |
452 | 0 | } |
453 | 29.8M | } |
454 | 29.8M | (i, c) |
455 | 29.8M | }) |
456 | 29.9M | } |
457 | | } |
458 | | |
459 | 22.0k | fn detect_invalid_input(input: &str) -> Result<()> { |
460 | | // Disallow specific codepoints. |
461 | 22.0k | let mut line = 1; |
462 | 10.1M | for ch in input.chars() { |
463 | 9.60M | match ch { |
464 | 534k | '\n' => line += 1, |
465 | 0 | '\r' | '\t' => {} |
466 | | |
467 | | // Bidirectional override codepoints can be used to craft source code that |
468 | | // appears to have a different meaning than its actual meaning. See |
469 | | // [CVE-2021-42574] for background and motivation. |
470 | | // |
471 | | // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574 |
472 | | '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}' |
473 | | | '\u{2067}' | '\u{2068}' | '\u{2069}' => { |
474 | 0 | bail!( |
475 | | "Input contains bidirectional override codepoint {:?} at line {}", |
476 | 0 | ch.escape_unicode(), |
477 | | line |
478 | | ); |
479 | | } |
480 | | |
481 | | // Disallow several characters which are deprecated or discouraged in Unicode. |
482 | | // |
483 | | // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs. |
484 | | // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks. |
485 | | // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels. |
486 | | // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see |
487 | | // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged. |
488 | | '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}' |
489 | | | '\u{17b4}' | '\u{17b5}' => { |
490 | 0 | bail!( |
491 | | "Codepoint {:?} at line {} is discouraged by Unicode", |
492 | 0 | ch.escape_unicode(), |
493 | | line |
494 | | ); |
495 | | } |
496 | | |
497 | | // Disallow control codes other than the ones explicitly recognized above, |
498 | | // so that viewing a wit file on a terminal doesn't have surprising side |
499 | | // effects or appear to have a different meaning than its actual meaning. |
500 | 9.60M | ch if ch.is_control() => { |
501 | 0 | bail!("Control code '{}' at line {}", ch.escape_unicode(), line); |
502 | | } |
503 | | |
504 | 9.60M | _ => {} |
505 | | } |
506 | | } |
507 | | |
508 | 22.0k | Ok(()) |
509 | 22.0k | } |
510 | | |
511 | 2.50M | fn is_keylike_start(ch: char) -> bool { |
512 | | // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars, |
513 | | // but we'll diagnose that after we've lexed the full string. |
514 | 2.50M | UnicodeXID::is_xid_start(ch) || ch == '_' || ch == '-' |
515 | 2.50M | } |
516 | | |
517 | 14.0M | fn is_keylike_continue(ch: char) -> bool { |
518 | | // Lex any XID continue (which includes `_`) or '-'. |
519 | 14.0M | UnicodeXID::is_xid_continue(ch) || ch == '-' |
520 | 14.0M | } |
521 | | |
522 | 434k | pub fn validate_id(start: u32, id: &str) -> Result<(), Error> { |
523 | | // IDs must have at least one part. |
524 | 434k | if id.is_empty() { |
525 | 0 | return Err(Error::IdPartEmpty(start)); |
526 | 434k | } |
527 | | |
528 | | // Ids consist of parts separated by '-'s. |
529 | 434k | for (idx, part) in id.split('-').enumerate() { |
530 | | // Parts must be non-empty and contain either all ASCII lowercase or |
531 | | // all ASCII uppercase. Non-first segment can also start with a digit. |
532 | 434k | let Some(first_char) = part.chars().next() else { |
533 | 0 | return Err(Error::IdPartEmpty(start)); |
534 | | }; |
535 | 434k | if idx == 0 && !first_char.is_ascii_alphabetic() { |
536 | 0 | return Err(Error::InvalidCharInId(start, first_char)); |
537 | 434k | } |
538 | 434k | let mut upper = None; |
539 | 3.33M | for ch in part.chars() { |
540 | 3.33M | if ch.is_ascii_digit() { |
541 | 401k | // Digits are accepted in both uppercase and lowercase segments. |
542 | 2.93M | } else if ch.is_ascii_uppercase() { |
543 | 0 | if upper.is_none() { |
544 | 0 | upper = Some(true); |
545 | 0 | } else if let Some(false) = upper { |
546 | 0 | return Err(Error::InvalidCharInId(start, ch)); |
547 | 0 | } |
548 | 2.93M | } else if ch.is_ascii_lowercase() { |
549 | 2.93M | if upper.is_none() { |
550 | 434k | upper = Some(false); |
551 | 2.50M | } else if let Some(true) = upper { |
552 | 0 | return Err(Error::InvalidCharInId(start, ch)); |
553 | 2.50M | } |
554 | | } else { |
555 | 0 | return Err(Error::InvalidCharInId(start, ch)); |
556 | | } |
557 | | } |
558 | | } |
559 | | |
560 | 434k | Ok(()) |
561 | 434k | } |
562 | | |
563 | | impl Token { |
564 | 0 | pub fn describe(&self) -> &'static str { |
565 | 0 | match self { |
566 | 0 | Whitespace => "whitespace", |
567 | 0 | Comment => "a comment", |
568 | 0 | Equals => "'='", |
569 | 0 | Comma => "','", |
570 | 0 | Colon => "':'", |
571 | 0 | Period => "'.'", |
572 | 0 | Semicolon => "';'", |
573 | 0 | LeftParen => "'('", |
574 | 0 | RightParen => "')'", |
575 | 0 | LeftBrace => "'{'", |
576 | 0 | RightBrace => "'}'", |
577 | 0 | LessThan => "'<'", |
578 | 0 | GreaterThan => "'>'", |
579 | 0 | Use => "keyword `use`", |
580 | 0 | Type => "keyword `type`", |
581 | 0 | Func => "keyword `func`", |
582 | 0 | U8 => "keyword `u8`", |
583 | 0 | U16 => "keyword `u16`", |
584 | 0 | U32 => "keyword `u32`", |
585 | 0 | U64 => "keyword `u64`", |
586 | 0 | S8 => "keyword `s8`", |
587 | 0 | S16 => "keyword `s16`", |
588 | 0 | S32 => "keyword `s32`", |
589 | 0 | S64 => "keyword `s64`", |
590 | 0 | F32 => "keyword `f32`", |
591 | 0 | F64 => "keyword `f64`", |
592 | 0 | Char => "keyword `char`", |
593 | 0 | Own => "keyword `own`", |
594 | 0 | Borrow => "keyword `borrow`", |
595 | 0 | Resource => "keyword `resource`", |
596 | 0 | Record => "keyword `record`", |
597 | 0 | Flags => "keyword `flags`", |
598 | 0 | Variant => "keyword `variant`", |
599 | 0 | Enum => "keyword `enum`", |
600 | 0 | Bool => "keyword `bool`", |
601 | 0 | String_ => "keyword `string`", |
602 | 0 | Option_ => "keyword `option`", |
603 | 0 | Result_ => "keyword `result`", |
604 | 0 | Future => "keyword `future`", |
605 | 0 | Stream => "keyword `stream`", |
606 | 0 | ErrorContext => "keyword `error-context`", |
607 | 0 | List => "keyword `list`", |
608 | 0 | Map => "keyword `map`", |
609 | 0 | Underscore => "keyword `_`", |
610 | 0 | Id => "an identifier", |
611 | 0 | ExplicitId => "an '%' identifier", |
612 | 0 | RArrow => "`->`", |
613 | 0 | Star => "`*`", |
614 | 0 | At => "`@`", |
615 | 0 | Slash => "`/`", |
616 | 0 | Plus => "`+`", |
617 | 0 | Minus => "`-`", |
618 | 0 | As => "keyword `as`", |
619 | 0 | From_ => "keyword `from`", |
620 | 0 | Static => "keyword `static`", |
621 | 0 | Interface => "keyword `interface`", |
622 | 0 | Tuple => "keyword `tuple`", |
623 | 0 | Import => "keyword `import`", |
624 | 0 | Export => "keyword `export`", |
625 | 0 | World => "keyword `world`", |
626 | 0 | Package => "keyword `package`", |
627 | 0 | Constructor => "keyword `constructor`", |
628 | 0 | Integer => "an integer", |
629 | 0 | Include => "keyword `include`", |
630 | 0 | With => "keyword `with`", |
631 | 0 | Async => "keyword `async`", |
632 | | } |
633 | 0 | } |
634 | | } |
635 | | |
636 | | impl core::error::Error for Error {} |
637 | | |
638 | | impl fmt::Display for Error { |
639 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
640 | 0 | match self { |
641 | 0 | Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"), |
642 | 0 | Error::UnterminatedComment(_) => write!(f, "unterminated block comment"), |
643 | | Error::Wanted { |
644 | 0 | expected, found, .. |
645 | 0 | } => write!(f, "expected {expected}, found {found}"), |
646 | 0 | Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"), |
647 | 0 | Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"), |
648 | 0 | Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"), |
649 | | } |
650 | 0 | } |
651 | | } |
652 | | |
/// Exercises `validate_id` against identifiers that must be accepted and
/// identifiers that must be rejected.
#[test]
fn test_validate_id() {
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
        "apple-0",
        "a0-000-3d4a-54FF",
    ];
    for id in accepted {
        validate_id(0, id).unwrap();
    }

    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
        "a0-000-3d4A-54Ff",
    ];
    for id in rejected {
        assert!(validate_id(0, id).is_err());
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
712 | | |
/// Exercises the tokenizer end to end: token kinds produced for valid input,
/// and rejection of input containing forbidden codepoints.
#[test]
fn test_tokenizer() {
    /// Lexes `s` completely and returns the kinds of its significant tokens.
    fn collect(s: &str) -> Result<Vec<Token>> {
        let mut lexer = Tokenizer::new(s, 0)?;
        let mut kinds = Vec::new();
        loop {
            match lexer.next()? {
                Some((_, kind)) => kinds.push(kind),
                None => return Ok(kinds),
            }
        }
    }

    let lexes: Vec<(&str, Vec<Token>)> = vec![
        ("", vec![]),
        ("_", vec![Token::Underscore]),
        ("apple", vec![Token::Id]),
        ("apple-pear", vec![Token::Id]),
        ("apple--pear", vec![Token::Id]),
        ("apple-Pear", vec![Token::Id]),
        ("apple-pear-grape", vec![Token::Id]),
        ("apple pear", vec![Token::Id, Token::Id]),
        ("_a_p_p_l_e_", vec![Token::Id]),
        ("garçon", vec![Token::Id]),
        ("hühnervögel", vec![Token::Id]),
        ("москва", vec![Token::Id]),
        ("東京", vec![Token::Id]),
        ("garçon-hühnervögel-москва-東京", vec![Token::Id]),
        ("a0", vec![Token::Id]),
        ("a", vec![Token::Id]),
        ("%a", vec![Token::ExplicitId]),
        ("%a-a", vec![Token::ExplicitId]),
        ("%bool", vec![Token::ExplicitId]),
        ("%", vec![Token::ExplicitId]),
        ("APPLE", vec![Token::Id]),
        ("APPLE-PEAR", vec![Token::Id]),
        ("APPLE-PEAR-GRAPE", vec![Token::Id]),
        ("apple-PEAR-grape", vec![Token::Id]),
        ("APPLE-pear-GRAPE", vec![Token::Id]),
        ("ENOENT", vec![Token::Id]),
        ("is-XML", vec![Token::Id]),
        ("func", vec![Token::Func]),
        (
            "a: func()",
            vec![
                Token::Id,
                Token::Colon,
                Token::Func,
                Token::LeftParen,
                Token::RightParen,
            ],
        ),
        ("resource", vec![Token::Resource]),
        ("own", vec![Token::Own]),
        (
            "own<some-id>",
            vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan],
        ),
        ("borrow", vec![Token::Borrow]),
        (
            "borrow<some-id>",
            vec![
                Token::Borrow,
                Token::LessThan,
                Token::Id,
                Token::GreaterThan,
            ],
        ),
    ];
    for (source, expected) in lexes {
        assert_eq!(collect(source).unwrap(), expected);
    }

    let rejected = [
        ("\u{149}", "strongly discouraged"),
        ("\u{673}", "strongly discouraged"),
        ("\u{17a3}", "strongly discouraged"),
        ("\u{17a4}", "strongly discouraged"),
        ("\u{202a}", "bidirectional override"),
        ("\u{2068}", "bidirectional override"),
        ("\u{0}", "control code"),
        ("\u{b}", "control code"),
        ("\u{c}", "control code"),
        ("\u{85}", "control code"),
    ];
    for (source, why) in rejected {
        assert!(collect(source).is_err(), "{}", why);
    }
}