/src/wasm-tools/crates/wit-parser/src/ast/lex.rs
Line | Count | Source |
1 | | #[cfg(test)] |
2 | | use alloc::{vec, vec::Vec}; |
3 | | use core::char; |
4 | | use core::fmt; |
5 | | use core::result::Result; |
6 | | use core::str; |
7 | | |
8 | | use self::Token::*; |
9 | | |
/// A lexer over a piece of source text.
///
/// The tokenizer is `Clone`; `eat` uses a clone to look ahead one token
/// without committing to consuming it.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete text being tokenized.
    input: &'a str,
    /// Offset added to byte indices into `input` when spans are produced.
    span_offset: u32,
    /// Cursor over the characters of `input` that have not been consumed yet.
    chars: CrlfFold<'a>,
}
16 | | |
/// Character cursor whose `Iterator` implementation yields a `\r\n` pair as a
/// single `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// Underlying `(byte index, char)` iterator over the source text.
    chars: str::CharIndices<'a>,
}
21 | | |
/// A half-open byte range locating a token in the source text.
///
/// A span whose location is not known (e.g. one decoded from binary) stores
/// the sentinel `u32::MAX` in its offsets.
#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)]
pub struct Span {
    start: u32,
    end: u32,
}

impl Default for Span {
    /// The default span is the unknown span.
    fn default() -> Self {
        Self {
            start: u32::MAX,
            end: u32::MAX,
        }
    }
}

impl Span {
    /// Builds a known span from `start` and `end`.
    ///
    /// # Panics
    ///
    /// Panics if either offset equals the `u32::MAX` sentinel.
    pub fn new(start: u32, end: u32) -> Span {
        assert!(
            start != u32::MAX && end != u32::MAX,
            "cannot create a span with u32::MAX"
        );
        Self { start, end }
    }

    /// Shifts both ends of a known span by `offset`; unknown spans are left
    /// untouched.
    pub fn adjust(&mut self, offset: u32) {
        if !self.is_known() {
            return;
        }
        self.start += offset;
        self.end += offset;
    }

    /// Start offset of the span.
    ///
    /// # Panics
    ///
    /// Panics if the span is unknown.
    pub fn start(&self) -> u32 {
        assert!(self.is_known(), "cannot get start of unknown span");
        self.start
    }

    /// End offset of the span.
    ///
    /// # Panics
    ///
    /// Panics if the span is unknown.
    pub fn end(&self) -> u32 {
        assert!(self.is_known(), "cannot get end of unknown span");
        self.end
    }

    /// Replaces the end offset. An unknown span becomes the zero-width span
    /// located at `new_end`.
    pub fn set_end(&mut self, new_end: u32) {
        let was_unknown = !self.is_known();
        self.end = new_end;
        if was_unknown {
            self.start = new_end;
        }
    }

    /// Replaces the start offset. An unknown span becomes the zero-width span
    /// located at `new_start`.
    pub fn set_start(&mut self, new_start: u32) {
        let was_unknown = !self.is_known();
        self.start = new_start;
        if was_unknown {
            self.end = new_start;
        }
    }

    /// Whether this span refers to a known source location, i.e. neither
    /// offset is the `u32::MAX` sentinel.
    pub fn is_known(&self) -> bool {
        !(self.start == u32::MAX || self.end == u32::MAX)
    }
}
89 | | |
/// The kinds of token produced by `Tokenizer`.
///
/// A token carries no text of its own; the text is recovered from the `Span`
/// returned alongside it.
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
pub enum Token {
    // Trivia: returned by `Tokenizer::next_raw`, skipped by `Tokenizer::next`.
    Whitespace,
    Comment,

    // Punctuation.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords. The spelling recognized for each one is listed in
    // `Tokenizer::next_raw`.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    ErrorContext,
    List,
    Map,
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    // Identifiers: `Id` is a bare identifier, `ExplicitId` is one introduced
    // by a leading `%`.
    Id,
    ExplicitId,

    // A run of ASCII digits.
    Integer,

    // More keywords.
    Include,
    With,
}
164 | | |
/// Errors produced while lexing.
///
/// The `u32` carried by every variant (the `at` field for `Wanted`) is the
/// position returned by `Error::position`.
#[derive(Eq, PartialEq, Debug)]
// NOTE(review): presumably needed because not every variant is constructed
// (`InvalidEscape` is never constructed in this file) — confirm before removing.
#[allow(dead_code)]
pub enum Error {
    /// A control character other than `\n`, `\r` or `\t` appeared in the input.
    ControlCodepoint(u32, char),
    /// A deprecated or discouraged codepoint appeared in the input.
    DeprecatedCodepoint(u32, char),
    /// A bidirectional override codepoint appeared in the input.
    ForbiddenCodepoint(u32, char),
    /// An identifier contained a character that `validate_id` rejects.
    InvalidCharInId(u32, char),
    /// An identifier was empty or had an empty part between `-`s.
    IdPartEmpty(u32),
    /// Displayed as "invalid escape in string".
    InvalidEscape(u32, char),
    /// A character that cannot start any token.
    Unexpected(u32, char),
    /// A `/*` block comment was still open at the end of the input.
    UnterminatedComment(u32),
    /// A specific token was required but something else was found.
    Wanted {
        /// Position at which the expected token should have appeared.
        at: u32,
        /// Description of the token that was required.
        expected: &'static str,
        /// Description of what was found instead (`"eof"` at end of input).
        found: &'static str,
    },
}
182 | | |
183 | | impl<'a> Tokenizer<'a> { |
184 | 21.7k | pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>, Error> { |
185 | 21.7k | detect_invalid_input(input)?; |
186 | | |
187 | 21.7k | let mut t = Tokenizer { |
188 | 21.7k | input, |
189 | 21.7k | span_offset, |
190 | 21.7k | chars: CrlfFold { |
191 | 21.7k | chars: input.char_indices(), |
192 | 21.7k | }, |
193 | 21.7k | }; |
194 | | // Eat utf-8 BOM |
195 | 21.7k | t.eatc('\u{feff}'); |
196 | 21.7k | Ok(t) |
197 | 21.7k | } |
198 | | |
199 | 73.9k | pub fn expect_semicolon(&mut self) -> Result<(), Error> { |
200 | 73.9k | self.expect(Token::Semicolon)?; |
201 | 73.9k | Ok(()) |
202 | 73.9k | } |
203 | | |
204 | 679k | pub fn get_span(&self, span: Span) -> &'a str { |
205 | 679k | let start = usize::try_from(span.start() - self.span_offset).unwrap(); |
206 | 679k | let end = usize::try_from(span.end() - self.span_offset).unwrap(); |
207 | 679k | &self.input[start..end] |
208 | 679k | } |
209 | | |
210 | 197k | pub fn parse_id(&self, span: Span) -> Result<&'a str, Error> { |
211 | 197k | let ret = self.get_span(span); |
212 | 197k | validate_id(span.start(), &ret)?; |
213 | 197k | Ok(ret) |
214 | 197k | } |
215 | | |
216 | 333k | pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str, Error> { |
217 | 333k | let token = self.get_span(span); |
218 | 333k | let id_part = token.strip_prefix('%').unwrap(); |
219 | 333k | validate_id(span.start(), id_part)?; |
220 | 333k | Ok(id_part) |
221 | 333k | } |
222 | | |
223 | 4.60M | pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> { |
224 | | loop { |
225 | 5.37M | match self.next_raw()? { |
226 | 764k | Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {} |
227 | 4.60M | other => break Ok(other), |
228 | | } |
229 | | } |
230 | 4.60M | } |
231 | | |
232 | | /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an |
233 | | /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more |
234 | | /// tokens available. |
235 | 6.45M | pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> { |
236 | 6.45M | let (str_start, ch) = match self.chars.next() { |
237 | 6.41M | Some(pair) => pair, |
238 | 44.7k | None => return Ok(None), |
239 | | }; |
240 | 6.41M | let start = self.span_offset + u32::try_from(str_start).unwrap(); |
241 | 6.41M | let token = match ch { |
242 | | '\n' | '\t' | ' ' => { |
243 | | // Eat all contiguous whitespace tokens |
244 | 1.86M | while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {} |
245 | 1.26M | Whitespace |
246 | | } |
247 | | '/' => { |
248 | | // Eat a line comment if it's `//...` |
249 | 28.1k | if self.eatc('/') { |
250 | 0 | for (_, ch) in &mut self.chars { |
251 | 0 | if ch == '\n' { |
252 | 0 | break; |
253 | 0 | } |
254 | | } |
255 | 0 | Comment |
256 | | // eat a block comment if it's `/*...` |
257 | 28.1k | } else if self.eatc('*') { |
258 | 0 | let mut depth = 1; |
259 | 0 | while depth > 0 { |
260 | 0 | let (_, ch) = match self.chars.next() { |
261 | 0 | Some(pair) => pair, |
262 | 0 | None => return Err(Error::UnterminatedComment(start)), |
263 | | }; |
264 | 0 | match ch { |
265 | 0 | '/' if self.eatc('*') => depth += 1, |
266 | 0 | '*' if self.eatc('/') => depth -= 1, |
267 | 0 | _ => {} |
268 | | } |
269 | | } |
270 | 0 | Comment |
271 | | } else { |
272 | 28.1k | Slash |
273 | | } |
274 | | } |
275 | 8.83k | '=' => Equals, |
276 | 368k | ',' => Comma, |
277 | 167k | ':' => Colon, |
278 | 234k | '.' => Period, |
279 | 143k | ';' => Semicolon, |
280 | 56.9k | '(' => LeftParen, |
281 | 90.1k | ')' => RightParen, |
282 | 133k | '{' => LeftBrace, |
283 | 240k | '}' => RightBrace, |
284 | 229k | '<' => LessThan, |
285 | 274k | '>' => GreaterThan, |
286 | 0 | '*' => Star, |
287 | 52.2k | '@' => At, |
288 | | '-' => { |
289 | 55.0k | if self.eatc('>') { |
290 | 19.7k | RArrow |
291 | | } else { |
292 | 35.2k | Minus |
293 | | } |
294 | | } |
295 | 69.8k | '+' => Plus, |
296 | | '%' => { |
297 | 629k | let mut iter = self.chars.clone(); |
298 | 629k | if let Some((_, ch)) = iter.next() { |
299 | 629k | if is_keylike_start(ch) { |
300 | 629k | self.chars = iter.clone(); |
301 | 5.51M | while let Some((_, ch)) = iter.next() { |
302 | 5.51M | if !is_keylike_continue(ch) { |
303 | 629k | break; |
304 | 4.88M | } |
305 | 4.88M | self.chars = iter.clone(); |
306 | | } |
307 | 0 | } |
308 | 0 | } |
309 | 629k | ExplicitId |
310 | | } |
311 | 2.36M | ch if is_keylike_start(ch) => { |
312 | 1.99M | let remaining = self.chars.chars.as_str().len(); |
313 | 1.99M | let mut iter = self.chars.clone(); |
314 | 11.3M | while let Some((_, ch)) = iter.next() { |
315 | 11.3M | if !is_keylike_continue(ch) { |
316 | 1.99M | break; |
317 | 9.38M | } |
318 | 9.38M | self.chars = iter.clone(); |
319 | | } |
320 | 1.99M | let str_end = |
321 | 1.99M | str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len()); |
322 | 1.99M | match &self.input[str_start..str_end] { |
323 | 1.99M | "use" => Use, |
324 | 1.86M | "type" => Type, |
325 | 1.84M | "func" => Func, |
326 | 1.80M | "u8" => U8, |
327 | 1.79M | "u16" => U16, |
328 | 1.79M | "u32" => U32, |
329 | 1.78M | "u64" => U64, |
330 | 1.78M | "s8" => S8, |
331 | 1.77M | "s16" => S16, |
332 | 1.77M | "s32" => S32, |
333 | 1.76M | "s64" => S64, |
334 | 1.75M | "f32" => F32, |
335 | 1.74M | "f64" => F64, |
336 | 1.72M | "char" => Char, |
337 | 1.70M | "resource" => Resource, |
338 | 1.69M | "own" => Own, |
339 | 1.69M | "borrow" => Borrow, |
340 | 1.69M | "record" => Record, |
341 | 1.67M | "flags" => Flags, |
342 | 1.67M | "variant" => Variant, |
343 | 1.63M | "enum" => Enum, |
344 | 1.56M | "bool" => Bool, |
345 | 1.35M | "string" => String_, |
346 | 1.35M | "option" => Option_, |
347 | 1.32M | "result" => Result_, |
348 | 1.24M | "future" => Future, |
349 | 1.23M | "stream" => Stream, |
350 | 1.21M | "error-context" => ErrorContext, |
351 | 1.17M | "list" => List, |
352 | 1.06M | "map" => Map, |
353 | 1.06M | "_" => Underscore, |
354 | 1.05M | "as" => As, |
355 | 1.02M | "from" => From_, |
356 | 1.02M | "static" => Static, |
357 | 1.02M | "interface" => Interface, |
358 | 765k | "tuple" => Tuple, |
359 | 697k | "world" => World, |
360 | 608k | "import" => Import, |
361 | 585k | "export" => Export, |
362 | 497k | "package" => Package, |
363 | 461k | "constructor" => Constructor, |
364 | 458k | "include" => Include, |
365 | 458k | "with" => With, |
366 | 458k | "async" => Async, |
367 | 439k | _ => Id, |
368 | | } |
369 | | } |
370 | | |
371 | 372k | ch if ch.is_ascii_digit() => { |
372 | 372k | let mut iter = self.chars.clone(); |
373 | 378k | while let Some((_, ch)) = iter.next() { |
374 | 378k | if !ch.is_ascii_digit() { |
375 | 372k | break; |
376 | 6.05k | } |
377 | 6.05k | self.chars = iter.clone(); |
378 | | } |
379 | | |
380 | 372k | Integer |
381 | | } |
382 | | |
383 | 0 | ch => return Err(Error::Unexpected(start, ch)), |
384 | | }; |
385 | 6.41M | let end = match self.chars.clone().next() { |
386 | 6.38M | Some((i, _)) => i, |
387 | 21.2k | None => self.input.len(), |
388 | | }; |
389 | | |
390 | 6.41M | let end = self.span_offset + u32::try_from(end).unwrap(); |
391 | 6.41M | Ok(Some((Span::new(start, end), token))) |
392 | 6.45M | } |
393 | | |
394 | 1.64M | pub fn eat(&mut self, expected: Token) -> Result<bool, Error> { |
395 | 1.64M | let mut other = self.clone(); |
396 | 1.64M | match other.next()? { |
397 | 1.64M | Some((_span, found)) if expected == found => { |
398 | 810k | *self = other; |
399 | 810k | Ok(true) |
400 | | } |
401 | 838k | Some(_) => Ok(false), |
402 | 575 | None => Ok(false), |
403 | | } |
404 | 1.64M | } |
405 | | |
406 | 1.25M | pub fn expect(&mut self, expected: Token) -> Result<Span, Error> { |
407 | 1.25M | match self.next()? { |
408 | 1.25M | Some((span, found)) => { |
409 | 1.25M | if expected == found { |
410 | 1.25M | Ok(span) |
411 | | } else { |
412 | 0 | Err(Error::Wanted { |
413 | 0 | at: span.start(), |
414 | 0 | expected: expected.describe(), |
415 | 0 | found: found.describe(), |
416 | 0 | }) |
417 | | } |
418 | | } |
419 | 0 | None => Err(Error::Wanted { |
420 | 0 | at: self.span_offset + u32::try_from(self.input.len()).unwrap(), |
421 | 0 | expected: expected.describe(), |
422 | 0 | found: "eof", |
423 | 0 | }), |
424 | | } |
425 | 1.25M | } |
426 | | |
427 | 4.79M | fn eatc(&mut self, ch: char) -> bool { |
428 | 4.79M | let mut iter = self.chars.clone(); |
429 | 4.79M | match iter.next() { |
430 | 4.73M | Some((_, ch2)) if ch == ch2 => { |
431 | 622k | self.chars = iter; |
432 | 622k | true |
433 | | } |
434 | 4.17M | _ => false, |
435 | | } |
436 | 4.79M | } |
437 | | |
438 | 0 | pub fn eof_span(&self) -> Span { |
439 | 0 | let end = self.span_offset + u32::try_from(self.input.len()).unwrap(); |
440 | 0 | Span::new(end, end) |
441 | 0 | } |
442 | | } |
443 | | |
444 | | impl<'a> Iterator for CrlfFold<'a> { |
445 | | type Item = (usize, char); |
446 | | |
447 | 35.5M | fn next(&mut self) -> Option<(usize, char)> { |
448 | 35.5M | self.chars.next().map(|(i, c)| { |
449 | 35.4M | if c == '\r' { |
450 | 0 | let mut attempt = self.chars.clone(); |
451 | 0 | if let Some((_, '\n')) = attempt.next() { |
452 | 0 | self.chars = attempt; |
453 | 0 | return (i, '\n'); |
454 | 0 | } |
455 | 35.4M | } |
456 | 35.4M | (i, c) |
457 | 35.4M | }) |
458 | 35.5M | } |
459 | | } |
460 | | |
461 | 21.7k | fn detect_invalid_input(input: &str) -> Result<(), Error> { |
462 | | // Disallow specific codepoints. |
463 | 12.3M | for (pos, ch) in input.char_indices() { |
464 | 11.8M | match ch { |
465 | 535k | '\n' | '\r' | '\t' => {} |
466 | | |
467 | | // Bidirectional override codepoints can be used to craft source code that |
468 | | // appears to have a different meaning than its actual meaning. See |
469 | | // [CVE-2021-42574] for background and motivation. |
470 | | // |
471 | | // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574 |
472 | | '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}' |
473 | | | '\u{2067}' | '\u{2068}' | '\u{2069}' => { |
474 | 0 | return Err(Error::ForbiddenCodepoint(u32::try_from(pos).unwrap(), ch)); |
475 | | } |
476 | | |
477 | | // Disallow several characters which are deprecated or discouraged in Unicode. |
478 | | // |
479 | | // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs. |
480 | | // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks. |
481 | | // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels. |
482 | | // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see |
483 | | // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged. |
484 | | '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}' |
485 | | | '\u{17b4}' | '\u{17b5}' => { |
486 | 0 | return Err(Error::DeprecatedCodepoint(u32::try_from(pos).unwrap(), ch)); |
487 | | } |
488 | | |
489 | | // Disallow control codes other than the ones explicitly recognized above, |
490 | | // so that viewing a wit file on a terminal doesn't have surprising side |
491 | | // effects or appear to have a different meaning than its actual meaning. |
492 | 11.8M | ch if ch.is_control() => { |
493 | 0 | return Err(Error::ControlCodepoint(u32::try_from(pos).unwrap(), ch)); |
494 | | } |
495 | | |
496 | 11.8M | _ => {} |
497 | | } |
498 | | } |
499 | | |
500 | 21.7k | Ok(()) |
501 | 21.7k | } |
502 | | |
503 | 2.99M | fn is_keylike_start(ch: char) -> bool { |
504 | | // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars, |
505 | | // but we'll diagnose that after we've lexed the full string. |
506 | 2.99M | unicode_ident::is_xid_start(ch) || ch == '_' || ch == '-' |
507 | 2.99M | } |
508 | | |
509 | 16.8M | fn is_keylike_continue(ch: char) -> bool { |
510 | | // Lex any XID continue (which includes `_`) or '-'. |
511 | 16.8M | unicode_ident::is_xid_continue(ch) || ch == '-' |
512 | 16.8M | } |
513 | | |
514 | 530k | pub fn validate_id(start: u32, id: &str) -> Result<(), Error> { |
515 | | // IDs must have at least one part. |
516 | 530k | if id.is_empty() { |
517 | 0 | return Err(Error::IdPartEmpty(start)); |
518 | 530k | } |
519 | | |
520 | | // Ids consist of parts separated by '-'s. |
521 | 530k | for (idx, part) in id.split('-').enumerate() { |
522 | | // Parts must be non-empty and contain either all ASCII lowercase or |
523 | | // all ASCII uppercase. Non-first segment can also start with a digit. |
524 | 530k | let Some(first_char) = part.chars().next() else { |
525 | 0 | return Err(Error::IdPartEmpty(start)); |
526 | | }; |
527 | 530k | if idx == 0 && !first_char.is_ascii_alphabetic() { |
528 | 0 | return Err(Error::InvalidCharInId(start, first_char)); |
529 | 530k | } |
530 | 530k | let mut upper = None; |
531 | 4.54M | for ch in part.chars() { |
532 | 4.54M | if ch.is_ascii_digit() { |
533 | 430k | // Digits are accepted in both uppercase and lowercase segments. |
534 | 4.11M | } else if ch.is_ascii_uppercase() { |
535 | 0 | if upper.is_none() { |
536 | 0 | upper = Some(true); |
537 | 0 | } else if let Some(false) = upper { |
538 | 0 | return Err(Error::InvalidCharInId(start, ch)); |
539 | 0 | } |
540 | 4.11M | } else if ch.is_ascii_lowercase() { |
541 | 4.11M | if upper.is_none() { |
542 | 530k | upper = Some(false); |
543 | 3.58M | } else if let Some(true) = upper { |
544 | 0 | return Err(Error::InvalidCharInId(start, ch)); |
545 | 3.58M | } |
546 | | } else { |
547 | 0 | return Err(Error::InvalidCharInId(start, ch)); |
548 | | } |
549 | | } |
550 | | } |
551 | | |
552 | 530k | Ok(()) |
553 | 530k | } |
554 | | |
555 | | impl Token { |
556 | 0 | pub fn describe(&self) -> &'static str { |
557 | 0 | match self { |
558 | 0 | Whitespace => "whitespace", |
559 | 0 | Comment => "a comment", |
560 | 0 | Equals => "'='", |
561 | 0 | Comma => "','", |
562 | 0 | Colon => "':'", |
563 | 0 | Period => "'.'", |
564 | 0 | Semicolon => "';'", |
565 | 0 | LeftParen => "'('", |
566 | 0 | RightParen => "')'", |
567 | 0 | LeftBrace => "'{'", |
568 | 0 | RightBrace => "'}'", |
569 | 0 | LessThan => "'<'", |
570 | 0 | GreaterThan => "'>'", |
571 | 0 | Use => "keyword `use`", |
572 | 0 | Type => "keyword `type`", |
573 | 0 | Func => "keyword `func`", |
574 | 0 | U8 => "keyword `u8`", |
575 | 0 | U16 => "keyword `u16`", |
576 | 0 | U32 => "keyword `u32`", |
577 | 0 | U64 => "keyword `u64`", |
578 | 0 | S8 => "keyword `s8`", |
579 | 0 | S16 => "keyword `s16`", |
580 | 0 | S32 => "keyword `s32`", |
581 | 0 | S64 => "keyword `s64`", |
582 | 0 | F32 => "keyword `f32`", |
583 | 0 | F64 => "keyword `f64`", |
584 | 0 | Char => "keyword `char`", |
585 | 0 | Own => "keyword `own`", |
586 | 0 | Borrow => "keyword `borrow`", |
587 | 0 | Resource => "keyword `resource`", |
588 | 0 | Record => "keyword `record`", |
589 | 0 | Flags => "keyword `flags`", |
590 | 0 | Variant => "keyword `variant`", |
591 | 0 | Enum => "keyword `enum`", |
592 | 0 | Bool => "keyword `bool`", |
593 | 0 | String_ => "keyword `string`", |
594 | 0 | Option_ => "keyword `option`", |
595 | 0 | Result_ => "keyword `result`", |
596 | 0 | Future => "keyword `future`", |
597 | 0 | Stream => "keyword `stream`", |
598 | 0 | ErrorContext => "keyword `error-context`", |
599 | 0 | List => "keyword `list`", |
600 | 0 | Map => "keyword `map`", |
601 | 0 | Underscore => "keyword `_`", |
602 | 0 | Id => "an identifier", |
603 | 0 | ExplicitId => "an '%' identifier", |
604 | 0 | RArrow => "`->`", |
605 | 0 | Star => "`*`", |
606 | 0 | At => "`@`", |
607 | 0 | Slash => "`/`", |
608 | 0 | Plus => "`+`", |
609 | 0 | Minus => "`-`", |
610 | 0 | As => "keyword `as`", |
611 | 0 | From_ => "keyword `from`", |
612 | 0 | Static => "keyword `static`", |
613 | 0 | Interface => "keyword `interface`", |
614 | 0 | Tuple => "keyword `tuple`", |
615 | 0 | Import => "keyword `import`", |
616 | 0 | Export => "keyword `export`", |
617 | 0 | World => "keyword `world`", |
618 | 0 | Package => "keyword `package`", |
619 | 0 | Constructor => "keyword `constructor`", |
620 | 0 | Integer => "an integer", |
621 | 0 | Include => "keyword `include`", |
622 | 0 | With => "keyword `with`", |
623 | 0 | Async => "keyword `async`", |
624 | | } |
625 | 0 | } |
626 | | } |
627 | | |
// Marks the lexer `Error` as a standard error type; the trait's default
// methods are used unchanged.
impl core::error::Error for Error {}
629 | | |
630 | | impl Error { |
631 | | /// Returns the byte offset in the source map where this error occurred. |
632 | 0 | pub fn position(&self) -> u32 { |
633 | 0 | match self { |
634 | 0 | Error::ControlCodepoint(at, _) |
635 | 0 | | Error::DeprecatedCodepoint(at, _) |
636 | 0 | | Error::ForbiddenCodepoint(at, _) |
637 | 0 | | Error::InvalidCharInId(at, _) |
638 | 0 | | Error::IdPartEmpty(at) |
639 | 0 | | Error::InvalidEscape(at, _) |
640 | 0 | | Error::Unexpected(at, _) |
641 | 0 | | Error::UnterminatedComment(at) => *at, |
642 | 0 | Error::Wanted { at, .. } => *at, |
643 | | } |
644 | 0 | } |
645 | | } |
646 | | |
647 | | impl fmt::Display for Error { |
648 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
649 | 0 | match self { |
650 | 0 | Error::ControlCodepoint(_, ch) => write!(f, "Control code '{}'", ch.escape_unicode()), |
651 | 0 | Error::DeprecatedCodepoint(_, ch) => { |
652 | 0 | write!( |
653 | 0 | f, |
654 | | "Codepoint {:?} is discouraged by Unicode", |
655 | 0 | ch.escape_unicode() |
656 | | ) |
657 | | } |
658 | 0 | Error::ForbiddenCodepoint(_, ch) => { |
659 | 0 | write!( |
660 | 0 | f, |
661 | | "Input contains bidirectional override codepoint {:?}", |
662 | 0 | ch.escape_unicode() |
663 | | ) |
664 | | } |
665 | 0 | Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"), |
666 | 0 | Error::UnterminatedComment(_) => write!(f, "unterminated block comment"), |
667 | | Error::Wanted { |
668 | 0 | expected, found, .. |
669 | 0 | } => write!(f, "expected {expected}, found {found}"), |
670 | 0 | Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"), |
671 | 0 | Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"), |
672 | 0 | Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"), |
673 | | } |
674 | 0 | } |
675 | | } |
676 | | |
// Exercises `validate_id` directly: first identifiers that must be accepted,
// then ones that must be rejected.
#[test]
fn test_validate_id() {
    // Accepted: single-case parts, digits, and parts of differing case.
    validate_id(0, "apple").unwrap();
    validate_id(0, "apple-pear").unwrap();
    validate_id(0, "apple-pear-grape").unwrap();
    validate_id(0, "a0").unwrap();
    validate_id(0, "a").unwrap();
    validate_id(0, "a-a").unwrap();
    validate_id(0, "bool").unwrap();
    validate_id(0, "APPLE").unwrap();
    validate_id(0, "APPLE-PEAR").unwrap();
    validate_id(0, "APPLE-PEAR-GRAPE").unwrap();
    validate_id(0, "apple-PEAR-grape").unwrap();
    validate_id(0, "APPLE-pear-GRAPE").unwrap();
    validate_id(0, "ENOENT").unwrap();
    validate_id(0, "is-XML").unwrap();
    validate_id(0, "apple-0").unwrap();
    validate_id(0, "a0-000-3d4a-54FF").unwrap();

    // Rejected: empty ids and parts, a non-letter first character, mixed case
    // within a part, and any character that is not an ASCII letter or digit.
    assert!(validate_id(0, "").is_err());
    assert!(validate_id(0, "0").is_err());
    assert!(validate_id(0, "%").is_err());
    assert!(validate_id(0, "$").is_err());
    assert!(validate_id(0, "0a").is_err());
    assert!(validate_id(0, ".").is_err());
    assert!(validate_id(0, "·").is_err());
    assert!(validate_id(0, "a a").is_err());
    assert!(validate_id(0, "_").is_err());
    assert!(validate_id(0, "-").is_err());
    assert!(validate_id(0, "a-").is_err());
    assert!(validate_id(0, "-a").is_err());
    assert!(validate_id(0, "Apple").is_err());
    assert!(validate_id(0, "applE").is_err());
    assert!(validate_id(0, "-apple-pear").is_err());
    assert!(validate_id(0, "apple-pear-").is_err());
    assert!(validate_id(0, "apple_pear").is_err());
    assert!(validate_id(0, "apple.pear").is_err());
    assert!(validate_id(0, "apple pear").is_err());
    assert!(validate_id(0, "apple/pear").is_err());
    assert!(validate_id(0, "apple|pear").is_err());
    assert!(validate_id(0, "apple-Pear").is_err());
    assert!(validate_id(0, "()()").is_err());
    assert!(validate_id(0, "").is_err());
    assert!(validate_id(0, "*").is_err());
    assert!(validate_id(0, "apple\u{5f3}pear").is_err());
    assert!(validate_id(0, "apple\u{200c}pear").is_err());
    assert!(validate_id(0, "apple\u{200d}pear").is_err());
    assert!(validate_id(0, "apple--pear").is_err());
    assert!(validate_id(0, "_apple").is_err());
    assert!(validate_id(0, "apple_").is_err());
    assert!(validate_id(0, "_Znwj").is_err());
    assert!(validate_id(0, "__i386").is_err());
    assert!(validate_id(0, "__i386__").is_err());
    assert!(validate_id(0, "Москва").is_err());
    assert!(validate_id(0, "garçon-hühnervögel-Москва-東京").is_err());
    assert!(validate_id(0, "a0-000-3d4A-54Ff").is_err());
    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
736 | | |
// Exercises the tokenizer end to end. Only the token kinds are compared; the
// identifier text is not passed through `validate_id` here, so words such as
// "apple--pear" still lex as a single `Id`.
#[test]
fn test_tokenizer() {
    // Lexes `s` from offset 0 and returns the kinds of all significant tokens.
    fn collect(s: &str) -> Result<Vec<Token>, Error> {
        let mut t = Tokenizer::new(s, 0)?;
        let mut tokens = Vec::new();
        while let Some(token) = t.next()? {
            tokens.push(token.1);
        }
        Ok(tokens)
    }

    // Identifiers, including non-ASCII words and `%`-prefixed ones.
    assert_eq!(collect("").unwrap(), vec![]);
    assert_eq!(collect("_").unwrap(), vec![Token::Underscore]);
    assert_eq!(collect("apple").unwrap(), vec![Token::Id]);
    assert_eq!(collect("apple-pear").unwrap(), vec![Token::Id]);
    assert_eq!(collect("apple--pear").unwrap(), vec![Token::Id]);
    assert_eq!(collect("apple-Pear").unwrap(), vec![Token::Id]);
    assert_eq!(collect("apple-pear-grape").unwrap(), vec![Token::Id]);
    assert_eq!(collect("apple pear").unwrap(), vec![Token::Id, Token::Id]);
    assert_eq!(collect("_a_p_p_l_e_").unwrap(), vec![Token::Id]);
    assert_eq!(collect("garçon").unwrap(), vec![Token::Id]);
    assert_eq!(collect("hühnervögel").unwrap(), vec![Token::Id]);
    assert_eq!(collect("москва").unwrap(), vec![Token::Id]);
    assert_eq!(collect("東京").unwrap(), vec![Token::Id]);
    assert_eq!(
        collect("garçon-hühnervögel-москва-東京").unwrap(),
        vec![Token::Id]
    );
    assert_eq!(collect("a0").unwrap(), vec![Token::Id]);
    assert_eq!(collect("a").unwrap(), vec![Token::Id]);
    assert_eq!(collect("%a").unwrap(), vec![Token::ExplicitId]);
    assert_eq!(collect("%a-a").unwrap(), vec![Token::ExplicitId]);
    assert_eq!(collect("%bool").unwrap(), vec![Token::ExplicitId]);
    assert_eq!(collect("%").unwrap(), vec![Token::ExplicitId]);
    assert_eq!(collect("APPLE").unwrap(), vec![Token::Id]);
    assert_eq!(collect("APPLE-PEAR").unwrap(), vec![Token::Id]);
    assert_eq!(collect("APPLE-PEAR-GRAPE").unwrap(), vec![Token::Id]);
    assert_eq!(collect("apple-PEAR-grape").unwrap(), vec![Token::Id]);
    assert_eq!(collect("APPLE-pear-GRAPE").unwrap(), vec![Token::Id]);
    assert_eq!(collect("ENOENT").unwrap(), vec![Token::Id]);
    assert_eq!(collect("is-XML").unwrap(), vec![Token::Id]);

    // Keywords, alone and mixed with punctuation.
    assert_eq!(collect("func").unwrap(), vec![Token::Func]);
    assert_eq!(
        collect("a: func()").unwrap(),
        vec![
            Token::Id,
            Token::Colon,
            Token::Func,
            Token::LeftParen,
            Token::RightParen
        ]
    );

    assert_eq!(collect("resource").unwrap(), vec![Token::Resource]);

    assert_eq!(collect("own").unwrap(), vec![Token::Own]);
    assert_eq!(
        collect("own<some-id>").unwrap(),
        vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan]
    );

    assert_eq!(collect("borrow").unwrap(), vec![Token::Borrow]);
    assert_eq!(
        collect("borrow<some-id>").unwrap(),
        vec![
            Token::Borrow,
            Token::LessThan,
            Token::Id,
            Token::GreaterThan
        ]
    );

    // Input containing a rejected codepoint fails before any token is lexed.
    assert!(collect("\u{149}").is_err(), "strongly discouraged");
    assert!(collect("\u{673}").is_err(), "strongly discouraged");
    assert!(collect("\u{17a3}").is_err(), "strongly discouraged");
    assert!(collect("\u{17a4}").is_err(), "strongly discouraged");
    assert!(collect("\u{202a}").is_err(), "bidirectional override");
    assert!(collect("\u{2068}").is_err(), "bidirectional override");
    assert!(collect("\u{0}").is_err(), "control code");
    assert!(collect("\u{b}").is_err(), "control code");
    assert!(collect("\u{c}").is_err(), "control code");
    assert!(collect("\u{85}").is_err(), "control code");
}