Coverage Report

Created: 2026-02-23 07:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/wasm-tools/crates/wit-parser/src/ast/lex.rs
Line
Count
Source
1
#[cfg(test)]
2
use alloc::{vec, vec::Vec};
3
use anyhow::{Result, bail};
4
use core::char;
5
use core::fmt;
6
use core::str;
7
use unicode_xid::UnicodeXID;
8
9
use self::Token::*;
10
11
/// Lexer state over a single source string.
///
/// Cloning is how lookahead is done in this file: `eat` and `eatc` advance a
/// clone and only write it back when the lookahead matched.
#[derive(Clone)]
12
pub struct Tokenizer<'a> {
13
    /// The complete text being lexed; `get_span` slices into it.
    input: &'a str,
14
    /// Added to every byte index when a `Span` is built, and subtracted again
    /// by `get_span` before indexing `input`.
    span_offset: u32,
15
    /// Cursor over the characters of `input`, with `\r\n` folded into `\n`.
    chars: CrlfFold<'a>,
16
}
17
18
/// Character cursor whose `Iterator` impl reports a `\r\n` pair as a single
/// `'\n'` located at the byte index of the `\r`.
#[derive(Clone)]
19
struct CrlfFold<'a> {
20
    /// Underlying `(byte index, char)` iterator over the source text.
    chars: str::CharIndices<'a>,
21
}
22
23
/// A span, designating a range of bytes where a token is located.
///
/// Uses `u32::MAX` as a sentinel value to represent unknown spans (e.g.,
/// decoded from binary). A span is considered unknown as soon as either
/// endpoint holds the sentinel.
#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)]
pub struct Span {
    start: u32,
    end: u32,
}

impl Default for Span {
    /// Returns the unknown span: both endpoints are the `u32::MAX` sentinel.
    fn default() -> Span {
        Span {
            start: u32::MAX,
            end: u32::MAX,
        }
    }
}

impl Span {
    /// Creates a span covering `start..end`.
    ///
    /// # Panics
    ///
    /// Panics if either endpoint is `u32::MAX`, which is reserved for unknown
    /// spans.
    pub fn new(start: u32, end: u32) -> Span {
        let span = Span { start, end };
        assert!(span.is_known(), "cannot create a span with u32::MAX");
        span
    }

    /// Adjusts this span by adding the given byte offset to both start and end.
    ///
    /// Unknown spans are left untouched.
    ///
    /// # Panics
    ///
    /// Panics if the shifted span does not fit in a `u32`, or if an endpoint
    /// would land on the `u32::MAX` sentinel. Plain `+=` would wrap silently
    /// in builds without overflow checks and produce a bogus (or accidentally
    /// "unknown") span.
    pub fn adjust(&mut self, offset: u32) {
        if !self.is_known() {
            return;
        }
        let shift = |pos: u32| {
            pos.checked_add(offset)
                .expect("span offset overflowed u32 while adjusting")
        };
        // Going through `new` re-checks the sentinel invariant on the result.
        *self = Span::new(shift(self.start), shift(self.end));
    }

    /// Returns the start offset, panicking if this is an unknown span.
    pub fn start(&self) -> u32 {
        assert!(self.is_known(), "cannot get start of unknown span");
        self.start
    }

    /// Returns the end offset, panicking if this is an unknown span.
    pub fn end(&self) -> u32 {
        assert!(self.is_known(), "cannot get end of unknown span");
        self.end
    }

    /// Sets the end offset. If this is unknown, converts to a zero-width span at that position.
    pub fn set_end(&mut self, new_end: u32) {
        if !self.is_known() {
            self.start = new_end;
        }
        self.end = new_end;
    }

    /// Sets the start offset. If this is unknown, converts to a zero-width span at that position.
    pub fn set_start(&mut self, new_start: u32) {
        if !self.is_known() {
            self.end = new_start;
        }
        self.start = new_start;
    }

    /// Returns true if this span has a known source location.
    pub fn is_known(&self) -> bool {
        self.start != u32::MAX && self.end != u32::MAX
    }
}
90
91
/// The kind of a lexed token. The token's text is not stored here; it is
/// recovered from the accompanying `Span` via `Tokenizer::get_span`.
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
92
pub enum Token {
93
    // Trivia: returned by `Tokenizer::next_raw`, skipped by `Tokenizer::next`.
    Whitespace,
94
    Comment,
95
96
    // Punctuation.
    Equals,
97
    Comma,
98
    Colon,
99
    Period,
100
    Semicolon,
101
    LeftParen,
102
    RightParen,
103
    LeftBrace,
104
    RightBrace,
105
    LessThan,
106
    GreaterThan,
107
    RArrow,
108
    Star,
109
    At,
110
    Slash,
111
    Plus,
112
    Minus,
113
114
    // Keywords: each is matched against a fixed spelling in `next_raw`.
    Use,
115
    Type,
116
    Func,
117
    U8,
118
    U16,
119
    U32,
120
    U64,
121
    S8,
122
    S16,
123
    S32,
124
    S64,
125
    F32,
126
    F64,
127
    Char,
128
    Record,
129
    Resource,
130
    Own,
131
    Borrow,
132
    Flags,
133
    Variant,
134
    Enum,
135
    Bool,
136
    String_,
137
    Option_,
138
    Result_,
139
    Future,
140
    Stream,
141
    ErrorContext,
142
    List,
143
    Map,
144
    Underscore,
145
    As,
146
    From_,
147
    Static,
148
    Interface,
149
    Tuple,
150
    Import,
151
    Export,
152
    World,
153
    Package,
154
    Constructor,
155
    Async,
156
157
    // `Id` is any keyword-like word that is not one of the keywords above;
    // `ExplicitId` is a `%`-prefixed word (a bare `%` also lexes as one).
    Id,
158
    ExplicitId,
159
160
    // A run of ASCII digits.
    Integer,
161
162
    // Two more keywords.
    Include,
163
    With,
164
}
165
166
/// Errors produced while lexing or while validating identifiers.
///
/// The leading `u32` of each tuple variant (and `Wanted::at`) is the source
/// position the error is reported at; `Display` does not print it.
#[derive(Eq, PartialEq, Debug)]
167
#[allow(dead_code)]
168
pub enum Error {
169
    /// `validate_id` met a character that is not allowed where it appears.
    InvalidCharInId(u32, char),
170
    /// `validate_id` found an empty identifier or an empty `-`-separated part.
    IdPartEmpty(u32),
171
    /// Not constructed anywhere in this file.
    InvalidEscape(u32, char),
172
    /// `next_raw` met a character that cannot start any token.
    Unexpected(u32, char),
173
    /// A `/* ...` block comment was still open at end of input.
    UnterminatedComment(u32),
174
    /// `Tokenizer::expect` saw a different token (or end of input) than requested.
    Wanted {
175
        at: u32,
176
        /// `Token::describe` text of the requested token.
        expected: &'static str,
177
        /// `Token::describe` text of the token found, or `"eof"`.
        found: &'static str,
178
    },
179
}
180
181
impl<'a> Tokenizer<'a> {
182
22.0k
    pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>> {
183
22.0k
        detect_invalid_input(input)?;
184
185
22.0k
        let mut t = Tokenizer {
186
22.0k
            input,
187
22.0k
            span_offset,
188
22.0k
            chars: CrlfFold {
189
22.0k
                chars: input.char_indices(),
190
22.0k
            },
191
22.0k
        };
192
        // Eat utf-8 BOM
193
22.0k
        t.eatc('\u{feff}');
194
22.0k
        Ok(t)
195
22.0k
    }
196
197
43.3k
    pub fn expect_semicolon(&mut self) -> Result<()> {
198
43.3k
        self.expect(Token::Semicolon)?;
199
43.3k
        Ok(())
200
43.3k
    }
201
202
566k
    pub fn get_span(&self, span: Span) -> &'a str {
203
566k
        let start = usize::try_from(span.start() - self.span_offset).unwrap();
204
566k
        let end = usize::try_from(span.end() - self.span_offset).unwrap();
205
566k
        &self.input[start..end]
206
566k
    }
207
208
170k
    pub fn parse_id(&self, span: Span) -> Result<&'a str> {
209
170k
        let ret = self.get_span(span);
210
170k
        validate_id(span.start(), &ret)?;
211
170k
        Ok(ret)
212
170k
    }
213
214
263k
    pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
215
263k
        let token = self.get_span(span);
216
263k
        let id_part = token.strip_prefix('%').unwrap();
217
263k
        validate_id(span.start(), id_part)?;
218
263k
        Ok(id_part)
219
263k
    }
220
221
3.75M
    pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
222
        loop {
223
4.36M
            match self.next_raw()? {
224
616k
                Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
225
3.75M
                other => break Ok(other),
226
            }
227
        }
228
3.75M
    }
229
230
    /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an
231
    /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more
232
    /// tokens available.
233
5.42M
    pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
234
5.42M
        let (str_start, ch) = match self.chars.next() {
235
5.37M
            Some(pair) => pair,
236
45.5k
            None => return Ok(None),
237
        };
238
5.37M
        let start = self.span_offset + u32::try_from(str_start).unwrap();
239
5.37M
        let token = match ch {
240
            '\n' | '\t' | ' ' => {
241
                // Eat all contiguous whitespace tokens
242
1.74M
                while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
243
1.10M
                Whitespace
244
            }
245
            '/' => {
246
                // Eat a line comment if it's `//...`
247
10.9k
                if self.eatc('/') {
248
0
                    for (_, ch) in &mut self.chars {
249
0
                        if ch == '\n' {
250
0
                            break;
251
0
                        }
252
                    }
253
0
                    Comment
254
                // eat a block comment if it's `/*...`
255
10.9k
                } else if self.eatc('*') {
256
0
                    let mut depth = 1;
257
0
                    while depth > 0 {
258
0
                        let (_, ch) = match self.chars.next() {
259
0
                            Some(pair) => pair,
260
0
                            None => return Err(Error::UnterminatedComment(start)),
261
                        };
262
0
                        match ch {
263
0
                            '/' if self.eatc('*') => depth += 1,
264
0
                            '*' if self.eatc('/') => depth -= 1,
265
0
                            _ => {}
266
                        }
267
                    }
268
0
                    Comment
269
                } else {
270
10.9k
                    Slash
271
                }
272
            }
273
6.98k
            '=' => Equals,
274
361k
            ',' => Comma,
275
116k
            ':' => Colon,
276
138k
            '.' => Period,
277
89.4k
            ';' => Semicolon,
278
48.6k
            '(' => LeftParen,
279
77.8k
            ')' => RightParen,
280
126k
            '{' => LeftBrace,
281
238k
            '}' => RightBrace,
282
202k
            '<' => LessThan,
283
234k
            '>' => GreaterThan,
284
0
            '*' => Star,
285
35.0k
            '@' => At,
286
            '-' => {
287
37.2k
                if self.eatc('>') {
288
16.4k
                    RArrow
289
                } else {
290
20.7k
                    Minus
291
                }
292
            }
293
39.5k
            '+' => Plus,
294
            '%' => {
295
529k
                let mut iter = self.chars.clone();
296
529k
                if let Some((_, ch)) = iter.next() {
297
529k
                    if is_keylike_start(ch) {
298
529k
                        self.chars = iter.clone();
299
4.28M
                        while let Some((_, ch)) = iter.next() {
300
4.28M
                            if !is_keylike_continue(ch) {
301
529k
                                break;
302
3.75M
                            }
303
3.75M
                            self.chars = iter.clone();
304
                        }
305
0
                    }
306
0
                }
307
529k
                ExplicitId
308
            }
309
1.97M
            ch if is_keylike_start(ch) => {
310
1.71M
                let remaining = self.chars.chars.as_str().len();
311
1.71M
                let mut iter = self.chars.clone();
312
9.80M
                while let Some((_, ch)) = iter.next() {
313
9.80M
                    if !is_keylike_continue(ch) {
314
1.71M
                        break;
315
8.08M
                    }
316
8.08M
                    self.chars = iter.clone();
317
                }
318
1.71M
                let str_end =
319
1.71M
                    str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
320
1.71M
                match &self.input[str_start..str_end] {
321
1.71M
                    "use" => Use,
322
1.63M
                    "type" => Type,
323
1.62M
                    "func" => Func,
324
1.59M
                    "u8" => U8,
325
1.57M
                    "u16" => U16,
326
1.57M
                    "u32" => U32,
327
1.57M
                    "u64" => U64,
328
1.56M
                    "s8" => S8,
329
1.56M
                    "s16" => S16,
330
1.55M
                    "s32" => S32,
331
1.55M
                    "s64" => S64,
332
1.54M
                    "f32" => F32,
333
1.53M
                    "f64" => F64,
334
1.52M
                    "char" => Char,
335
1.50M
                    "resource" => Resource,
336
1.49M
                    "own" => Own,
337
1.49M
                    "borrow" => Borrow,
338
1.49M
                    "record" => Record,
339
1.47M
                    "flags" => Flags,
340
1.47M
                    "variant" => Variant,
341
1.43M
                    "enum" => Enum,
342
1.33M
                    "bool" => Bool,
343
1.15M
                    "string" => String_,
344
1.15M
                    "option" => Option_,
345
1.12M
                    "result" => Result_,
346
1.07M
                    "future" => Future,
347
1.06M
                    "stream" => Stream,
348
1.05M
                    "error-context" => ErrorContext,
349
1.02M
                    "list" => List,
350
909k
                    "map" => Map,
351
909k
                    "_" => Underscore,
352
907k
                    "as" => As,
353
888k
                    "from" => From_,
354
888k
                    "static" => Static,
355
886k
                    "interface" => Interface,
356
616k
                    "tuple" => Tuple,
357
564k
                    "world" => World,
358
476k
                    "import" => Import,
359
454k
                    "export" => Export,
360
441k
                    "package" => Package,
361
406k
                    "constructor" => Constructor,
362
403k
                    "include" => Include,
363
403k
                    "with" => With,
364
403k
                    "async" => Async,
365
386k
                    _ => Id,
366
                }
367
            }
368
369
263k
            ch if ch.is_ascii_digit() => {
370
263k
                let mut iter = self.chars.clone();
371
268k
                while let Some((_, ch)) = iter.next() {
372
268k
                    if !ch.is_ascii_digit() {
373
263k
                        break;
374
5.29k
                    }
375
5.29k
                    self.chars = iter.clone();
376
                }
377
378
263k
                Integer
379
            }
380
381
0
            ch => return Err(Error::Unexpected(start, ch)),
382
        };
383
5.37M
        let end = match self.chars.clone().next() {
384
5.35M
            Some((i, _)) => i,
385
21.3k
            None => self.input.len(),
386
        };
387
388
5.37M
        let end = self.span_offset + u32::try_from(end).unwrap();
389
5.37M
        Ok(Some((Span::new(start, end), token)))
390
5.42M
    }
391
392
1.35M
    pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
393
1.35M
        let mut other = self.clone();
394
1.35M
        match other.next()? {
395
1.35M
            Some((_span, found)) if expected == found => {
396
699k
                *self = other;
397
699k
                Ok(true)
398
            }
399
659k
            Some(_) => Ok(false),
400
713
            None => Ok(false),
401
        }
402
1.35M
    }
403
404
1.02M
    pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
405
1.02M
        match self.next()? {
406
1.02M
            Some((span, found)) => {
407
1.02M
                if expected == found {
408
1.02M
                    Ok(span)
409
                } else {
410
0
                    Err(Error::Wanted {
411
0
                        at: span.start(),
412
0
                        expected: expected.describe(),
413
0
                        found: found.describe(),
414
0
                    })
415
                }
416
            }
417
0
            None => Err(Error::Wanted {
418
0
                at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
419
0
                expected: expected.describe(),
420
0
                found: "eof",
421
0
            }),
422
        }
423
1.02M
    }
424
425
4.29M
    fn eatc(&mut self, ch: char) -> bool {
426
4.29M
        let mut iter = self.chars.clone();
427
4.29M
        match iter.next() {
428
4.22M
            Some((_, ch2)) if ch == ch2 => {
429
651k
                self.chars = iter;
430
651k
                true
431
            }
432
3.63M
            _ => false,
433
        }
434
4.29M
    }
435
436
0
    pub fn eof_span(&self) -> Span {
437
0
        let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
438
0
        Span::new(end, end)
439
0
    }
440
}
441
442
impl<'a> Iterator for CrlfFold<'a> {
443
    type Item = (usize, char);
444
445
29.9M
    fn next(&mut self) -> Option<(usize, char)> {
446
29.9M
        self.chars.next().map(|(i, c)| {
447
29.8M
            if c == '\r' {
448
0
                let mut attempt = self.chars.clone();
449
0
                if let Some((_, '\n')) = attempt.next() {
450
0
                    self.chars = attempt;
451
0
                    return (i, '\n');
452
0
                }
453
29.8M
            }
454
29.8M
            (i, c)
455
29.8M
        })
456
29.9M
    }
457
}
458
459
22.0k
fn detect_invalid_input(input: &str) -> Result<()> {
460
    // Disallow specific codepoints.
461
22.0k
    let mut line = 1;
462
10.1M
    for ch in input.chars() {
463
9.60M
        match ch {
464
534k
            '\n' => line += 1,
465
0
            '\r' | '\t' => {}
466
467
            // Bidirectional override codepoints can be used to craft source code that
468
            // appears to have a different meaning than its actual meaning. See
469
            // [CVE-2021-42574] for background and motivation.
470
            //
471
            // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
472
            '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
473
            | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
474
0
                bail!(
475
                    "Input contains bidirectional override codepoint {:?} at line {}",
476
0
                    ch.escape_unicode(),
477
                    line
478
                );
479
            }
480
481
            // Disallow several characters which are deprecated or discouraged in Unicode.
482
            //
483
            // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs.
484
            // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks.
485
            // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels.
486
            // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see
487
            // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
488
            '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
489
            | '\u{17b4}' | '\u{17b5}' => {
490
0
                bail!(
491
                    "Codepoint {:?} at line {} is discouraged by Unicode",
492
0
                    ch.escape_unicode(),
493
                    line
494
                );
495
            }
496
497
            // Disallow control codes other than the ones explicitly recognized above,
498
            // so that viewing a wit file on a terminal doesn't have surprising side
499
            // effects or appear to have a different meaning than its actual meaning.
500
9.60M
            ch if ch.is_control() => {
501
0
                bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
502
            }
503
504
9.60M
            _ => {}
505
        }
506
    }
507
508
22.0k
    Ok(())
509
22.0k
}
510
511
2.50M
fn is_keylike_start(ch: char) -> bool {
512
    // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars,
513
    // but we'll diagnose that after we've lexed the full string.
514
2.50M
    UnicodeXID::is_xid_start(ch) || ch == '_' || ch == '-'
515
2.50M
}
516
517
14.0M
fn is_keylike_continue(ch: char) -> bool {
518
    // Lex any XID continue (which includes `_`) or '-'.
519
14.0M
    UnicodeXID::is_xid_continue(ch) || ch == '-'
520
14.0M
}
521
522
434k
pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
523
    // IDs must have at least one part.
524
434k
    if id.is_empty() {
525
0
        return Err(Error::IdPartEmpty(start));
526
434k
    }
527
528
    // Ids consist of parts separated by '-'s.
529
434k
    for (idx, part) in id.split('-').enumerate() {
530
        // Parts must be non-empty and contain either all ASCII lowercase or
531
        // all ASCII uppercase. Non-first segment can also start with a digit.
532
434k
        let Some(first_char) = part.chars().next() else {
533
0
            return Err(Error::IdPartEmpty(start));
534
        };
535
434k
        if idx == 0 && !first_char.is_ascii_alphabetic() {
536
0
            return Err(Error::InvalidCharInId(start, first_char));
537
434k
        }
538
434k
        let mut upper = None;
539
3.33M
        for ch in part.chars() {
540
3.33M
            if ch.is_ascii_digit() {
541
401k
                // Digits are accepted in both uppercase and lowercase segments.
542
2.93M
            } else if ch.is_ascii_uppercase() {
543
0
                if upper.is_none() {
544
0
                    upper = Some(true);
545
0
                } else if let Some(false) = upper {
546
0
                    return Err(Error::InvalidCharInId(start, ch));
547
0
                }
548
2.93M
            } else if ch.is_ascii_lowercase() {
549
2.93M
                if upper.is_none() {
550
434k
                    upper = Some(false);
551
2.50M
                } else if let Some(true) = upper {
552
0
                    return Err(Error::InvalidCharInId(start, ch));
553
2.50M
                }
554
            } else {
555
0
                return Err(Error::InvalidCharInId(start, ch));
556
            }
557
        }
558
    }
559
560
434k
    Ok(())
561
434k
}
562
563
impl Token {
564
0
    pub fn describe(&self) -> &'static str {
565
0
        match self {
566
0
            Whitespace => "whitespace",
567
0
            Comment => "a comment",
568
0
            Equals => "'='",
569
0
            Comma => "','",
570
0
            Colon => "':'",
571
0
            Period => "'.'",
572
0
            Semicolon => "';'",
573
0
            LeftParen => "'('",
574
0
            RightParen => "')'",
575
0
            LeftBrace => "'{'",
576
0
            RightBrace => "'}'",
577
0
            LessThan => "'<'",
578
0
            GreaterThan => "'>'",
579
0
            Use => "keyword `use`",
580
0
            Type => "keyword `type`",
581
0
            Func => "keyword `func`",
582
0
            U8 => "keyword `u8`",
583
0
            U16 => "keyword `u16`",
584
0
            U32 => "keyword `u32`",
585
0
            U64 => "keyword `u64`",
586
0
            S8 => "keyword `s8`",
587
0
            S16 => "keyword `s16`",
588
0
            S32 => "keyword `s32`",
589
0
            S64 => "keyword `s64`",
590
0
            F32 => "keyword `f32`",
591
0
            F64 => "keyword `f64`",
592
0
            Char => "keyword `char`",
593
0
            Own => "keyword `own`",
594
0
            Borrow => "keyword `borrow`",
595
0
            Resource => "keyword `resource`",
596
0
            Record => "keyword `record`",
597
0
            Flags => "keyword `flags`",
598
0
            Variant => "keyword `variant`",
599
0
            Enum => "keyword `enum`",
600
0
            Bool => "keyword `bool`",
601
0
            String_ => "keyword `string`",
602
0
            Option_ => "keyword `option`",
603
0
            Result_ => "keyword `result`",
604
0
            Future => "keyword `future`",
605
0
            Stream => "keyword `stream`",
606
0
            ErrorContext => "keyword `error-context`",
607
0
            List => "keyword `list`",
608
0
            Map => "keyword `map`",
609
0
            Underscore => "keyword `_`",
610
0
            Id => "an identifier",
611
0
            ExplicitId => "an '%' identifier",
612
0
            RArrow => "`->`",
613
0
            Star => "`*`",
614
0
            At => "`@`",
615
0
            Slash => "`/`",
616
0
            Plus => "`+`",
617
0
            Minus => "`-`",
618
0
            As => "keyword `as`",
619
0
            From_ => "keyword `from`",
620
0
            Static => "keyword `static`",
621
0
            Interface => "keyword `interface`",
622
0
            Tuple => "keyword `tuple`",
623
0
            Import => "keyword `import`",
624
0
            Export => "keyword `export`",
625
0
            World => "keyword `world`",
626
0
            Package => "keyword `package`",
627
0
            Constructor => "keyword `constructor`",
628
0
            Integer => "an integer",
629
0
            Include => "keyword `include`",
630
0
            With => "keyword `with`",
631
0
            Async => "keyword `async`",
632
        }
633
0
    }
634
}
635
636
// Marker impl with no overridden methods; it lets `?` turn this `Error` into
// `anyhow::Error` in `Tokenizer::expect_semicolon` and `Tokenizer::parse_id`.
impl core::error::Error for Error {}
637
638
impl fmt::Display for Error {
639
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
640
0
        match self {
641
0
            Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
642
0
            Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
643
            Error::Wanted {
644
0
                expected, found, ..
645
0
            } => write!(f, "expected {expected}, found {found}"),
646
0
            Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
647
0
            Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
648
0
            Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
649
        }
650
0
    }
651
}
652
653
/// Table-driven check of which identifiers `validate_id` accepts and rejects.
#[test]
fn test_validate_id() {
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
        "apple-0",
        "a0-000-3d4a-54FF",
    ];
    for id in accepted {
        validate_id(0, id).unwrap();
    }

    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
        "a0-000-3d4A-54Ff",
    ];
    for id in rejected {
        assert!(validate_id(0, id).is_err());
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
712
713
/// Table-driven check of the token kinds produced for small inputs, and of
/// the inputs that `Tokenizer::new` rejects outright.
#[test]
fn test_tokenizer() {
    /// Lexes `s` completely, returning only the token kinds.
    fn collect(s: &str) -> Result<Vec<Token>> {
        let mut lexer = Tokenizer::new(s, 0)?;
        let mut kinds = Vec::new();
        while let Some((_, kind)) = lexer.next()? {
            kinds.push(kind);
        }
        Ok(kinds)
    }

    assert_eq!(collect("").unwrap(), vec![]);

    // Inputs that lex to exactly one token.
    let single = [
        ("_", Token::Underscore),
        ("apple", Token::Id),
        ("apple-pear", Token::Id),
        ("apple--pear", Token::Id),
        ("apple-Pear", Token::Id),
        ("apple-pear-grape", Token::Id),
        ("_a_p_p_l_e_", Token::Id),
        ("garçon", Token::Id),
        ("hühnervögel", Token::Id),
        ("москва", Token::Id),
        ("東京", Token::Id),
        ("garçon-hühnervögel-москва-東京", Token::Id),
        ("a0", Token::Id),
        ("a", Token::Id),
        ("%a", Token::ExplicitId),
        ("%a-a", Token::ExplicitId),
        ("%bool", Token::ExplicitId),
        ("%", Token::ExplicitId),
        ("APPLE", Token::Id),
        ("APPLE-PEAR", Token::Id),
        ("APPLE-PEAR-GRAPE", Token::Id),
        ("apple-PEAR-grape", Token::Id),
        ("APPLE-pear-GRAPE", Token::Id),
        ("ENOENT", Token::Id),
        ("is-XML", Token::Id),
        ("func", Token::Func),
        ("resource", Token::Resource),
        ("own", Token::Own),
        ("borrow", Token::Borrow),
    ];
    for (src, kind) in single {
        assert_eq!(collect(src).unwrap(), vec![kind]);
    }

    // Inputs that lex to several tokens.
    assert_eq!(collect("apple pear").unwrap(), vec![Token::Id, Token::Id]);
    assert_eq!(
        collect("a: func()").unwrap(),
        vec![
            Token::Id,
            Token::Colon,
            Token::Func,
            Token::LeftParen,
            Token::RightParen
        ]
    );
    assert_eq!(
        collect("own<some-id>").unwrap(),
        vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan]
    );
    assert_eq!(
        collect("borrow<some-id>").unwrap(),
        vec![
            Token::Borrow,
            Token::LessThan,
            Token::Id,
            Token::GreaterThan
        ]
    );

    // Inputs rejected before any token is produced.
    let rejected = [
        ("\u{149}", "strongly discouraged"),
        ("\u{673}", "strongly discouraged"),
        ("\u{17a3}", "strongly discouraged"),
        ("\u{17a4}", "strongly discouraged"),
        ("\u{202a}", "bidirectional override"),
        ("\u{2068}", "bidirectional override"),
        ("\u{0}", "control code"),
        ("\u{b}", "control code"),
        ("\u{c}", "control code"),
        ("\u{85}", "control code"),
    ];
    for (src, why) in rejected {
        assert!(collect(src).is_err(), "{}", why);
    }
}