Coverage Report

Created: 2025-10-12 07:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/wasm-tools/crates/wit-parser/src/ast/lex.rs
Line
Count
Source
1
use anyhow::{Result, bail};
2
use std::char;
3
use std::fmt;
4
use std::str;
5
use unicode_xid::UnicodeXID;
6
7
use self::Token::*;
8
9
/// A lexer over a single WIT source string.
///
/// Cloning is used by `eat` to look ahead one token without committing to it.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete text being tokenized.
    input: &'a str,
    /// Value added to every byte index into `input` when a `Span` is built.
    span_offset: u32,
    /// Cursor over the characters of `input` that have not been consumed yet.
    chars: CrlfFold<'a>,
    /// When `true`, `float32`/`float64` lex as plain identifiers instead of
    /// as aliases for the `f32`/`f64` keywords.
    require_f32_f64: bool,
}
16
17
/// Character cursor whose `Iterator` implementation reports a `\r\n` pair as
/// a single `\n` (carrying the byte index of the `\r`).
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte index, char)` iterator over the source text.
    chars: str::CharIndices<'a>,
}
21
22
/// A half-open byte range identifying where a token sits in the source.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Span {
    /// Offset of the first byte of the range.
    pub start: u32,
    /// Offset one past the last byte of the range (exclusive bound).
    pub end: u32,
}
30
31
/// Every kind of token the lexer can produce.
///
/// A token records only its kind; the matching text is recovered from the
/// `Span` that accompanies it.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Token {
    // Trivia: returned by `Tokenizer::next_raw`, skipped by `Tokenizer::next`.
    Whitespace,
    Comment,

    // Punctuation and operators.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    ErrorContext,
    List,
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    // Identifiers: a bare word, and a `%`-prefixed word.
    Id,
    ExplicitId,

    // A run of ASCII digits.
    Integer,

    // Further keywords.
    Include,
    With,
}
104
105
/// Failures reported by the lexer and by identifier validation.
///
/// The leading `u32` of each tuple variant (and `at` in `Wanted`) is the
/// source offset the error is attributed to.
// NOTE(review): `dead_code` is presumably allowed because `InvalidEscape` is
// never constructed in this file — confirm before removing the attribute.
#[allow(dead_code)]
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// An identifier contains a character that is not permitted there.
    InvalidCharInId(u32, char),
    /// An identifier is empty or has an empty part between `-` separators.
    IdPartEmpty(u32),
    /// An invalid escape was found in a string.
    InvalidEscape(u32, char),
    /// A character that cannot start any token.
    Unexpected(u32, char),
    /// A `/*` block comment was still open at end of input.
    UnterminatedComment(u32),
    /// A specific token was required but something else was found.
    Wanted {
        /// Offset of the token that was found instead.
        at: u32,
        /// Description of the token that was required.
        expected: &'static str,
        /// Description of what was actually found.
        found: &'static str,
    },
}
119
120
// NB: keep in sync with `crates/wit-component/src/printing.rs`.
/// Fallback for the `require_f32_f64` lexer flag, used by `Tokenizer::new`
/// when the caller passes `None` and the `WIT_REQUIRE_F32_F64` environment
/// variable cannot be read.
const REQUIRE_F32_F64_BY_DEFAULT: bool = true;
122
123
impl<'a> Tokenizer<'a> {
124
20.5k
    pub fn new(
125
20.5k
        input: &'a str,
126
20.5k
        span_offset: u32,
127
20.5k
        require_f32_f64: Option<bool>,
128
20.5k
    ) -> Result<Tokenizer<'a>> {
129
20.5k
        detect_invalid_input(input)?;
130
131
20.5k
        let mut t = Tokenizer {
132
20.5k
            input,
133
20.5k
            span_offset,
134
20.5k
            chars: CrlfFold {
135
20.5k
                chars: input.char_indices(),
136
20.5k
            },
137
20.5k
            require_f32_f64: require_f32_f64.unwrap_or_else(|| {
138
20.5k
                match std::env::var("WIT_REQUIRE_F32_F64") {
139
0
                    Ok(s) => s == "1",
140
20.5k
                    Err(_) => REQUIRE_F32_F64_BY_DEFAULT,
141
                }
142
20.5k
            }),
143
        };
144
        // Eat utf-8 BOM
145
20.5k
        t.eatc('\u{feff}');
146
20.5k
        Ok(t)
147
20.5k
    }
148
149
38.1k
    /// Consumes the next significant token, which must be a `;`.
    ///
    /// # Errors
    ///
    /// Propagates the lexer error from `expect` when the next token is not a
    /// semicolon or lexing fails.
    pub fn expect_semicolon(&mut self) -> Result<()> {
        match self.expect(Token::Semicolon) {
            Ok(_span) => Ok(()),
            Err(err) => Err(err.into()),
        }
    }
153
154
536k
    /// Returns the slice of the input text that `span` covers.
    ///
    /// # Panics
    ///
    /// Panics if `span` lies before this tokenizer's `span_offset` (in builds
    /// with overflow checks) or if the resulting range is not a valid slice
    /// of the input.
    pub fn get_span(&self, span: Span) -> &'a str {
        // Translate an absolute span position back into an index into `input`.
        let to_index = |pos: u32| usize::try_from(pos - self.span_offset).unwrap();
        let first = to_index(span.start);
        let past_last = to_index(span.end);
        &self.input[first..past_last]
    }
159
160
172k
    /// Returns the text of the identifier at `span` after validating it.
    ///
    /// # Errors
    ///
    /// Fails when `validate_id` rejects the text.
    pub fn parse_id(&self, span: Span) -> Result<&'a str> {
        let text = self.get_span(span);
        match validate_id(span.start, text) {
            Ok(()) => Ok(text),
            Err(err) => Err(err.into()),
        }
    }
165
166
234k
    /// Returns the identifier at `span` with its leading `%` removed, after
    /// validating the remainder.
    ///
    /// # Errors
    ///
    /// Fails when `validate_id` rejects the text following the `%`.
    ///
    /// # Panics
    ///
    /// Panics if the text at `span` does not begin with `%`.
    pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
        let id_part = self.get_span(span).strip_prefix('%').unwrap();
        match validate_id(span.start, id_part) {
            Ok(()) => Ok(id_part),
            Err(err) => Err(err.into()),
        }
    }
172
173
3.56M
    pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
174
        loop {
175
4.15M
            match self.next_raw()? {
176
591k
                Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
177
3.56M
                other => break Ok(other),
178
            }
179
        }
180
3.56M
    }
181
182
    /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an
183
    /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more
184
    /// tokens available.
185
5.17M
    pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
186
5.17M
        let (str_start, ch) = match self.chars.next() {
187
5.13M
            Some(pair) => pair,
188
42.2k
            None => return Ok(None),
189
        };
190
5.13M
        let start = self.span_offset + u32::try_from(str_start).unwrap();
191
5.13M
        let token = match ch {
192
            '\n' | '\t' | ' ' => {
193
                // Eat all contiguous whitespace tokens
194
1.68M
                while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
195
1.06M
                Whitespace
196
            }
197
            '/' => {
198
                // Eat a line comment if it's `//...`
199
7.95k
                if self.eatc('/') {
200
0
                    for (_, ch) in &mut self.chars {
201
0
                        if ch == '\n' {
202
0
                            break;
203
0
                        }
204
                    }
205
0
                    Comment
206
                // eat a block comment if it's `/*...`
207
7.95k
                } else if self.eatc('*') {
208
0
                    let mut depth = 1;
209
0
                    while depth > 0 {
210
0
                        let (_, ch) = match self.chars.next() {
211
0
                            Some(pair) => pair,
212
0
                            None => return Err(Error::UnterminatedComment(start)),
213
                        };
214
0
                        match ch {
215
0
                            '/' if self.eatc('*') => depth += 1,
216
0
                            '*' if self.eatc('/') => depth -= 1,
217
0
                            _ => {}
218
                        }
219
                    }
220
0
                    Comment
221
                } else {
222
7.95k
                    Slash
223
                }
224
            }
225
6.86k
            '=' => Equals,
226
358k
            ',' => Comma,
227
108k
            ':' => Colon,
228
116k
            '.' => Period,
229
81.3k
            ';' => Semicolon,
230
40.2k
            '(' => LeftParen,
231
68.1k
            ')' => RightParen,
232
120k
            '{' => LeftBrace,
233
228k
            '}' => RightBrace,
234
202k
            '<' => LessThan,
235
234k
            '>' => GreaterThan,
236
0
            '*' => Star,
237
31.4k
            '@' => At,
238
            '-' => {
239
33.7k
                if self.eatc('>') {
240
15.7k
                    RArrow
241
                } else {
242
18.0k
                    Minus
243
                }
244
            }
245
34.0k
            '+' => Plus,
246
            '%' => {
247
482k
                let mut iter = self.chars.clone();
248
482k
                if let Some((_, ch)) = iter.next() {
249
482k
                    if is_keylike_start(ch) {
250
482k
                        self.chars = iter.clone();
251
3.47M
                        while let Some((_, ch)) = iter.next() {
252
3.47M
                            if !is_keylike_continue(ch) {
253
482k
                                break;
254
2.99M
                            }
255
2.99M
                            self.chars = iter.clone();
256
                        }
257
0
                    }
258
0
                }
259
482k
                ExplicitId
260
            }
261
1.91M
            ch if is_keylike_start(ch) => {
262
1.66M
                let remaining = self.chars.chars.as_str().len();
263
1.66M
                let mut iter = self.chars.clone();
264
9.42M
                while let Some((_, ch)) = iter.next() {
265
9.42M
                    if !is_keylike_continue(ch) {
266
1.66M
                        break;
267
7.75M
                    }
268
7.75M
                    self.chars = iter.clone();
269
                }
270
1.66M
                let str_end =
271
1.66M
                    str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
272
1.66M
                match &self.input[str_start..str_end] {
273
1.66M
                    "use" => Use,
274
1.60M
                    "type" => Type,
275
1.59M
                    "func" => Func,
276
1.56M
                    "u8" => U8,
277
1.55M
                    "u16" => U16,
278
1.55M
                    "u32" => U32,
279
1.54M
                    "u64" => U64,
280
1.54M
                    "s8" => S8,
281
1.53M
                    "s16" => S16,
282
1.53M
                    "s32" => S32,
283
1.53M
                    "s64" => S64,
284
1.52M
                    "f32" => F32,
285
1.51M
                    "f64" => F64,
286
1.50M
                    "float32" if !self.require_f32_f64 => F32,
287
1.50M
                    "float64" if !self.require_f32_f64 => F64,
288
1.50M
                    "char" => Char,
289
1.49M
                    "resource" => Resource,
290
1.48M
                    "own" => Own,
291
1.47M
                    "borrow" => Borrow,
292
1.47M
                    "record" => Record,
293
1.46M
                    "flags" => Flags,
294
1.45M
                    "variant" => Variant,
295
1.43M
                    "enum" => Enum,
296
1.32M
                    "bool" => Bool,
297
1.14M
                    "string" => String_,
298
1.14M
                    "option" => Option_,
299
1.11M
                    "result" => Result_,
300
1.05M
                    "future" => Future,
301
1.04M
                    "stream" => Stream,
302
1.03M
                    "error-context" => ErrorContext,
303
1.01M
                    "list" => List,
304
898k
                    "_" => Underscore,
305
896k
                    "as" => As,
306
882k
                    "from" => From_,
307
882k
                    "static" => Static,
308
880k
                    "interface" => Interface,
309
613k
                    "tuple" => Tuple,
310
561k
                    "world" => World,
311
472k
                    "import" => Import,
312
453k
                    "export" => Export,
313
440k
                    "package" => Package,
314
408k
                    "constructor" => Constructor,
315
405k
                    "include" => Include,
316
405k
                    "with" => With,
317
405k
                    "async" => Async,
318
388k
                    _ => Id,
319
                }
320
            }
321
322
242k
            ch if ch.is_ascii_digit() => {
323
242k
                let mut iter = self.chars.clone();
324
247k
                while let Some((_, ch)) = iter.next() {
325
247k
                    if !ch.is_ascii_digit() {
326
242k
                        break;
327
4.82k
                    }
328
4.82k
                    self.chars = iter.clone();
329
                }
330
331
242k
                Integer
332
            }
333
334
0
            ch => return Err(Error::Unexpected(start, ch)),
335
        };
336
5.13M
        let end = match self.chars.clone().next() {
337
5.11M
            Some((i, _)) => i,
338
19.9k
            None => self.input.len(),
339
        };
340
341
5.13M
        let end = self.span_offset + u32::try_from(end).unwrap();
342
5.13M
        Ok(Some((Span { start, end }, token)))
343
5.17M
    }
344
345
1.29M
    /// Consumes the next significant token only if it is `expected`.
    ///
    /// Returns `Ok(true)` when the token matched and was consumed. Returns
    /// `Ok(false)`, leaving the tokenizer where it was, when a different
    /// token or the end of input comes next.
    ///
    /// # Errors
    ///
    /// Forwards a lexing error from the lookahead; the tokenizer is left
    /// unchanged in that case too.
    pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
        // Lex on a copy so that a mismatch costs nothing to undo.
        let mut probe = self.clone();
        let matched = matches!(probe.next()?, Some((_, found)) if found == expected);
        if matched {
            *self = probe;
        }
        Ok(matched)
    }
356
357
967k
    pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
358
967k
        match self.next()? {
359
967k
            Some((span, found)) => {
360
967k
                if expected == found {
361
967k
                    Ok(span)
362
                } else {
363
0
                    Err(Error::Wanted {
364
0
                        at: span.start,
365
0
                        expected: expected.describe(),
366
0
                        found: found.describe(),
367
0
                    })
368
                }
369
            }
370
0
            None => Err(Error::Wanted {
371
0
                at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
372
0
                expected: expected.describe(),
373
0
                found: "eof",
374
0
            }),
375
        }
376
967k
    }
377
378
4.13M
    fn eatc(&mut self, ch: char) -> bool {
379
4.13M
        let mut iter = self.chars.clone();
380
4.13M
        match iter.next() {
381
4.07M
            Some((_, ch2)) if ch == ch2 => {
382
635k
                self.chars = iter;
383
635k
                true
384
            }
385
3.49M
            _ => false,
386
        }
387
4.13M
    }
388
389
0
    pub fn eof_span(&self) -> Span {
390
0
        let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
391
0
        Span { start: end, end }
392
0
    }
393
}
394
395
impl<'a> Iterator for CrlfFold<'a> {
396
    type Item = (usize, char);
397
398
28.0M
    fn next(&mut self) -> Option<(usize, char)> {
399
28.0M
        self.chars.next().map(|(i, c)| {
400
27.9M
            if c == '\r' {
401
0
                let mut attempt = self.chars.clone();
402
0
                if let Some((_, '\n')) = attempt.next() {
403
0
                    self.chars = attempt;
404
0
                    return (i, '\n');
405
0
                }
406
27.9M
            }
407
27.9M
            (i, c)
408
27.9M
        })
409
28.0M
    }
410
}
411
412
20.5k
fn detect_invalid_input(input: &str) -> Result<()> {
413
    // Disallow specific codepoints.
414
20.5k
    let mut line = 1;
415
9.27M
    for ch in input.chars() {
416
8.76M
        match ch {
417
513k
            '\n' => line += 1,
418
0
            '\r' | '\t' => {}
419
420
            // Bidirectional override codepoints can be used to craft source code that
421
            // appears to have a different meaning than its actual meaning. See
422
            // [CVE-2021-42574] for background and motivation.
423
            //
424
            // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
425
            '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
426
            | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
427
0
                bail!(
428
0
                    "Input contains bidirectional override codepoint {:?} at line {}",
429
0
                    ch.escape_unicode(),
430
                    line
431
                );
432
            }
433
434
            // Disallow several characters which are deprecated or discouraged in Unicode.
435
            //
436
            // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs.
437
            // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks.
438
            // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels.
439
            // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see
440
            // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
441
            '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
442
            | '\u{17b4}' | '\u{17b5}' => {
443
0
                bail!(
444
0
                    "Codepoint {:?} at line {} is discouraged by Unicode",
445
0
                    ch.escape_unicode(),
446
                    line
447
                );
448
            }
449
450
            // Disallow control codes other than the ones explicitly recognized above,
451
            // so that viewing a wit file on a terminal doesn't have surprising side
452
            // effects or appear to have a different meaning than its actual meaning.
453
8.76M
            ch if ch.is_control() => {
454
0
                bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
455
            }
456
457
8.76M
            _ => {}
458
        }
459
    }
460
461
20.5k
    Ok(())
462
20.5k
}
463
464
2.39M
fn is_keylike_start(ch: char) -> bool {
465
    // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars,
466
    // but we'll diagnose that after we've lexed the full string.
467
2.39M
    UnicodeXID::is_xid_start(ch) || ch == '_' || ch == '-'
468
2.39M
}
469
470
12.8M
fn is_keylike_continue(ch: char) -> bool {
471
    // Lex any XID continue (which includes `_`) or '-'.
472
12.8M
    UnicodeXID::is_xid_continue(ch) || ch == '-'
473
12.8M
}
474
475
407k
pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
476
    // IDs must have at least one part.
477
407k
    if id.is_empty() {
478
0
        return Err(Error::IdPartEmpty(start));
479
407k
    }
480
481
    // Ids consist of parts separated by '-'s.
482
407k
    for part in id.split('-') {
483
        // Parts must be non-empty and contain either all ASCII lowercase or
484
        // all ASCII uppercase.
485
407k
        let upper = match part.chars().next() {
486
0
            None => return Err(Error::IdPartEmpty(start)),
487
407k
            Some(first) => {
488
407k
                if first.is_ascii_lowercase() {
489
407k
                    false
490
0
                } else if first.is_ascii_uppercase() {
491
0
                    true
492
                } else {
493
0
                    return Err(Error::InvalidCharInId(start, first));
494
                }
495
            }
496
        };
497
498
2.76M
        for ch in part.chars() {
499
2.76M
            if ch.is_ascii_digit() {
500
385k
                // Digits are accepted in both uppercase and lowercase segments.
501
2.38M
            } else if upper {
502
0
                if !ch.is_ascii_uppercase() {
503
0
                    return Err(Error::InvalidCharInId(start, ch));
504
0
                }
505
2.38M
            } else if !ch.is_ascii_lowercase() {
506
0
                return Err(Error::InvalidCharInId(start, ch));
507
2.38M
            }
508
        }
509
    }
510
511
407k
    Ok(())
512
407k
}
513
514
impl Token {
515
0
    pub fn describe(&self) -> &'static str {
516
0
        match self {
517
0
            Whitespace => "whitespace",
518
0
            Comment => "a comment",
519
0
            Equals => "'='",
520
0
            Comma => "','",
521
0
            Colon => "':'",
522
0
            Period => "'.'",
523
0
            Semicolon => "';'",
524
0
            LeftParen => "'('",
525
0
            RightParen => "')'",
526
0
            LeftBrace => "'{'",
527
0
            RightBrace => "'}'",
528
0
            LessThan => "'<'",
529
0
            GreaterThan => "'>'",
530
0
            Use => "keyword `use`",
531
0
            Type => "keyword `type`",
532
0
            Func => "keyword `func`",
533
0
            U8 => "keyword `u8`",
534
0
            U16 => "keyword `u16`",
535
0
            U32 => "keyword `u32`",
536
0
            U64 => "keyword `u64`",
537
0
            S8 => "keyword `s8`",
538
0
            S16 => "keyword `s16`",
539
0
            S32 => "keyword `s32`",
540
0
            S64 => "keyword `s64`",
541
0
            F32 => "keyword `f32`",
542
0
            F64 => "keyword `f64`",
543
0
            Char => "keyword `char`",
544
0
            Own => "keyword `own`",
545
0
            Borrow => "keyword `borrow`",
546
0
            Resource => "keyword `resource`",
547
0
            Record => "keyword `record`",
548
0
            Flags => "keyword `flags`",
549
0
            Variant => "keyword `variant`",
550
0
            Enum => "keyword `enum`",
551
0
            Bool => "keyword `bool`",
552
0
            String_ => "keyword `string`",
553
0
            Option_ => "keyword `option`",
554
0
            Result_ => "keyword `result`",
555
0
            Future => "keyword `future`",
556
0
            Stream => "keyword `stream`",
557
0
            ErrorContext => "keyword `error-context`",
558
0
            List => "keyword `list`",
559
0
            Underscore => "keyword `_`",
560
0
            Id => "an identifier",
561
0
            ExplicitId => "an '%' identifier",
562
0
            RArrow => "`->`",
563
0
            Star => "`*`",
564
0
            At => "`@`",
565
0
            Slash => "`/`",
566
0
            Plus => "`+`",
567
0
            Minus => "`-`",
568
0
            As => "keyword `as`",
569
0
            From_ => "keyword `from`",
570
0
            Static => "keyword `static`",
571
0
            Interface => "keyword `interface`",
572
0
            Tuple => "keyword `tuple`",
573
0
            Import => "keyword `import`",
574
0
            Export => "keyword `export`",
575
0
            World => "keyword `world`",
576
0
            Package => "keyword `package`",
577
0
            Constructor => "keyword `constructor`",
578
0
            Integer => "an integer",
579
0
            Include => "keyword `include`",
580
0
            With => "keyword `with`",
581
0
            Async => "keyword `async`",
582
        }
583
0
    }
584
}
585
586
impl std::error::Error for Error {}
587
588
impl fmt::Display for Error {
589
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
590
0
        match self {
591
0
            Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
592
0
            Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
593
            Error::Wanted {
594
0
                expected, found, ..
595
0
            } => write!(f, "expected {expected}, found {found}"),
596
0
            Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
597
0
            Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
598
0
            Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
599
        }
600
0
    }
601
}
602
603
#[test]
fn test_validate_id() {
    // Identifiers that must be accepted.
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    for id in accepted {
        validate_id(0, id).unwrap();
    }

    // Identifiers that must be rejected.
    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "apple-0",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
    ];
    for id in rejected {
        assert!(validate_id(0, id).is_err(), "{id:?} should be rejected");
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
660
661
#[test]
fn test_tokenizer() {
    /// Lexes `s` completely, returning the kinds of its significant tokens.
    fn collect(s: &str) -> Result<Vec<Token>> {
        let mut lexer = Tokenizer::new(s, 0, None)?;
        let mut kinds = Vec::new();
        while let Some((_span, kind)) = lexer.next()? {
            kinds.push(kind);
        }
        Ok(kinds)
    }

    /// Asserts that `src` lexes to exactly `expected`.
    fn check(src: &str, expected: &[Token]) {
        assert_eq!(collect(src).unwrap(), expected);
    }

    check("", &[]);
    check("_", &[Token::Underscore]);
    check("apple pear", &[Token::Id, Token::Id]);

    // Each of these lexes as a single identifier, even when the identifier
    // would not pass validation afterwards.
    let single_ids = [
        "apple",
        "apple-pear",
        "apple--pear",
        "apple-Pear",
        "apple-pear-grape",
        "_a_p_p_l_e_",
        "garçon",
        "hühnervögel",
        "москва",
        "東京",
        "garçon-hühnervögel-москва-東京",
        "a0",
        "a",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    for src in single_ids {
        check(src, &[Token::Id]);
    }

    // `%`-prefixed identifiers, including a bare `%`.
    for src in ["%a", "%a-a", "%bool", "%"] {
        check(src, &[Token::ExplicitId]);
    }

    check("func", &[Token::Func]);
    check(
        "a: func()",
        &[
            Token::Id,
            Token::Colon,
            Token::Func,
            Token::LeftParen,
            Token::RightParen,
        ],
    );

    check("resource", &[Token::Resource]);

    check("own", &[Token::Own]);
    check(
        "own<some-id>",
        &[Token::Own, Token::LessThan, Token::Id, Token::GreaterThan],
    );

    check("borrow", &[Token::Borrow]);
    check(
        "borrow<some-id>",
        &[
            Token::Borrow,
            Token::LessThan,
            Token::Id,
            Token::GreaterThan,
        ],
    );

    // Inputs that must be refused, paired with the reason.
    let refused = [
        ("\u{149}", "strongly discouraged"),
        ("\u{673}", "strongly discouraged"),
        ("\u{17a3}", "strongly discouraged"),
        ("\u{17a4}", "strongly discouraged"),
        ("\u{202a}", "bidirectional override"),
        ("\u{2068}", "bidirectional override"),
        ("\u{0}", "control code"),
        ("\u{b}", "control code"),
        ("\u{c}", "control code"),
        ("\u{85}", "control code"),
    ];
    for (src, reason) in refused {
        assert!(collect(src).is_err(), "{reason}");
    }
}