Coverage Report

Created: 2026-06-21 07:19

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/wasm-tools/crates/wit-parser/src/ast/lex.rs
Line
Count
Source
1
#[cfg(test)]
2
use alloc::{vec, vec::Vec};
3
use core::char;
4
use core::fmt;
5
use core::result::Result;
6
use core::str;
7
8
use self::Token::*;
9
10
#[derive(Clone)]
11
pub struct Tokenizer<'a> {
12
    input: &'a str,
13
    span_offset: u32,
14
    chars: CrlfFold<'a>,
15
}
16
17
#[derive(Clone)]
18
struct CrlfFold<'a> {
19
    chars: str::CharIndices<'a>,
20
}
21
22
/// A span, designating a range of bytes where a token is located.
23
///
24
/// Uses `u32::MAX` as a sentinel value to represent unknown spans (e.g.,
25
/// decoded from binary).
26
#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)]
27
pub struct Span {
28
    start: u32,
29
    end: u32,
30
}
31
32
impl Default for Span {
33
1.30M
    fn default() -> Span {
34
1.30M
        Span {
35
1.30M
            start: u32::MAX,
36
1.30M
            end: u32::MAX,
37
1.30M
        }
38
1.30M
    }
39
}
40
41
impl Span {
42
6.49M
    pub fn new(start: u32, end: u32) -> Span {
43
6.49M
        let span = Span { start, end };
44
6.49M
        assert!(span.is_known(), "cannot create a span with u32::MAX");
45
6.49M
        span
46
6.49M
    }
47
48
    /// Adjusts this span by adding the given byte offset to both start and end.
49
680k
    pub fn adjust(&mut self, offset: u32) {
50
680k
        if self.is_known() {
51
493k
            self.start += offset;
52
493k
            self.end += offset;
53
493k
        }
54
680k
    }
55
56
    /// Returns the start offset, panicking if this is an unknown span.
57
1.29M
    pub fn start(&self) -> u32 {
58
1.29M
        assert!(self.is_known(), "cannot get start of unknown span");
59
1.29M
        self.start
60
1.29M
    }
61
62
    /// Returns the end offset, panicking if this is an unknown span.
63
953k
    pub fn end(&self) -> u32 {
64
953k
        assert!(self.is_known(), "cannot get end of unknown span");
65
953k
        self.end
66
953k
    }
67
68
    /// Sets the end offset. If this is unknown, converts to a zero-width span at that position.
69
175k
    pub fn set_end(&mut self, new_end: u32) {
70
175k
        if !self.is_known() {
71
0
            self.start = new_end;
72
175k
        }
73
175k
        self.end = new_end;
74
175k
    }
75
76
    /// Sets the start offset. If this is unknown, converts to a zero-width span at that position.
77
0
    pub fn set_start(&mut self, new_start: u32) {
78
0
        if !self.is_known() {
79
0
            self.end = new_start;
80
0
        }
81
0
        self.start = new_start;
82
0
    }
83
84
    /// Returns true if this span has a known source location.
85
9.60M
    pub fn is_known(&self) -> bool {
86
9.60M
        self.start != u32::MAX && self.end != u32::MAX
87
9.60M
    }
88
}
89
90
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
91
pub enum Token {
92
    Whitespace,
93
    Comment,
94
95
    Equals,
96
    Comma,
97
    Colon,
98
    Period,
99
    Semicolon,
100
    LeftParen,
101
    RightParen,
102
    LeftBrace,
103
    RightBrace,
104
    LessThan,
105
    GreaterThan,
106
    RArrow,
107
    Star,
108
    At,
109
    Slash,
110
    Plus,
111
    Minus,
112
113
    Use,
114
    Type,
115
    Func,
116
    U8,
117
    U16,
118
    U32,
119
    U64,
120
    S8,
121
    S16,
122
    S32,
123
    S64,
124
    F32,
125
    F64,
126
    Char,
127
    Record,
128
    Resource,
129
    Own,
130
    Borrow,
131
    Flags,
132
    Variant,
133
    Enum,
134
    Bool,
135
    String_,
136
    Option_,
137
    Result_,
138
    Future,
139
    Stream,
140
    ErrorContext,
141
    List,
142
    Map,
143
    Underscore,
144
    As,
145
    From_,
146
    Static,
147
    Interface,
148
    Tuple,
149
    Import,
150
    Export,
151
    World,
152
    Package,
153
    Constructor,
154
    Async,
155
156
    Id,
157
    ExplicitId,
158
159
    Integer,
160
161
    Include,
162
    With,
163
}
164
165
#[derive(Eq, PartialEq, Debug)]
166
#[allow(dead_code)]
167
pub enum Error {
168
    ControlCodepoint(u32, char),
169
    DeprecatedCodepoint(u32, char),
170
    ForbiddenCodepoint(u32, char),
171
    InvalidCharInId(u32, char),
172
    IdPartEmpty(u32),
173
    InvalidEscape(u32, char),
174
    Unexpected(u32, char),
175
    UnterminatedComment(u32),
176
    Wanted {
177
        at: u32,
178
        expected: &'static str,
179
        found: &'static str,
180
    },
181
}
182
183
impl<'a> Tokenizer<'a> {
184
21.7k
    pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>, Error> {
185
21.7k
        detect_invalid_input(input)?;
186
187
21.7k
        let mut t = Tokenizer {
188
21.7k
            input,
189
21.7k
            span_offset,
190
21.7k
            chars: CrlfFold {
191
21.7k
                chars: input.char_indices(),
192
21.7k
            },
193
21.7k
        };
194
        // Eat utf-8 BOM
195
21.7k
        t.eatc('\u{feff}');
196
21.7k
        Ok(t)
197
21.7k
    }
198
199
73.9k
    pub fn expect_semicolon(&mut self) -> Result<(), Error> {
200
73.9k
        self.expect(Token::Semicolon)?;
201
73.9k
        Ok(())
202
73.9k
    }
203
204
679k
    pub fn get_span(&self, span: Span) -> &'a str {
205
679k
        let start = usize::try_from(span.start() - self.span_offset).unwrap();
206
679k
        let end = usize::try_from(span.end() - self.span_offset).unwrap();
207
679k
        &self.input[start..end]
208
679k
    }
209
210
197k
    pub fn parse_id(&self, span: Span) -> Result<&'a str, Error> {
211
197k
        let ret = self.get_span(span);
212
197k
        validate_id(span.start(), &ret)?;
213
197k
        Ok(ret)
214
197k
    }
215
216
333k
    pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str, Error> {
217
333k
        let token = self.get_span(span);
218
333k
        let id_part = token.strip_prefix('%').unwrap();
219
333k
        validate_id(span.start(), id_part)?;
220
333k
        Ok(id_part)
221
333k
    }
222
223
4.60M
    pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
224
        loop {
225
5.37M
            match self.next_raw()? {
226
764k
                Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
227
4.60M
                other => break Ok(other),
228
            }
229
        }
230
4.60M
    }
231
232
    /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an
233
    /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more
234
    /// tokens available.
235
6.45M
    pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
236
6.45M
        let (str_start, ch) = match self.chars.next() {
237
6.41M
            Some(pair) => pair,
238
44.7k
            None => return Ok(None),
239
        };
240
6.41M
        let start = self.span_offset + u32::try_from(str_start).unwrap();
241
6.41M
        let token = match ch {
242
            '\n' | '\t' | ' ' => {
243
                // Eat all contiguous whitespace tokens
244
1.86M
                while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
245
1.26M
                Whitespace
246
            }
247
            '/' => {
248
                // Eat a line comment if it's `//...`
249
28.1k
                if self.eatc('/') {
250
0
                    for (_, ch) in &mut self.chars {
251
0
                        if ch == '\n' {
252
0
                            break;
253
0
                        }
254
                    }
255
0
                    Comment
256
                // eat a block comment if it's `/*...`
257
28.1k
                } else if self.eatc('*') {
258
0
                    let mut depth = 1;
259
0
                    while depth > 0 {
260
0
                        let (_, ch) = match self.chars.next() {
261
0
                            Some(pair) => pair,
262
0
                            None => return Err(Error::UnterminatedComment(start)),
263
                        };
264
0
                        match ch {
265
0
                            '/' if self.eatc('*') => depth += 1,
266
0
                            '*' if self.eatc('/') => depth -= 1,
267
0
                            _ => {}
268
                        }
269
                    }
270
0
                    Comment
271
                } else {
272
28.1k
                    Slash
273
                }
274
            }
275
8.83k
            '=' => Equals,
276
368k
            ',' => Comma,
277
167k
            ':' => Colon,
278
234k
            '.' => Period,
279
143k
            ';' => Semicolon,
280
56.9k
            '(' => LeftParen,
281
90.1k
            ')' => RightParen,
282
133k
            '{' => LeftBrace,
283
240k
            '}' => RightBrace,
284
229k
            '<' => LessThan,
285
274k
            '>' => GreaterThan,
286
0
            '*' => Star,
287
52.2k
            '@' => At,
288
            '-' => {
289
55.0k
                if self.eatc('>') {
290
19.7k
                    RArrow
291
                } else {
292
35.2k
                    Minus
293
                }
294
            }
295
69.8k
            '+' => Plus,
296
            '%' => {
297
629k
                let mut iter = self.chars.clone();
298
629k
                if let Some((_, ch)) = iter.next() {
299
629k
                    if is_keylike_start(ch) {
300
629k
                        self.chars = iter.clone();
301
5.51M
                        while let Some((_, ch)) = iter.next() {
302
5.51M
                            if !is_keylike_continue(ch) {
303
629k
                                break;
304
4.88M
                            }
305
4.88M
                            self.chars = iter.clone();
306
                        }
307
0
                    }
308
0
                }
309
629k
                ExplicitId
310
            }
311
2.36M
            ch if is_keylike_start(ch) => {
312
1.99M
                let remaining = self.chars.chars.as_str().len();
313
1.99M
                let mut iter = self.chars.clone();
314
11.3M
                while let Some((_, ch)) = iter.next() {
315
11.3M
                    if !is_keylike_continue(ch) {
316
1.99M
                        break;
317
9.38M
                    }
318
9.38M
                    self.chars = iter.clone();
319
                }
320
1.99M
                let str_end =
321
1.99M
                    str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
322
1.99M
                match &self.input[str_start..str_end] {
323
1.99M
                    "use" => Use,
324
1.86M
                    "type" => Type,
325
1.84M
                    "func" => Func,
326
1.80M
                    "u8" => U8,
327
1.79M
                    "u16" => U16,
328
1.79M
                    "u32" => U32,
329
1.78M
                    "u64" => U64,
330
1.78M
                    "s8" => S8,
331
1.77M
                    "s16" => S16,
332
1.77M
                    "s32" => S32,
333
1.76M
                    "s64" => S64,
334
1.75M
                    "f32" => F32,
335
1.74M
                    "f64" => F64,
336
1.72M
                    "char" => Char,
337
1.70M
                    "resource" => Resource,
338
1.69M
                    "own" => Own,
339
1.69M
                    "borrow" => Borrow,
340
1.69M
                    "record" => Record,
341
1.67M
                    "flags" => Flags,
342
1.67M
                    "variant" => Variant,
343
1.63M
                    "enum" => Enum,
344
1.56M
                    "bool" => Bool,
345
1.35M
                    "string" => String_,
346
1.35M
                    "option" => Option_,
347
1.32M
                    "result" => Result_,
348
1.24M
                    "future" => Future,
349
1.23M
                    "stream" => Stream,
350
1.21M
                    "error-context" => ErrorContext,
351
1.17M
                    "list" => List,
352
1.06M
                    "map" => Map,
353
1.06M
                    "_" => Underscore,
354
1.05M
                    "as" => As,
355
1.02M
                    "from" => From_,
356
1.02M
                    "static" => Static,
357
1.02M
                    "interface" => Interface,
358
765k
                    "tuple" => Tuple,
359
697k
                    "world" => World,
360
608k
                    "import" => Import,
361
585k
                    "export" => Export,
362
497k
                    "package" => Package,
363
461k
                    "constructor" => Constructor,
364
458k
                    "include" => Include,
365
458k
                    "with" => With,
366
458k
                    "async" => Async,
367
439k
                    _ => Id,
368
                }
369
            }
370
371
372k
            ch if ch.is_ascii_digit() => {
372
372k
                let mut iter = self.chars.clone();
373
378k
                while let Some((_, ch)) = iter.next() {
374
378k
                    if !ch.is_ascii_digit() {
375
372k
                        break;
376
6.05k
                    }
377
6.05k
                    self.chars = iter.clone();
378
                }
379
380
372k
                Integer
381
            }
382
383
0
            ch => return Err(Error::Unexpected(start, ch)),
384
        };
385
6.41M
        let end = match self.chars.clone().next() {
386
6.38M
            Some((i, _)) => i,
387
21.2k
            None => self.input.len(),
388
        };
389
390
6.41M
        let end = self.span_offset + u32::try_from(end).unwrap();
391
6.41M
        Ok(Some((Span::new(start, end), token)))
392
6.45M
    }
393
394
1.64M
    pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
395
1.64M
        let mut other = self.clone();
396
1.64M
        match other.next()? {
397
1.64M
            Some((_span, found)) if expected == found => {
398
810k
                *self = other;
399
810k
                Ok(true)
400
            }
401
838k
            Some(_) => Ok(false),
402
575
            None => Ok(false),
403
        }
404
1.64M
    }
405
406
1.25M
    pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
407
1.25M
        match self.next()? {
408
1.25M
            Some((span, found)) => {
409
1.25M
                if expected == found {
410
1.25M
                    Ok(span)
411
                } else {
412
0
                    Err(Error::Wanted {
413
0
                        at: span.start(),
414
0
                        expected: expected.describe(),
415
0
                        found: found.describe(),
416
0
                    })
417
                }
418
            }
419
0
            None => Err(Error::Wanted {
420
0
                at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
421
0
                expected: expected.describe(),
422
0
                found: "eof",
423
0
            }),
424
        }
425
1.25M
    }
426
427
4.79M
    fn eatc(&mut self, ch: char) -> bool {
428
4.79M
        let mut iter = self.chars.clone();
429
4.79M
        match iter.next() {
430
4.73M
            Some((_, ch2)) if ch == ch2 => {
431
622k
                self.chars = iter;
432
622k
                true
433
            }
434
4.17M
            _ => false,
435
        }
436
4.79M
    }
437
438
0
    pub fn eof_span(&self) -> Span {
439
0
        let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
440
0
        Span::new(end, end)
441
0
    }
442
}
443
444
impl<'a> Iterator for CrlfFold<'a> {
445
    type Item = (usize, char);
446
447
35.5M
    fn next(&mut self) -> Option<(usize, char)> {
448
35.5M
        self.chars.next().map(|(i, c)| {
449
35.4M
            if c == '\r' {
450
0
                let mut attempt = self.chars.clone();
451
0
                if let Some((_, '\n')) = attempt.next() {
452
0
                    self.chars = attempt;
453
0
                    return (i, '\n');
454
0
                }
455
35.4M
            }
456
35.4M
            (i, c)
457
35.4M
        })
458
35.5M
    }
459
}
460
461
21.7k
fn detect_invalid_input(input: &str) -> Result<(), Error> {
462
    // Disallow specific codepoints.
463
12.3M
    for (pos, ch) in input.char_indices() {
464
11.8M
        match ch {
465
535k
            '\n' | '\r' | '\t' => {}
466
467
            // Bidirectional override codepoints can be used to craft source code that
468
            // appears to have a different meaning than its actual meaning. See
469
            // [CVE-2021-42574] for background and motivation.
470
            //
471
            // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
472
            '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
473
            | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
474
0
                return Err(Error::ForbiddenCodepoint(u32::try_from(pos).unwrap(), ch));
475
            }
476
477
            // Disallow several characters which are deprecated or discouraged in Unicode.
478
            //
479
            // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs.
480
            // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks.
481
            // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels.
482
            // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see
483
            // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
484
            '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
485
            | '\u{17b4}' | '\u{17b5}' => {
486
0
                return Err(Error::DeprecatedCodepoint(u32::try_from(pos).unwrap(), ch));
487
            }
488
489
            // Disallow control codes other than the ones explicitly recognized above,
490
            // so that viewing a wit file on a terminal doesn't have surprising side
491
            // effects or appear to have a different meaning than its actual meaning.
492
11.8M
            ch if ch.is_control() => {
493
0
                return Err(Error::ControlCodepoint(u32::try_from(pos).unwrap(), ch));
494
            }
495
496
11.8M
            _ => {}
497
        }
498
    }
499
500
21.7k
    Ok(())
501
21.7k
}
502
503
2.99M
fn is_keylike_start(ch: char) -> bool {
504
    // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars,
505
    // but we'll diagnose that after we've lexed the full string.
506
2.99M
    unicode_ident::is_xid_start(ch) || ch == '_' || ch == '-'
507
2.99M
}
508
509
16.8M
fn is_keylike_continue(ch: char) -> bool {
510
    // Lex any XID continue (which includes `_`) or '-'.
511
16.8M
    unicode_ident::is_xid_continue(ch) || ch == '-'
512
16.8M
}
513
514
530k
pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
515
    // IDs must have at least one part.
516
530k
    if id.is_empty() {
517
0
        return Err(Error::IdPartEmpty(start));
518
530k
    }
519
520
    // Ids consist of parts separated by '-'s.
521
530k
    for (idx, part) in id.split('-').enumerate() {
522
        // Parts must be non-empty and contain either all ASCII lowercase or
523
        // all ASCII uppercase. Non-first segment can also start with a digit.
524
530k
        let Some(first_char) = part.chars().next() else {
525
0
            return Err(Error::IdPartEmpty(start));
526
        };
527
530k
        if idx == 0 && !first_char.is_ascii_alphabetic() {
528
0
            return Err(Error::InvalidCharInId(start, first_char));
529
530k
        }
530
530k
        let mut upper = None;
531
4.54M
        for ch in part.chars() {
532
4.54M
            if ch.is_ascii_digit() {
533
430k
                // Digits are accepted in both uppercase and lowercase segments.
534
4.11M
            } else if ch.is_ascii_uppercase() {
535
0
                if upper.is_none() {
536
0
                    upper = Some(true);
537
0
                } else if let Some(false) = upper {
538
0
                    return Err(Error::InvalidCharInId(start, ch));
539
0
                }
540
4.11M
            } else if ch.is_ascii_lowercase() {
541
4.11M
                if upper.is_none() {
542
530k
                    upper = Some(false);
543
3.58M
                } else if let Some(true) = upper {
544
0
                    return Err(Error::InvalidCharInId(start, ch));
545
3.58M
                }
546
            } else {
547
0
                return Err(Error::InvalidCharInId(start, ch));
548
            }
549
        }
550
    }
551
552
530k
    Ok(())
553
530k
}
554
555
impl Token {
556
0
    pub fn describe(&self) -> &'static str {
557
0
        match self {
558
0
            Whitespace => "whitespace",
559
0
            Comment => "a comment",
560
0
            Equals => "'='",
561
0
            Comma => "','",
562
0
            Colon => "':'",
563
0
            Period => "'.'",
564
0
            Semicolon => "';'",
565
0
            LeftParen => "'('",
566
0
            RightParen => "')'",
567
0
            LeftBrace => "'{'",
568
0
            RightBrace => "'}'",
569
0
            LessThan => "'<'",
570
0
            GreaterThan => "'>'",
571
0
            Use => "keyword `use`",
572
0
            Type => "keyword `type`",
573
0
            Func => "keyword `func`",
574
0
            U8 => "keyword `u8`",
575
0
            U16 => "keyword `u16`",
576
0
            U32 => "keyword `u32`",
577
0
            U64 => "keyword `u64`",
578
0
            S8 => "keyword `s8`",
579
0
            S16 => "keyword `s16`",
580
0
            S32 => "keyword `s32`",
581
0
            S64 => "keyword `s64`",
582
0
            F32 => "keyword `f32`",
583
0
            F64 => "keyword `f64`",
584
0
            Char => "keyword `char`",
585
0
            Own => "keyword `own`",
586
0
            Borrow => "keyword `borrow`",
587
0
            Resource => "keyword `resource`",
588
0
            Record => "keyword `record`",
589
0
            Flags => "keyword `flags`",
590
0
            Variant => "keyword `variant`",
591
0
            Enum => "keyword `enum`",
592
0
            Bool => "keyword `bool`",
593
0
            String_ => "keyword `string`",
594
0
            Option_ => "keyword `option`",
595
0
            Result_ => "keyword `result`",
596
0
            Future => "keyword `future`",
597
0
            Stream => "keyword `stream`",
598
0
            ErrorContext => "keyword `error-context`",
599
0
            List => "keyword `list`",
600
0
            Map => "keyword `map`",
601
0
            Underscore => "keyword `_`",
602
0
            Id => "an identifier",
603
0
            ExplicitId => "an '%' identifier",
604
0
            RArrow => "`->`",
605
0
            Star => "`*`",
606
0
            At => "`@`",
607
0
            Slash => "`/`",
608
0
            Plus => "`+`",
609
0
            Minus => "`-`",
610
0
            As => "keyword `as`",
611
0
            From_ => "keyword `from`",
612
0
            Static => "keyword `static`",
613
0
            Interface => "keyword `interface`",
614
0
            Tuple => "keyword `tuple`",
615
0
            Import => "keyword `import`",
616
0
            Export => "keyword `export`",
617
0
            World => "keyword `world`",
618
0
            Package => "keyword `package`",
619
0
            Constructor => "keyword `constructor`",
620
0
            Integer => "an integer",
621
0
            Include => "keyword `include`",
622
0
            With => "keyword `with`",
623
0
            Async => "keyword `async`",
624
        }
625
0
    }
626
}
627
628
impl core::error::Error for Error {}
629
630
impl Error {
631
    /// Returns the byte offset in the source map where this error occurred.
632
0
    pub fn position(&self) -> u32 {
633
0
        match self {
634
0
            Error::ControlCodepoint(at, _)
635
0
            | Error::DeprecatedCodepoint(at, _)
636
0
            | Error::ForbiddenCodepoint(at, _)
637
0
            | Error::InvalidCharInId(at, _)
638
0
            | Error::IdPartEmpty(at)
639
0
            | Error::InvalidEscape(at, _)
640
0
            | Error::Unexpected(at, _)
641
0
            | Error::UnterminatedComment(at) => *at,
642
0
            Error::Wanted { at, .. } => *at,
643
        }
644
0
    }
645
}
646
647
impl fmt::Display for Error {
648
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
649
0
        match self {
650
0
            Error::ControlCodepoint(_, ch) => write!(f, "Control code '{}'", ch.escape_unicode()),
651
0
            Error::DeprecatedCodepoint(_, ch) => {
652
0
                write!(
653
0
                    f,
654
                    "Codepoint {:?} is discouraged by Unicode",
655
0
                    ch.escape_unicode()
656
                )
657
            }
658
0
            Error::ForbiddenCodepoint(_, ch) => {
659
0
                write!(
660
0
                    f,
661
                    "Input contains bidirectional override codepoint {:?}",
662
0
                    ch.escape_unicode()
663
                )
664
            }
665
0
            Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
666
0
            Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
667
            Error::Wanted {
668
0
                expected, found, ..
669
0
            } => write!(f, "expected {expected}, found {found}"),
670
0
            Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
671
0
            Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
672
0
            Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
673
        }
674
0
    }
675
}
676
677
#[test]
678
fn test_validate_id() {
679
    validate_id(0, "apple").unwrap();
680
    validate_id(0, "apple-pear").unwrap();
681
    validate_id(0, "apple-pear-grape").unwrap();
682
    validate_id(0, "a0").unwrap();
683
    validate_id(0, "a").unwrap();
684
    validate_id(0, "a-a").unwrap();
685
    validate_id(0, "bool").unwrap();
686
    validate_id(0, "APPLE").unwrap();
687
    validate_id(0, "APPLE-PEAR").unwrap();
688
    validate_id(0, "APPLE-PEAR-GRAPE").unwrap();
689
    validate_id(0, "apple-PEAR-grape").unwrap();
690
    validate_id(0, "APPLE-pear-GRAPE").unwrap();
691
    validate_id(0, "ENOENT").unwrap();
692
    validate_id(0, "is-XML").unwrap();
693
    validate_id(0, "apple-0").unwrap();
694
    validate_id(0, "a0-000-3d4a-54FF").unwrap();
695
696
    assert!(validate_id(0, "").is_err());
697
    assert!(validate_id(0, "0").is_err());
698
    assert!(validate_id(0, "%").is_err());
699
    assert!(validate_id(0, "$").is_err());
700
    assert!(validate_id(0, "0a").is_err());
701
    assert!(validate_id(0, ".").is_err());
702
    assert!(validate_id(0, "·").is_err());
703
    assert!(validate_id(0, "a a").is_err());
704
    assert!(validate_id(0, "_").is_err());
705
    assert!(validate_id(0, "-").is_err());
706
    assert!(validate_id(0, "a-").is_err());
707
    assert!(validate_id(0, "-a").is_err());
708
    assert!(validate_id(0, "Apple").is_err());
709
    assert!(validate_id(0, "applE").is_err());
710
    assert!(validate_id(0, "-apple-pear").is_err());
711
    assert!(validate_id(0, "apple-pear-").is_err());
712
    assert!(validate_id(0, "apple_pear").is_err());
713
    assert!(validate_id(0, "apple.pear").is_err());
714
    assert!(validate_id(0, "apple pear").is_err());
715
    assert!(validate_id(0, "apple/pear").is_err());
716
    assert!(validate_id(0, "apple|pear").is_err());
717
    assert!(validate_id(0, "apple-Pear").is_err());
718
    assert!(validate_id(0, "()()").is_err());
719
    assert!(validate_id(0, "").is_err());
720
    assert!(validate_id(0, "*").is_err());
721
    assert!(validate_id(0, "apple\u{5f3}pear").is_err());
722
    assert!(validate_id(0, "apple\u{200c}pear").is_err());
723
    assert!(validate_id(0, "apple\u{200d}pear").is_err());
724
    assert!(validate_id(0, "apple--pear").is_err());
725
    assert!(validate_id(0, "_apple").is_err());
726
    assert!(validate_id(0, "apple_").is_err());
727
    assert!(validate_id(0, "_Znwj").is_err());
728
    assert!(validate_id(0, "__i386").is_err());
729
    assert!(validate_id(0, "__i386__").is_err());
730
    assert!(validate_id(0, "Москва").is_err());
731
    assert!(validate_id(0, "garçon-hühnervögel-Москва-東京").is_err());
732
    assert!(validate_id(0, "a0-000-3d4A-54Ff").is_err());
733
    assert!(validate_id(0, "😼").is_err(), "non-identifier");
734
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
735
}
736
737
#[test]
738
fn test_tokenizer() {
739
    fn collect(s: &str) -> Result<Vec<Token>, Error> {
740
        let mut t = Tokenizer::new(s, 0)?;
741
        let mut tokens = Vec::new();
742
        while let Some(token) = t.next()? {
743
            tokens.push(token.1);
744
        }
745
        Ok(tokens)
746
    }
747
748
    assert_eq!(collect("").unwrap(), vec![]);
749
    assert_eq!(collect("_").unwrap(), vec![Token::Underscore]);
750
    assert_eq!(collect("apple").unwrap(), vec![Token::Id]);
751
    assert_eq!(collect("apple-pear").unwrap(), vec![Token::Id]);
752
    assert_eq!(collect("apple--pear").unwrap(), vec![Token::Id]);
753
    assert_eq!(collect("apple-Pear").unwrap(), vec![Token::Id]);
754
    assert_eq!(collect("apple-pear-grape").unwrap(), vec![Token::Id]);
755
    assert_eq!(collect("apple pear").unwrap(), vec![Token::Id, Token::Id]);
756
    assert_eq!(collect("_a_p_p_l_e_").unwrap(), vec![Token::Id]);
757
    assert_eq!(collect("garçon").unwrap(), vec![Token::Id]);
758
    assert_eq!(collect("hühnervögel").unwrap(), vec![Token::Id]);
759
    assert_eq!(collect("москва").unwrap(), vec![Token::Id]);
760
    assert_eq!(collect("東京").unwrap(), vec![Token::Id]);
761
    assert_eq!(
762
        collect("garçon-hühnervögel-москва-東京").unwrap(),
763
        vec![Token::Id]
764
    );
765
    assert_eq!(collect("a0").unwrap(), vec![Token::Id]);
766
    assert_eq!(collect("a").unwrap(), vec![Token::Id]);
767
    assert_eq!(collect("%a").unwrap(), vec![Token::ExplicitId]);
768
    assert_eq!(collect("%a-a").unwrap(), vec![Token::ExplicitId]);
769
    assert_eq!(collect("%bool").unwrap(), vec![Token::ExplicitId]);
770
    assert_eq!(collect("%").unwrap(), vec![Token::ExplicitId]);
771
    assert_eq!(collect("APPLE").unwrap(), vec![Token::Id]);
772
    assert_eq!(collect("APPLE-PEAR").unwrap(), vec![Token::Id]);
773
    assert_eq!(collect("APPLE-PEAR-GRAPE").unwrap(), vec![Token::Id]);
774
    assert_eq!(collect("apple-PEAR-grape").unwrap(), vec![Token::Id]);
775
    assert_eq!(collect("APPLE-pear-GRAPE").unwrap(), vec![Token::Id]);
776
    assert_eq!(collect("ENOENT").unwrap(), vec![Token::Id]);
777
    assert_eq!(collect("is-XML").unwrap(), vec![Token::Id]);
778
779
    assert_eq!(collect("func").unwrap(), vec![Token::Func]);
780
    assert_eq!(
781
        collect("a: func()").unwrap(),
782
        vec![
783
            Token::Id,
784
            Token::Colon,
785
            Token::Func,
786
            Token::LeftParen,
787
            Token::RightParen
788
        ]
789
    );
790
791
    assert_eq!(collect("resource").unwrap(), vec![Token::Resource]);
792
793
    assert_eq!(collect("own").unwrap(), vec![Token::Own]);
794
    assert_eq!(
795
        collect("own<some-id>").unwrap(),
796
        vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan]
797
    );
798
799
    assert_eq!(collect("borrow").unwrap(), vec![Token::Borrow]);
800
    assert_eq!(
801
        collect("borrow<some-id>").unwrap(),
802
        vec![
803
            Token::Borrow,
804
            Token::LessThan,
805
            Token::Id,
806
            Token::GreaterThan
807
        ]
808
    );
809
810
    assert!(collect("\u{149}").is_err(), "strongly discouraged");
811
    assert!(collect("\u{673}").is_err(), "strongly discouraged");
812
    assert!(collect("\u{17a3}").is_err(), "strongly discouraged");
813
    assert!(collect("\u{17a4}").is_err(), "strongly discouraged");
814
    assert!(collect("\u{202a}").is_err(), "bidirectional override");
815
    assert!(collect("\u{2068}").is_err(), "bidirectional override");
816
    assert!(collect("\u{0}").is_err(), "control code");
817
    assert!(collect("\u{b}").is_err(), "control code");
818
    assert!(collect("\u{c}").is_err(), "control code");
819
    assert!(collect("\u{85}").is_err(), "control code");
820
}