Coverage Report

Created: 2025-06-02 07:01

/rust/registry/src/index.crates.io-6f17d22bba15001f/pulldown-cmark-0.13.0/src/linklabel.rs
Line
Count
Source
1
// Copyright 2018 Google LLC
2
//
3
// Permission is hereby granted, free of charge, to any person obtaining a copy
4
// of this software and associated documentation files (the "Software"), to deal
5
// in the Software without restriction, including without limitation the rights
6
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
// copies of the Software, and to permit persons to whom the Software is
8
// furnished to do so, subject to the following conditions:
9
//
10
// The above copyright notice and this permission notice shall be included in
11
// all copies or substantial portions of the Software.
12
//
13
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
// THE SOFTWARE.
20
21
//! Link label parsing and matching.
22
23
use unicase::UniCase;
24
25
use crate::scanners::{is_ascii_punctuation, is_ascii_whitespace, scan_eol};
26
use crate::strings::CowStr;
27
28
/// A reference label, tagged by kind: the `Link` variant and the `Footnote`
/// variant each carry the label text as a `CowStr`.
#[derive(Debug)]
29
pub(crate) enum ReferenceLabel<'a> {
30
    Link(CowStr<'a>),
31
    Footnote(CowStr<'a>),
32
}
33
34
/// Link label text wrapped in `UniCase` (from the `unicase` crate), so that
/// labels are compared and hashed case-insensitively.
pub(crate) type LinkLabel<'a> = UniCase<CowStr<'a>>;
35
36
/// Footnote label text wrapped in `UniCase` (from the `unicase` crate), so
/// that labels are compared and hashed case-insensitively.
pub(crate) type FootnoteLabel<'a> = UniCase<CowStr<'a>>;
37
38
/// Assumes the opening bracket has already been scanned.
39
/// The line break handler determines what happens when a linebreak
40
/// is found. It is passed the bytes following the line break and
41
/// either returns `Some(k)`, where `k` is the number of bytes to skip,
42
/// or `None` to abort parsing the label.
43
/// Returns the number of bytes read (including closing bracket) and label on success.
44
18.9M
pub(crate) fn scan_link_label_rest<'t>(
45
18.9M
    text: &'t str,
46
18.9M
    linebreak_handler: &dyn Fn(&[u8]) -> Option<usize>,
47
18.9M
    is_in_table: bool,
48
18.9M
) -> Option<(usize, CowStr<'t>)> {
49
18.9M
    let bytes = text.as_bytes();
50
18.9M
    let mut ix = 0;
51
18.9M
    let mut only_white_space = true;
52
18.9M
    let mut codepoints = 0;
53
18.9M
    // no worries, doesn't allocate until we push things onto it
54
18.9M
    let mut label = String::new();
55
18.9M
    let mut mark = 0;
56
57
    loop {
58
138M
        if codepoints >= 1000 {
59
551
            return None;
60
138M
        }
61
138M
        match *bytes.get(ix)? {
62
2.54M
            b'[' => return None,
63
15.3M
            b']' => break,
64
            // Backslash escapes in link references are normally untouched, but
65
            // tables are an exception, because they're parsed as-if the tables
66
            // were parsed in a discrete pass, changing `\|` to `|`, and then
67
            // passing the changed string to the inline parser.
68
799k
            b'|' if is_in_table && ix != 0 && bytes.get(ix - 1) == Some(&b'\\') => {
69
28.9k
                // only way to reach this spot is to have `\\|` (even number of `\` before `|`)
70
28.9k
                label.push_str(&text[mark..ix - 1]);
71
28.9k
                label.push('|');
72
28.9k
                ix += 1;
73
28.9k
                only_white_space = false;
74
28.9k
                mark = ix;
75
28.9k
            }
76
1.83M
            b'\\' if is_in_table && bytes.get(ix + 1) == Some(&b'|') => {
77
16.0k
                // only way to reach this spot is to have `\|` (odd number of `\` before `|`)
78
16.0k
                label.push_str(&text[mark..ix]);
79
16.0k
                label.push('|');
80
16.0k
                ix += 2;
81
16.0k
                codepoints += 1;
82
16.0k
                only_white_space = false;
83
16.0k
                mark = ix;
84
16.0k
            }
85
1.82M
            b'\\' if is_ascii_punctuation(*bytes.get(ix + 1)?) => {
86
1.13M
                ix += 2;
87
1.13M
                codepoints += 2;
88
1.13M
                only_white_space = false;
89
1.13M
            }
90
119M
            b if is_ascii_whitespace(b) => {
91
7.54M
                // normalize labels by collapsing whitespaces, including linebreaks
92
7.54M
                let mut whitespaces = 0;
93
7.54M
                let mut linebreaks = 0;
94
7.54M
                let whitespace_start = ix;
95
96
17.2M
                while ix < bytes.len() && is_ascii_whitespace(bytes[ix]) {
97
10.4M
                    if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
98
4.01M
                        linebreaks += 1;
99
4.01M
                        if linebreaks > 1 {
100
12.5k
                            return None;
101
4.00M
                        }
102
4.00M
                        ix += eol_bytes;
103
4.00M
                        ix += linebreak_handler(&bytes[ix..])?;
104
3.28M
                        whitespaces += 2; // indicate that we need to replace
105
                    } else {
106
6.38M
                        whitespaces += if bytes[ix] == b' ' { 1 } else { 2 };
107
6.38M
                        ix += 1;
108
                    }
109
                }
110
6.82M
                if whitespaces > 1 {
111
5.93M
                    label.push_str(&text[mark..whitespace_start]);
112
5.93M
                    label.push(' ');
113
5.93M
                    mark = ix;
114
5.93M
                    codepoints += ix - whitespace_start;
115
5.93M
                } else {
116
884k
                    codepoints += 1;
117
884k
                }
118
            }
119
111M
            b => {
120
111M
                only_white_space = false;
121
111M
                ix += 1;
122
111M
                if b & 0b1000_0000 != 0 {
123
4.02M
                    codepoints += 1;
124
107M
                }
125
            }
126
        }
127
    }
128
129
15.3M
    if only_white_space {
130
2.80M
        None
131
    } else {
132
12.5M
        let cow = if mark == 0 {
133
9.90M
            let asciiws = &[' ', '\r', '\n', '\t'][..];
134
9.90M
            text[..ix].trim_matches(asciiws).into()
135
        } else {
136
2.67M
            label.push_str(&text[mark..ix]);
137
2.67M
            while matches!(
138
3.76M
                label.as_bytes().last(),
139
                Some(&b' ' | &b'\r' | &b'\n' | &b'\t')
140
1.09M
            ) {
141
1.09M
                label.pop();
142
1.09M
            }
143
2.67M
            while matches!(
144
3.48M
                label.as_bytes().first(),
145
                Some(&b' ' | &b'\r' | &b'\n' | &b'\t')
146
810k
            ) {
147
810k
                label.remove(0);
148
810k
            }
149
2.67M
            label.into()
150
        };
151
12.5M
        Some((ix + 1, cow))
152
    }
153
18.9M
}
154
155
#[cfg(test)]
mod test {
    use super::scan_link_label_rest;

    /// Runs of tabs inside the label come back as single regular spaces.
    #[test]
    fn whitespace_normalization() {
        let scanned =
            scan_link_label_rest("«\t\tBlurry Eyes\t\t»][blurry_eyes]", &|_| None, false);
        let (_consumed, label) = scanned.unwrap();

        // regular spaces!
        assert_eq!("« Blurry Eyes »", label.as_ref());
    }

    /// `\r\n` line endings inside the label are accepted when the line break
    /// handler lets scanning continue.
    #[test]
    fn return_carriage_linefeed_ok() {
        let scanned = scan_link_label_rest("hello\r\nworld\r\n]", &|_| Some(0), false);
        assert!(scanned.is_some());
    }
}