/rust/registry/src/index.crates.io-6f17d22bba15001f/pulldown-cmark-0.13.0/src/linklabel.rs

Source
// Copyright 2018 Google LLC
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//! Link label parsing and matching.

use unicase::UniCase;

use crate::scanners::{is_ascii_punctuation, is_ascii_whitespace, scan_eol};
use crate::strings::CowStr;

/// A reference label, tagged with the kind of reference it was written as.
///
/// Both variants carry the label text as a `CowStr`.
#[derive(Debug)]
pub(crate) enum ReferenceLabel<'a> {
    /// Label belonging to a link reference.
    Link(CowStr<'a>),
    /// Label belonging to a footnote reference.
    Footnote(CowStr<'a>),
}

/// Link label text wrapped in `UniCase`, so that two labels differing only in
/// letter case compare (and hash) as equal.
pub(crate) type LinkLabel<'a> = UniCase<CowStr<'a>>;

/// Footnote label text; uses the same `UniCase<CowStr>` representation as
/// `LinkLabel`.
pub(crate) type FootnoteLabel<'a> = UniCase<CowStr<'a>>;

/// Scans the remainder of a link label and returns its normalized text.
///
/// Assumes the opening bracket has already been scanned, so `text` starts
/// with the first byte after `[`.
///
/// The line break handler determines what happens when a linebreak
/// is found. It is passed the bytes following the line break and
/// either returns `Some(k)`, where `k` is the number of bytes to skip,
/// or `None` to abort parsing the label.
///
/// When `is_in_table` is set, an escaped pipe (`\|`) inside the label is
/// rewritten to a bare `|` in the returned label.
///
/// Returns the number of bytes read (including closing bracket) and label on
/// success. In the returned label every whitespace run has been collapsed to
/// a single space and whitespace at both ends has been removed.
///
/// Returns `None` when:
/// * the text ends before an unescaped `]` is found,
/// * an unescaped `[` is encountered,
/// * a single whitespace run contains more than one line break,
/// * the line break handler aborts,
/// * the label consists of whitespace only, or
/// * the label reaches the 1000-character budget (CommonMark allows at most
///   999 characters between the brackets).
pub(crate) fn scan_link_label_rest<'t>(
    text: &'t str,
    linebreak_handler: &dyn Fn(&[u8]) -> Option<usize>,
    is_in_table: bool,
) -> Option<(usize, CowStr<'t>)> {
    // Characters stripped from both ends of the finished label.
    const ASCII_WS: &[char] = &[' ', '\r', '\n', '\t'];

    let bytes = text.as_bytes();
    let mut ix = 0;
    let mut only_white_space = true;
    let mut codepoints = 0;
    // no worries, doesn't allocate until we push things onto it
    let mut label = String::new();
    // Start of the part of `text` that has not been copied into `label` yet.
    // Stays at 0 for as long as the label can be borrowed straight from `text`.
    let mut mark = 0;

    loop {
        if codepoints >= 1000 {
            return None;
        }
        match *bytes.get(ix)? {
            b'[' => return None,
            b']' => break,
            // Backslash escapes in link references are normally untouched, but
            // tables are an exception, because they're parsed as-if the tables
            // were parsed in a discrete pass, changing `\|` to `|`, and then
            // passing the changed string to the inline parser.
            b'|' if is_in_table && ix != 0 && bytes.get(ix - 1) == Some(&b'\\') => {
                // only way to reach this spot is to have `\\|` (even number of `\` before `|`)
                // The two backslashes were already counted by the escape arm
                // below, which matches the two characters (`\|`) emitted here.
                label.push_str(&text[mark..ix - 1]);
                label.push('|');
                ix += 1;
                only_white_space = false;
                mark = ix;
            }
            b'\\' if is_in_table && bytes.get(ix + 1) == Some(&b'|') => {
                // only way to reach this spot is to have `\|` (odd number of `\` before `|`)
                label.push_str(&text[mark..ix]);
                label.push('|');
                ix += 2;
                codepoints += 1;
                only_white_space = false;
                mark = ix;
            }
            // Note the `?`: a backslash as the very last byte means the label
            // can never be closed, so scanning stops right away.
            b'\\' if is_ascii_punctuation(*bytes.get(ix + 1)?) => {
                ix += 2;
                codepoints += 2;
                only_white_space = false;
            }
            b if is_ascii_whitespace(b) => {
                // normalize labels by collapsing whitespaces, including linebreaks
                let mut whitespaces = 0;
                let mut linebreaks = 0;
                let whitespace_start = ix;

                while ix < bytes.len() && is_ascii_whitespace(bytes[ix]) {
                    if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
                        linebreaks += 1;
                        if linebreaks > 1 {
                            return None;
                        }
                        ix += eol_bytes;
                        ix += linebreak_handler(&bytes[ix..])?;
                        whitespaces += 2; // indicate that we need to replace
                    } else {
                        // A lone regular space can stay as it is; anything
                        // else has to be replaced by a regular space.
                        whitespaces += if bytes[ix] == b' ' { 1 } else { 2 };
                        ix += 1;
                    }
                }
                if whitespaces > 1 {
                    label.push_str(&text[mark..whitespace_start]);
                    label.push(' ');
                    mark = ix;
                    codepoints += ix - whitespace_start;
                } else {
                    codepoints += 1;
                }
            }
            b => {
                only_white_space = false;
                ix += 1;
                // Count every code point exactly once: each byte that is not a
                // UTF-8 continuation byte (`0b10xx_xxxx`) starts a new one.
                // This covers ASCII as well as the lead byte of multi-byte
                // sequences.
                if b & 0b1100_0000 != 0b1000_0000 {
                    codepoints += 1;
                }
            }
        }
    }

    if only_white_space {
        return None;
    }

    let cow = if mark == 0 {
        // Nothing had to be rewritten, so the label borrows from `text`.
        text[..ix].trim_matches(ASCII_WS).into()
    } else {
        label.push_str(&text[mark..ix]);
        // Trim in place. Both cut points sit next to ASCII bytes, so they are
        // always valid char boundaries.
        let kept = label.trim_end_matches(ASCII_WS).len();
        label.truncate(kept);
        let leading = label.len() - label.trim_start_matches(ASCII_WS).len();
        label.replace_range(..leading, "");
        label.into()
    };
    Some((ix + 1, cow))
}

#[cfg(test)]
mod test {
    use super::scan_link_label_rest;

    /// Tab runs inside the label must come back as single regular spaces.
    #[test]
    fn whitespace_normalization() {
        // No line break is expected, so the handler simply aborts.
        let abort_on_linebreak = |_: &[u8]| -> Option<usize> { None };

        let scanned = scan_link_label_rest(
            "«\t\tBlurry Eyes\t\t»][blurry_eyes]",
            &abort_on_linebreak,
            false,
        );
        let (_, label) = scanned.unwrap();

        assert_eq!("« Blurry Eyes »", label.as_ref());
    }

    /// A CRLF pair counts as one line break, so the label is still accepted.
    #[test]
    fn return_carriage_linefeed_ok() {
        // Accept every line break without skipping any following bytes.
        let skip_nothing = |_: &[u8]| -> Option<usize> { Some(0) };

        let scanned = scan_link_label_rest("hello\r\nworld\r\n]", &skip_nothing, false);

        assert!(scanned.is_some());
    }
}

Line	Count	Source
1		// Copyright 2018 Google LLC
2		//
3		// Permission is hereby granted, free of charge, to any person obtaining a copy
4		// of this software and associated documentation files (the "Software"), to deal
5		// in the Software without restriction, including without limitation the rights
6		// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7		// copies of the Software, and to permit persons to whom the Software is
8		// furnished to do so, subject to the following conditions:
9		//
10		// The above copyright notice and this permission notice shall be included in
11		// all copies or substantial portions of the Software.
12		//
13		// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14		// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15		// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16		// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17		// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18		// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19		// THE SOFTWARE.
20
21		//! Link label parsing and matching.
22
23		use unicase::UniCase;
24
25		use crate::scanners::{is_ascii_punctuation, is_ascii_whitespace, scan_eol};
26		use crate::strings::CowStr;
27
28		#[derive(Debug)]
29		pub(crate) enum ReferenceLabel<'a> {
30		Link(CowStr<'a>),
31		Footnote(CowStr<'a>),
32		}
33
34		pub(crate) type LinkLabel<'a> = UniCase<CowStr<'a>>;
35
36		pub(crate) type FootnoteLabel<'a> = UniCase<CowStr<'a>>;
37
38		/// Assumes the opening bracket has already been scanned.
39		/// The line break handler determines what happens when a linebreak
40		/// is found. It is passed the bytes following the line break and
41		/// either returns `Some(k)`, where `k` is the number of bytes to skip,
42		/// or `None` to abort parsing the label.
43		/// Returns the number of bytes read (including closing bracket) and label on success.
44	18.9M	pub(crate) fn scan_link_label_rest<'t>(
45	18.9M	text: &'t str,
46	18.9M	linebreak_handler: &dyn Fn(&[u8]) -> Option<usize>,
47	18.9M	is_in_table: bool,
48	18.9M	) -> Option<(usize, CowStr<'t>)> {
49	18.9M	let bytes = text.as_bytes();
50	18.9M	let mut ix = 0;
51	18.9M	let mut only_white_space = true;
52	18.9M	let mut codepoints = 0;
53	18.9M	// no worries, doesn't allocate until we push things onto it
54	18.9M	let mut label = String::new();
55	18.9M	let mut mark = 0;
56
57		loop {
58	138M	if codepoints >= 1000 {
59	551	return None;
60	138M	}
61	138M	match *bytes.get(ix)? {
62	2.54M	b'[' => return None,
63	15.3M	b']' => break,
64		// Backslash escapes in link references are normally untouched, but
65		// tables are an exception, because they're parsed as-if the tables
66		// were parsed in a discrete pass, changing `\\|` to `\|`, and then
67		// passing the changed string to the inline parser.
68	799k	b'\|' if is_in_table && ix != 0 && bytes.get(ix - 1) == Some(&b'\\') => {
69	28.9k	// only way to reach this spot is to have `\\\|` (even number of `\` before `\|`)
70	28.9k	label.push_str(&text[mark..ix - 1]);
71	28.9k	label.push('\|');
72	28.9k	ix += 1;
73	28.9k	only_white_space = false;
74	28.9k	mark = ix;
75	28.9k	}
76	1.83M	b'\\' if is_in_table && bytes.get(ix + 1) == Some(&b'\|') => {
77	16.0k	// only way to reach this spot is to have `\\|` (odd number of `\` before `\|`)
78	16.0k	label.push_str(&text[mark..ix]);
79	16.0k	label.push('\|');
80	16.0k	ix += 2;
81	16.0k	codepoints += 1;
82	16.0k	only_white_space = false;
83	16.0k	mark = ix;
84	16.0k	}
85	1.82M	b'\\' if is_ascii_punctuation(*bytes.get(ix + 1)?) => {
86	1.13M	ix += 2;
87	1.13M	codepoints += 2;
88	1.13M	only_white_space = false;
89	1.13M	}
90	119M	b if is_ascii_whitespace(b) => {
91	7.54M	// normalize labels by collapsing whitespaces, including linebreaks
92	7.54M	let mut whitespaces = 0;
93	7.54M	let mut linebreaks = 0;
94	7.54M	let whitespace_start = ix;
95
96	17.2M	while ix < bytes.len() && is_ascii_whitespace(bytes[ix]) {
97	10.4M	if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
98	4.01M	linebreaks += 1;
99	4.01M	if linebreaks > 1 {
100	12.5k	return None;
101	4.00M	}
102	4.00M	ix += eol_bytes;
103	4.00M	ix += linebreak_handler(&bytes[ix..])?;
104	3.28M	whitespaces += 2; // indicate that we need to replace
105		} else {
106	6.38M	whitespaces += if bytes[ix] == b' ' { 1 } else { 2 };
107	6.38M	ix += 1;
108		}
109		}
110	6.82M	if whitespaces > 1 {
111	5.93M	label.push_str(&text[mark..whitespace_start]);
112	5.93M	label.push(' ');
113	5.93M	mark = ix;
114	5.93M	codepoints += ix - whitespace_start;
115	5.93M	} else {
116	884k	codepoints += 1;
117	884k	}
118		}
119	111M	b => {
120	111M	only_white_space = false;
121	111M	ix += 1;
122	111M	if b & 0b1000_0000 != 0 {
123	4.02M	codepoints += 1;
124	107M	}
125		}
126		}
127		}
128
129	15.3M	if only_white_space {
130	2.80M	None
131		} else {
132	12.5M	let cow = if mark == 0 {
133	9.90M	let asciiws = &[' ', '\r', '\n', '\t'][..];
134	9.90M	text[..ix].trim_matches(asciiws).into()
135		} else {
136	2.67M	label.push_str(&text[mark..ix]);
137	2.67M	while matches!(
138	3.76M	label.as_bytes().last(),
139		Some(&b' ' \| &b'\r' \| &b'\n' \| &b'\t')
140	1.09M	) {
141	1.09M	label.pop();
142	1.09M	}
143	2.67M	while matches!(
144	3.48M	label.as_bytes().first(),
145		Some(&b' ' \| &b'\r' \| &b'\n' \| &b'\t')
146	810k	) {
147	810k	label.remove(0);
148	810k	}
149	2.67M	label.into()
150		};
151	12.5M	Some((ix + 1, cow))
152		}
153	18.9M	}
154
155		#[cfg(test)]
156		mod test {
157		use super::scan_link_label_rest;
158
159		#[test]
160		fn whitespace_normalization() {
161		let input = "«\t\tBlurry Eyes\t\t»][blurry_eyes]";
162		let expected_output = "« Blurry Eyes »"; // regular spaces!
163
164		let (_bytes, normalized_label) = scan_link_label_rest(input, &\|_\| None, false).unwrap();
165		assert_eq!(expected_output, normalized_label.as_ref());
166		}
167
168		#[test]
169		fn return_carriage_linefeed_ok() {
170		let input = "hello\r\nworld\r\n]";
171		assert!(scan_link_label_rest(input, &\|_\| Some(0), false).is_some());
172		}
173		}

Coverage Report

Created: 2025-06-02 07:01