Coverage Report

Created: 2025-06-02 07:01

/rust/registry/src/index.crates.io-6f17d22bba15001f/pulldown-cmark-escape-0.10.1/src/lib.rs
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2015 Google Inc. All rights reserved.
2
//
3
// Permission is hereby granted, free of charge, to any person obtaining a copy
4
// of this software and associated documentation files (the "Software"), to deal
5
// in the Software without restriction, including without limitation the rights
6
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
// copies of the Software, and to permit persons to whom the Software is
8
// furnished to do so, subject to the following conditions:
9
//
10
// The above copyright notice and this permission notice shall be included in
11
// all copies or substantial portions of the Software.
12
//
13
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
// THE SOFTWARE.
20
21
//! Utility functions for HTML escaping. Only useful when building your own
22
//! HTML renderer.
23
24
use std::fmt::{Arguments, Write as FmtWrite};
25
use std::io::{self, ErrorKind, Write};
26
use std::str::from_utf8;
27
28
#[rustfmt::skip]
29
static HREF_SAFE: [u8; 128] = [
30
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32
    0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
33
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
34
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
35
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
36
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
38
];
39
40
static HEX_CHARS: &[u8] = b"0123456789ABCDEF";
41
static AMP_ESCAPE: &str = "&amp;";
42
static SINGLE_QUOTE_ESCAPE: &str = "&#x27;";
43
44
/// This wrapper exists because we can't have both a blanket implementation
45
/// for all types implementing `Write` and types of the form `&mut W` where
46
/// `W: StrWrite`. Since we need the latter a lot, we choose to wrap
47
/// `Write` types.
48
#[derive(Debug)]
49
pub struct WriteWrapper<W>(pub W);
50
51
/// Trait that allows writing string slices. This is basically an extension
52
/// of `std::io::Write` in order to include `String`.
53
pub trait StrWrite {
54
    fn write_str(&mut self, s: &str) -> io::Result<()>;
55
56
    fn write_fmt(&mut self, args: Arguments) -> io::Result<()>;
57
}
58
59
impl<W> StrWrite for WriteWrapper<W>
60
where
61
    W: Write,
62
{
63
    #[inline]
64
0
    fn write_str(&mut self, s: &str) -> io::Result<()> {
65
0
        self.0.write_all(s.as_bytes())
66
0
    }
67
68
    #[inline]
69
0
    fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
70
0
        self.0.write_fmt(args)
71
0
    }
72
}
73
74
impl StrWrite for String {
75
    #[inline]
76
0
    fn write_str(&mut self, s: &str) -> io::Result<()> {
77
0
        self.push_str(s);
78
0
        Ok(())
79
0
    }
Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_str
Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_str
80
81
    #[inline]
82
0
    fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
83
0
        // FIXME: translate fmt error to io error?
84
0
        FmtWrite::write_fmt(self, args).map_err(|_| ErrorKind::Other.into())
Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt::{closure#0}
Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt::{closure#0}
85
0
    }
Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt
Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt
86
}
87
88
impl<W> StrWrite for &'_ mut W
89
where
90
    W: StrWrite,
91
{
92
    #[inline]
93
0
    fn write_str(&mut self, s: &str) -> io::Result<()> {
94
0
        (**self).write_str(s)
95
0
    }
Unexecuted instantiation: <&mut alloc::string::String as pulldown_cmark_escape::StrWrite>::write_str
Unexecuted instantiation: <&mut &mut alloc::string::String as pulldown_cmark_escape::StrWrite>::write_str
Unexecuted instantiation: <&mut _ as pulldown_cmark_escape::StrWrite>::write_str
96
97
    #[inline]
98
0
    fn write_fmt(&mut self, args: Arguments) -> io::Result<()> {
99
0
        (**self).write_fmt(args)
100
0
    }
Unexecuted instantiation: <&mut alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt
Unexecuted instantiation: <&mut _ as pulldown_cmark_escape::StrWrite>::write_fmt
101
}
102
103
/// Writes an href to the buffer, escaping href unsafe bytes.
104
0
pub fn escape_href<W>(mut w: W, s: &str) -> io::Result<()>
105
0
where
106
0
    W: StrWrite,
107
0
{
108
0
    let bytes = s.as_bytes();
109
0
    let mut mark = 0;
110
0
    for i in 0..bytes.len() {
111
0
        let c = bytes[i];
112
0
        if c >= 0x80 || HREF_SAFE[c as usize] == 0 {
113
            // character needing escape
114
115
            // write partial substring up to mark
116
0
            if mark < i {
117
0
                w.write_str(&s[mark..i])?;
118
0
            }
119
0
            match c {
120
                b'&' => {
121
0
                    w.write_str(AMP_ESCAPE)?;
122
                }
123
                b'\'' => {
124
0
                    w.write_str(SINGLE_QUOTE_ESCAPE)?;
125
                }
126
                _ => {
127
0
                    let mut buf = [0u8; 3];
128
0
                    buf[0] = b'%';
129
0
                    buf[1] = HEX_CHARS[((c as usize) >> 4) & 0xF];
130
0
                    buf[2] = HEX_CHARS[(c as usize) & 0xF];
131
0
                    let escaped = from_utf8(&buf).unwrap();
132
0
                    w.write_str(escaped)?;
133
                }
134
            }
135
0
            mark = i + 1; // all escaped characters are ASCII
136
0
        }
137
    }
138
0
    w.write_str(&s[mark..])
139
0
}
Unexecuted instantiation: pulldown_cmark_escape::escape_href::<&mut &mut alloc::string::String>
Unexecuted instantiation: pulldown_cmark_escape::escape_href::<_>
140
141
0
const fn create_html_escape_table(body: bool) -> [u8; 256] {
142
0
    let mut table = [0; 256];
143
0
    table[b'&' as usize] = 1;
144
0
    table[b'<' as usize] = 2;
145
0
    table[b'>' as usize] = 3;
146
0
    if !body {
147
0
        table[b'"' as usize] = 4;
148
0
        table[b'\'' as usize] = 5;
149
0
    }
150
0
    table
151
0
}
152
153
static HTML_ESCAPE_TABLE: [u8; 256] = create_html_escape_table(false);
154
static HTML_BODY_TEXT_ESCAPE_TABLE: [u8; 256] = create_html_escape_table(true);
155
156
static HTML_ESCAPES: [&str; 6] = ["", "&amp;", "&lt;", "&gt;", "&quot;", "&#39;"];
157
158
/// Writes the given string to the Write sink, replacing special HTML bytes
159
/// (<, >, &, ", ') by escape sequences.
160
///
161
/// Use this function to write output to quoted HTML attributes.
162
/// Since this function doesn't escape spaces, unquoted attributes
163
/// cannot be used. For example:
164
///
165
/// ```rust
166
/// let mut value = String::new();
167
/// pulldown_cmark_escape::escape_html(&mut value, "two words")
168
///     .expect("writing to a string is infallible");
169
/// // This is okay.
170
/// let ok = format!("<a title='{value}'>test</a>");
171
/// // This is not okay.
172
/// //let not_ok = format!("<a title={value}>test</a>");
173
/// ````
174
0
pub fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> {
175
0
    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
176
0
    {
177
0
        simd::escape_html(w, s, &HTML_ESCAPE_TABLE)
178
0
    }
179
0
    #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
180
0
    {
181
0
        escape_html_scalar(w, s, &HTML_ESCAPE_TABLE)
182
0
    }
183
0
}
Unexecuted instantiation: pulldown_cmark_escape::escape_html::<&mut &mut alloc::string::String>
Unexecuted instantiation: pulldown_cmark_escape::escape_html::<_>
184
185
/// For use in HTML body text, writes the given string to the Write sink,
186
/// replacing special HTML bytes (<, >, &) by escape sequences.
187
///
188
/// <div class="warning">
189
///
190
/// This function should be used for escaping text nodes, not attributes.
191
/// In the example below, the word "foo" is an attribute, and the word
192
/// "bar" is a text node. The word "bar" could be escaped by this function,
193
/// but the word "foo" must be escaped using [`escape_html`].
194
///
195
/// ```html
196
/// <span class="foo">bar</span>
197
/// ```
198
///
199
/// If you aren't sure what the difference is, use [`escape_html`].
200
/// It should always be correct, but will produce larger output.
201
///
202
/// </div>
203
0
pub fn escape_html_body_text<W: StrWrite>(w: W, s: &str) -> io::Result<()> {
204
0
    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
205
0
    {
206
0
        simd::escape_html(w, s, &HTML_BODY_TEXT_ESCAPE_TABLE)
207
0
    }
208
0
    #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
209
0
    {
210
0
        escape_html_scalar(w, s, &HTML_BODY_TEXT_ESCAPE_TABLE)
211
0
    }
212
0
}
Unexecuted instantiation: pulldown_cmark_escape::escape_html_body_text::<&mut &mut alloc::string::String>
Unexecuted instantiation: pulldown_cmark_escape::escape_html_body_text::<_>
213
214
0
fn escape_html_scalar<W: StrWrite>(mut w: W, s: &str, table: &'static [u8; 256]) -> io::Result<()> {
215
0
    let bytes = s.as_bytes();
216
0
    let mut mark = 0;
217
0
    let mut i = 0;
218
0
    while i < s.len() {
219
0
        match bytes[i..].iter().position(|&c| table[c as usize] != 0) {
Unexecuted instantiation: pulldown_cmark_escape::escape_html_scalar::<&mut &mut alloc::string::String>::{closure#0}
Unexecuted instantiation: pulldown_cmark_escape::escape_html_scalar::<_>::{closure#0}
220
0
            Some(pos) => {
221
0
                i += pos;
222
0
            }
223
0
            None => break,
224
        }
225
0
        let c = bytes[i];
226
0
        let escape = table[c as usize];
227
0
        let escape_seq = HTML_ESCAPES[escape as usize];
228
0
        w.write_str(&s[mark..i])?;
229
0
        w.write_str(escape_seq)?;
230
0
        i += 1;
231
0
        mark = i; // all escaped characters are ASCII
232
    }
233
0
    w.write_str(&s[mark..])
234
0
}
Unexecuted instantiation: pulldown_cmark_escape::escape_html_scalar::<&mut &mut alloc::string::String>
Unexecuted instantiation: pulldown_cmark_escape::escape_html_scalar::<_>
235
236
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
237
mod simd {
238
    use super::StrWrite;
239
    use std::arch::x86_64::*;
240
    use std::io;
241
    use std::mem::size_of;
242
243
    const VECTOR_SIZE: usize = size_of::<__m128i>();
244
245
    pub(super) fn escape_html<W: StrWrite>(
246
        mut w: W,
247
        s: &str,
248
        table: &'static [u8; 256],
249
    ) -> io::Result<()> {
250
        // The SIMD accelerated code uses the PSHUFB instruction, which is part
251
        // of the SSSE3 instruction set. Further, we can only use this code if
252
        // the buffer is at least one VECTOR_SIZE in length to prevent reading
253
        // out of bounds. If either of these conditions is not met, we fall back
254
        // to scalar code.
255
        if is_x86_feature_detected!("ssse3") && s.len() >= VECTOR_SIZE {
256
            let bytes = s.as_bytes();
257
            let mut mark = 0;
258
259
            unsafe {
260
                foreach_special_simd(bytes, 0, |i| {
261
                    let escape_ix = *bytes.get_unchecked(i) as usize;
262
                    let entry = table[escape_ix] as usize;
263
                    w.write_str(s.get_unchecked(mark..i))?;
264
                    mark = i + 1; // all escaped characters are ASCII
265
                    if entry == 0 {
266
                        w.write_str(s.get_unchecked(i..mark))
267
                    } else {
268
                        let replacement = super::HTML_ESCAPES[entry];
269
                        w.write_str(replacement)
270
                    }
271
                })?;
272
                w.write_str(s.get_unchecked(mark..))
273
            }
274
        } else {
275
            super::escape_html_scalar(w, s, table)
276
        }
277
    }
278
279
    /// Creates the lookup table for use in `compute_mask`.
280
    const fn create_lookup() -> [u8; 16] {
281
        let mut table = [0; 16];
282
        table[(b'<' & 0x0f) as usize] = b'<';
283
        table[(b'>' & 0x0f) as usize] = b'>';
284
        table[(b'&' & 0x0f) as usize] = b'&';
285
        table[(b'"' & 0x0f) as usize] = b'"';
286
        table[(b'\'' & 0x0f) as usize] = b'\'';
287
        table[0] = 0b0111_1111;
288
        table
289
    }
290
291
    #[target_feature(enable = "ssse3")]
292
    /// Computes a byte mask at given offset in the byte buffer. Its first 16 (least significant)
293
    /// bits correspond to whether there is an HTML special byte (&, <, >, ", ') at the 16 bytes
294
    /// `bytes[offset..]`. For example, the mask `(1 << 3)` states that there is an HTML byte
295
    /// at `offset + 3`. It is only safe to call this function when
296
    /// `bytes.len() >= offset + VECTOR_SIZE`.
297
    unsafe fn compute_mask(bytes: &[u8], offset: usize) -> i32 {
298
        debug_assert!(bytes.len() >= offset + VECTOR_SIZE);
299
300
        let table = create_lookup();
301
        let lookup = _mm_loadu_si128(table.as_ptr() as *const __m128i);
302
        let raw_ptr = bytes.as_ptr().add(offset) as *const __m128i;
303
304
        // Load the vector from memory.
305
        let vector = _mm_loadu_si128(raw_ptr);
306
        // We take the least significant 4 bits of every byte and use them as indices
307
        // to map into the lookup vector.
308
        // Note that shuffle maps bytes with their most significant bit set to 0.
309
        // Bytes that share their lower nibble with an HTML special byte get mapped to that
310
        // corresponding special byte. Note that all HTML special bytes have distinct lower
311
        // nibbles. Other bytes either get mapped to 0 or 127.
312
        let expected = _mm_shuffle_epi8(lookup, vector);
313
        // We compare the original vector to the mapped output. Bytes that shared a lower
314
        // nibble with an HTML special byte match *only* if they are that special byte. Bytes
315
        // with their most significant bit set were mapped to 0 and, being non-zero, never
316
        // match. Bytes with a 0 lower nibble were mapped to 127 and never match either. All
317
        // other bytes have non-zero lower nibbles but were mapped to 0 and also do not match.
318
        let matches = _mm_cmpeq_epi8(expected, vector);
319
320
        // Translate matches to a bitmask, where every 1 corresponds to a HTML special character
321
        // and a 0 is a non-HTML byte.
322
        _mm_movemask_epi8(matches)
323
    }
324
325
    /// Calls the given function with the index of every byte in the given byteslice
326
    /// that is either ", ', &, <, or > and for no other byte.
327
    /// Make sure to only call this when `bytes.len() >= 16`, undefined behaviour may
328
    /// occur otherwise.
329
    #[target_feature(enable = "ssse3")]
330
    unsafe fn foreach_special_simd<F>(
331
        bytes: &[u8],
332
        mut offset: usize,
333
        mut callback: F,
334
    ) -> io::Result<()>
335
    where
336
        F: FnMut(usize) -> io::Result<()>,
337
    {
338
        // The strategy here is to walk the byte buffer in chunks of VECTOR_SIZE (16)
339
        // bytes at a time starting at the given offset. For each chunk, we compute a
340
        // bitmask indicating whether the corresponding byte is an HTML special byte.
341
        // We then iterate over all the 1 bits in this mask and call the callback function
342
        // with the corresponding index in the buffer.
343
        // When the number of HTML special bytes in the buffer is relatively low, this
344
        // allows us to quickly go through the buffer without a lookup for every
345
        // single byte.
346
347
        debug_assert!(bytes.len() >= VECTOR_SIZE);
348
        let upperbound = bytes.len() - VECTOR_SIZE;
349
        while offset < upperbound {
350
            let mut mask = compute_mask(bytes, offset);
351
            while mask != 0 {
352
                let ix = mask.trailing_zeros();
353
                callback(offset + ix as usize)?;
354
                mask ^= mask & -mask;
355
            }
356
            offset += VECTOR_SIZE;
357
        }
358
359
        // Final iteration. We align the read with the end of the slice and
360
        // shift off the bytes at start we have already scanned.
361
        let mut mask = compute_mask(bytes, upperbound);
362
        mask >>= offset - upperbound;
363
        while mask != 0 {
364
            let ix = mask.trailing_zeros();
365
            callback(offset + ix as usize)?;
366
            mask ^= mask & -mask;
367
        }
368
        Ok(())
369
    }
370
371
    #[cfg(test)]
372
    mod html_scan_tests {
373
        #[test]
374
        fn multichunk() {
375
            let mut vec = Vec::new();
376
            unsafe {
377
                super::foreach_special_simd("&aXaaaa.a'aa9a<>aab&".as_bytes(), 0, |ix| {
378
                    #[allow(clippy::unit_arg)]
379
                    Ok(vec.push(ix))
380
                })
381
                .unwrap();
382
            }
383
            assert_eq!(vec, vec![0, 9, 14, 15, 19]);
384
        }
385
386
        // only match these bytes, and when we match them, match them VECTOR_SIZE times
387
        #[test]
388
        fn only_right_bytes_matched() {
389
            for b in 0..255u8 {
390
                let right_byte = b == b'&' || b == b'<' || b == b'>' || b == b'"' || b == b'\'';
391
                let vek = vec![b; super::VECTOR_SIZE];
392
                let mut match_count = 0;
393
                unsafe {
394
                    super::foreach_special_simd(&vek, 0, |_| {
395
                        match_count += 1;
396
                        Ok(())
397
                    })
398
                    .unwrap();
399
                }
400
                assert!((match_count > 0) == (match_count == super::VECTOR_SIZE));
401
                assert_eq!(
402
                    (match_count == super::VECTOR_SIZE),
403
                    right_byte,
404
                    "match_count: {}, byte: {:?}",
405
                    match_count,
406
                    b as char
407
                );
408
            }
409
        }
410
    }
411
}
412
413
#[cfg(test)]
414
mod test {
415
    pub use super::{escape_href, escape_html, escape_html_body_text};
416
417
    #[test]
418
    fn check_href_escape() {
419
        let mut s = String::new();
420
        escape_href(&mut s, "&^_").unwrap();
421
        assert_eq!(s.as_str(), "&amp;^_");
422
    }
423
424
    #[test]
425
    fn check_attr_escape() {
426
        let mut s = String::new();
427
        escape_html(&mut s, r##"&^"'_"##).unwrap();
428
        assert_eq!(s.as_str(), "&amp;^&quot;&#39;_");
429
    }
430
431
    #[test]
432
    fn check_body_escape() {
433
        let mut s = String::new();
434
        escape_html_body_text(&mut s, r##"&^"'_"##).unwrap();
435
        assert_eq!(s.as_str(), r##"&amp;^"'_"##);
436
    }
437
}