/rust/registry/src/index.crates.io-6f17d22bba15001f/pulldown-cmark-escape-0.10.1/src/lib.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2015 Google Inc. All rights reserved. |
2 | | // |
3 | | // Permission is hereby granted, free of charge, to any person obtaining a copy |
4 | | // of this software and associated documentation files (the "Software"), to deal |
5 | | // in the Software without restriction, including without limitation the rights |
6 | | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
7 | | // copies of the Software, and to permit persons to whom the Software is |
8 | | // furnished to do so, subject to the following conditions: |
9 | | // |
10 | | // The above copyright notice and this permission notice shall be included in |
11 | | // all copies or substantial portions of the Software. |
12 | | // |
13 | | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
14 | | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
15 | | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
16 | | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
17 | | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
18 | | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
19 | | // THE SOFTWARE. |
20 | | |
21 | | //! Utility functions for HTML escaping. Only useful when building your own |
22 | | //! HTML renderer. |
23 | | |
24 | | use std::fmt::{Arguments, Write as FmtWrite}; |
25 | | use std::io::{self, ErrorKind, Write}; |
26 | | use std::str::from_utf8; |
27 | | |
/// Lookup table indexed by ASCII byte value: `1` means the byte may be copied
/// into an href verbatim, `0` means it has to be escaped.
#[rustfmt::skip]
static HREF_SAFE: [u8; 128] = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
];
39 | | |
/// Uppercase hexadecimal digits used to build `%XX` percent-escapes.
static HEX_CHARS: &[u8] = b"0123456789ABCDEF";
/// HTML entity written in place of `&` inside an href.
static AMP_ESCAPE: &str = "&amp;";
/// HTML entity written in place of `'` inside an href.
static SINGLE_QUOTE_ESCAPE: &str = "&#x27;";
43 | | |
/// This wrapper exists because we can't have both a blanket implementation
/// for all types implementing `Write` and types of the form `&mut W` where
/// `W: StrWrite`. Since we need the latter a lot, we choose to wrap
/// `Write` types.
#[derive(Debug)]
pub struct WriteWrapper<W>(pub W);
50 | | |
/// Trait that allows writing string slices. This is basically an extension
/// of `std::io::Write` in order to include `String`.
pub trait StrWrite {
    /// Writes the whole string slice `s` to the sink.
    fn write_str(&mut self, s: &str) -> io::Result<()>;

    /// Writes pre-compiled format arguments to the sink. Having this method
    /// lets `write!` be used directly on a `StrWrite` value.
    fn write_fmt(&mut self, args: Arguments) -> io::Result<()>;
}
58 | | |
59 | | impl<W> StrWrite for WriteWrapper<W> |
60 | | where |
61 | | W: Write, |
62 | | { |
63 | | #[inline] |
64 | 0 | fn write_str(&mut self, s: &str) -> io::Result<()> { |
65 | 0 | self.0.write_all(s.as_bytes()) |
66 | 0 | } |
67 | | |
68 | | #[inline] |
69 | 0 | fn write_fmt(&mut self, args: Arguments) -> io::Result<()> { |
70 | 0 | self.0.write_fmt(args) |
71 | 0 | } |
72 | | } |
73 | | |
74 | | impl StrWrite for String { |
75 | | #[inline] |
76 | 0 | fn write_str(&mut self, s: &str) -> io::Result<()> { |
77 | 0 | self.push_str(s); |
78 | 0 | Ok(()) |
79 | 0 | } Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_str Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_str |
80 | | |
81 | | #[inline] |
82 | 0 | fn write_fmt(&mut self, args: Arguments) -> io::Result<()> { |
83 | 0 | // FIXME: translate fmt error to io error? |
84 | 0 | FmtWrite::write_fmt(self, args).map_err(|_| ErrorKind::Other.into()) Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt::{closure#0} Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt::{closure#0} |
85 | 0 | } Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt Unexecuted instantiation: <alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt |
86 | | } |
87 | | |
88 | | impl<W> StrWrite for &'_ mut W |
89 | | where |
90 | | W: StrWrite, |
91 | | { |
92 | | #[inline] |
93 | 0 | fn write_str(&mut self, s: &str) -> io::Result<()> { |
94 | 0 | (**self).write_str(s) |
95 | 0 | } Unexecuted instantiation: <&mut alloc::string::String as pulldown_cmark_escape::StrWrite>::write_str Unexecuted instantiation: <&mut &mut alloc::string::String as pulldown_cmark_escape::StrWrite>::write_str Unexecuted instantiation: <&mut _ as pulldown_cmark_escape::StrWrite>::write_str |
96 | | |
97 | | #[inline] |
98 | 0 | fn write_fmt(&mut self, args: Arguments) -> io::Result<()> { |
99 | 0 | (**self).write_fmt(args) |
100 | 0 | } Unexecuted instantiation: <&mut alloc::string::String as pulldown_cmark_escape::StrWrite>::write_fmt Unexecuted instantiation: <&mut _ as pulldown_cmark_escape::StrWrite>::write_fmt |
101 | | } |
102 | | |
103 | | /// Writes an href to the buffer, escaping href unsafe bytes. |
104 | 0 | pub fn escape_href<W>(mut w: W, s: &str) -> io::Result<()> |
105 | 0 | where |
106 | 0 | W: StrWrite, |
107 | 0 | { |
108 | 0 | let bytes = s.as_bytes(); |
109 | 0 | let mut mark = 0; |
110 | 0 | for i in 0..bytes.len() { |
111 | 0 | let c = bytes[i]; |
112 | 0 | if c >= 0x80 || HREF_SAFE[c as usize] == 0 { |
113 | | // character needing escape |
114 | | |
115 | | // write partial substring up to mark |
116 | 0 | if mark < i { |
117 | 0 | w.write_str(&s[mark..i])?; |
118 | 0 | } |
119 | 0 | match c { |
120 | | b'&' => { |
121 | 0 | w.write_str(AMP_ESCAPE)?; |
122 | | } |
123 | | b'\'' => { |
124 | 0 | w.write_str(SINGLE_QUOTE_ESCAPE)?; |
125 | | } |
126 | | _ => { |
127 | 0 | let mut buf = [0u8; 3]; |
128 | 0 | buf[0] = b'%'; |
129 | 0 | buf[1] = HEX_CHARS[((c as usize) >> 4) & 0xF]; |
130 | 0 | buf[2] = HEX_CHARS[(c as usize) & 0xF]; |
131 | 0 | let escaped = from_utf8(&buf).unwrap(); |
132 | 0 | w.write_str(escaped)?; |
133 | | } |
134 | | } |
135 | 0 | mark = i + 1; // all escaped characters are ASCII |
136 | 0 | } |
137 | | } |
138 | 0 | w.write_str(&s[mark..]) |
139 | 0 | } Unexecuted instantiation: pulldown_cmark_escape::escape_href::<&mut &mut alloc::string::String> Unexecuted instantiation: pulldown_cmark_escape::escape_href::<_> |
140 | | |
/// Builds a 256-entry table mapping a byte value to the index of its
/// replacement in `HTML_ESCAPES`; `0` means the byte is written unchanged.
///
/// `&`, `<` and `>` are always marked. The quote characters `"` and `'` are
/// marked only when `body` is `false`.
const fn create_html_escape_table(body: bool) -> [u8; 256] {
    let mut table = [0; 256];
    table[b'&' as usize] = 1;
    table[b'<' as usize] = 2;
    table[b'>' as usize] = 3;
    if !body {
        table[b'"' as usize] = 4;
        table[b'\'' as usize] = 5;
    }
    table
}
152 | | |
153 | | static HTML_ESCAPE_TABLE: [u8; 256] = create_html_escape_table(false); |
154 | | static HTML_BODY_TEXT_ESCAPE_TABLE: [u8; 256] = create_html_escape_table(true); |
155 | | |
/// Replacement text for each non-zero entry value of the escape tables.
/// Index 0 is an unused placeholder because `0` means "no escape".
static HTML_ESCAPES: [&str; 6] = ["", "&amp;", "&lt;", "&gt;", "&quot;", "&#39;"];
157 | | |
158 | | /// Writes the given string to the Write sink, replacing special HTML bytes |
159 | | /// (<, >, &, ", ') by escape sequences. |
160 | | /// |
161 | | /// Use this function to write output to quoted HTML attributes. |
162 | | /// Since this function doesn't escape spaces, unquoted attributes |
163 | | /// cannot be used. For example: |
164 | | /// |
165 | | /// ```rust |
166 | | /// let mut value = String::new(); |
167 | | /// pulldown_cmark_escape::escape_html(&mut value, "two words") |
168 | | /// .expect("writing to a string is infallible"); |
169 | | /// // This is okay. |
170 | | /// let ok = format!("<a title='{value}'>test</a>"); |
171 | | /// // This is not okay. |
172 | | /// //let not_ok = format!("<a title={value}>test</a>"); |
173 | | /// ```` |
174 | 0 | pub fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> { |
175 | 0 | #[cfg(all(target_arch = "x86_64", feature = "simd"))] |
176 | 0 | { |
177 | 0 | simd::escape_html(w, s, &HTML_ESCAPE_TABLE) |
178 | 0 | } |
179 | 0 | #[cfg(not(all(target_arch = "x86_64", feature = "simd")))] |
180 | 0 | { |
181 | 0 | escape_html_scalar(w, s, &HTML_ESCAPE_TABLE) |
182 | 0 | } |
183 | 0 | } Unexecuted instantiation: pulldown_cmark_escape::escape_html::<&mut &mut alloc::string::String> Unexecuted instantiation: pulldown_cmark_escape::escape_html::<_> |
184 | | |
185 | | /// For use in HTML body text, writes the given string to the Write sink, |
186 | | /// replacing special HTML bytes (<, >, &) by escape sequences. |
187 | | /// |
188 | | /// <div class="warning"> |
189 | | /// |
190 | | /// This function should be used for escaping text nodes, not attributes. |
191 | | /// In the below example, the word "foo" is an attribute, and the word |
192 | | /// "bar" is an text node. The word "bar" could be escaped by this function, |
193 | | /// but the word "foo" must be escaped using [`escape_html`]. |
194 | | /// |
195 | | /// ```html |
196 | | /// <span class="foo">bar</span> |
197 | | /// ``` |
198 | | /// |
199 | | /// If you aren't sure what the difference is, use [`escape_html`]. |
200 | | /// It should always be correct, but will produce larger output. |
201 | | /// |
202 | | /// </div> |
203 | 0 | pub fn escape_html_body_text<W: StrWrite>(w: W, s: &str) -> io::Result<()> { |
204 | 0 | #[cfg(all(target_arch = "x86_64", feature = "simd"))] |
205 | 0 | { |
206 | 0 | simd::escape_html(w, s, &HTML_BODY_TEXT_ESCAPE_TABLE) |
207 | 0 | } |
208 | 0 | #[cfg(not(all(target_arch = "x86_64", feature = "simd")))] |
209 | 0 | { |
210 | 0 | escape_html_scalar(w, s, &HTML_BODY_TEXT_ESCAPE_TABLE) |
211 | 0 | } |
212 | 0 | } Unexecuted instantiation: pulldown_cmark_escape::escape_html_body_text::<&mut &mut alloc::string::String> Unexecuted instantiation: pulldown_cmark_escape::escape_html_body_text::<_> |
213 | | |
214 | 0 | fn escape_html_scalar<W: StrWrite>(mut w: W, s: &str, table: &'static [u8; 256]) -> io::Result<()> { |
215 | 0 | let bytes = s.as_bytes(); |
216 | 0 | let mut mark = 0; |
217 | 0 | let mut i = 0; |
218 | 0 | while i < s.len() { |
219 | 0 | match bytes[i..].iter().position(|&c| table[c as usize] != 0) { Unexecuted instantiation: pulldown_cmark_escape::escape_html_scalar::<&mut &mut alloc::string::String>::{closure#0} Unexecuted instantiation: pulldown_cmark_escape::escape_html_scalar::<_>::{closure#0} |
220 | 0 | Some(pos) => { |
221 | 0 | i += pos; |
222 | 0 | } |
223 | 0 | None => break, |
224 | | } |
225 | 0 | let c = bytes[i]; |
226 | 0 | let escape = table[c as usize]; |
227 | 0 | let escape_seq = HTML_ESCAPES[escape as usize]; |
228 | 0 | w.write_str(&s[mark..i])?; |
229 | 0 | w.write_str(escape_seq)?; |
230 | 0 | i += 1; |
231 | 0 | mark = i; // all escaped characters are ASCII |
232 | | } |
233 | 0 | w.write_str(&s[mark..]) |
234 | 0 | } Unexecuted instantiation: pulldown_cmark_escape::escape_html_scalar::<&mut &mut alloc::string::String> Unexecuted instantiation: pulldown_cmark_escape::escape_html_scalar::<_> |
235 | | |
/// SSSE3-accelerated HTML escaping, compiled only on x86_64 with the `simd`
/// feature enabled.
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
mod simd {
    use super::StrWrite;
    use std::arch::x86_64::*;
    use std::io;
    use std::mem::size_of;

    /// Number of bytes processed per SIMD step (16 for `__m128i`).
    const VECTOR_SIZE: usize = size_of::<__m128i>();

    /// Copies `s` to `w`, replacing every byte whose `table` entry is non-zero
    /// by the corresponding `HTML_ESCAPES` string.
    ///
    /// Falls back to `escape_html_scalar` when SSSE3 is not available at
    /// runtime or when `s` is shorter than one vector.
    pub(super) fn escape_html<W: StrWrite>(
        mut w: W,
        s: &str,
        table: &'static [u8; 256],
    ) -> io::Result<()> {
        // The SIMD accelerated code uses the PSHUFB instruction, which is part
        // of the SSSE3 instruction set. Further, we can only use this code if
        // the buffer is at least one VECTOR_SIZE in length to prevent reading
        // out of bounds. If either of these conditions is not met, we fall back
        // to scalar code.
        if is_x86_feature_detected!("ssse3") && s.len() >= VECTOR_SIZE {
            let bytes = s.as_bytes();
            let mut mark = 0;

            // SAFETY: SSSE3 support was just detected and `bytes` holds at
            // least VECTOR_SIZE bytes, which is what `foreach_special_simd`
            // requires. The callback is only invoked with indices of ASCII
            // special bytes inside `bytes`, so `i`, `i + 1` and `mark` are in
            // bounds and on char boundaries for the unchecked accesses.
            unsafe {
                foreach_special_simd(bytes, 0, |i| {
                    let escape_ix = *bytes.get_unchecked(i) as usize;
                    let entry = table[escape_ix] as usize;
                    w.write_str(s.get_unchecked(mark..i))?;
                    mark = i + 1; // all escaped characters are ASCII
                    if entry == 0 {
                        // The scan reports every special byte, but this table
                        // does not escape this one: write it through as-is.
                        w.write_str(s.get_unchecked(i..mark))
                    } else {
                        let replacement = super::HTML_ESCAPES[entry];
                        w.write_str(replacement)
                    }
                })?;
                w.write_str(s.get_unchecked(mark..))
            }
        } else {
            super::escape_html_scalar(w, s, table)
        }
    }

    /// Creates the lookup table for use in `compute_mask`.
    ///
    /// Each special byte is stored at the index given by its lower nibble.
    /// Slot 0 holds 127 so that bytes with a zero lower nibble (notably NUL)
    /// can never compare equal to their looked-up value.
    const fn create_lookup() -> [u8; 16] {
        let mut table = [0; 16];
        table[(b'<' & 0x0f) as usize] = b'<';
        table[(b'>' & 0x0f) as usize] = b'>';
        table[(b'&' & 0x0f) as usize] = b'&';
        table[(b'"' & 0x0f) as usize] = b'"';
        table[(b'\'' & 0x0f) as usize] = b'\'';
        table[0] = 0b0111_1111;
        table
    }

    /// Computes a byte mask at given offset in the byte buffer. Its first 16 (least significant)
    /// bits correspond to whether there is an HTML special byte (&, <, >, ", ') at the 16 bytes
    /// `bytes[offset..]`. For example, the mask `(1 << 3)` states that there is an HTML byte
    /// at `offset + 3`.
    ///
    /// # Safety
    ///
    /// It is only safe to call this function when
    /// `bytes.len() >= offset + VECTOR_SIZE` and the CPU supports SSSE3.
    #[target_feature(enable = "ssse3")]
    unsafe fn compute_mask(bytes: &[u8], offset: usize) -> i32 {
        debug_assert!(bytes.len() >= offset + VECTOR_SIZE);

        let table = create_lookup();
        let lookup = _mm_loadu_si128(table.as_ptr() as *const __m128i);
        let raw_ptr = bytes.as_ptr().add(offset) as *const __m128i;

        // Load the vector from memory.
        let vector = _mm_loadu_si128(raw_ptr);
        // We take the least significant 4 bits of every byte and use them as indices
        // to map into the lookup vector.
        // Note that shuffle maps bytes with their most significant bit set to 0.
        // Bytes that share their lower nibble with an HTML special byte get mapped to that
        // corresponding special byte. Note that all HTML special bytes have distinct lower
        // nibbles. Other bytes either get mapped to 0 or 127.
        let expected = _mm_shuffle_epi8(lookup, vector);
        // We compare the original vector to the mapped output. Bytes that shared a lower
        // nibble with an HTML special byte match *only* if they are that special byte. Bytes
        // with a 0 lower nibble (and a clear most significant bit) were mapped to 127 and
        // will hence never match. Bytes with their most significant bit set were mapped to
        // 0 but are themselves non-zero. All other bytes have non-zero lower nibbles but
        // were mapped to 0 and will therefore also not match.
        let matches = _mm_cmpeq_epi8(expected, vector);

        // Translate matches to a bitmask, where every 1 corresponds to a HTML special character
        // and a 0 is a non-HTML byte.
        _mm_movemask_epi8(matches)
    }

    /// Calls the given function with the index of every byte in the given byteslice
    /// that is either ", ', &, <, or > and for no other byte.
    ///
    /// Stops and returns the error as soon as the callback fails.
    ///
    /// # Safety
    ///
    /// Make sure to only call this when `bytes.len() >= 16` and the CPU
    /// supports SSSE3, undefined behaviour may occur otherwise.
    #[target_feature(enable = "ssse3")]
    unsafe fn foreach_special_simd<F>(
        bytes: &[u8],
        mut offset: usize,
        mut callback: F,
    ) -> io::Result<()>
    where
        F: FnMut(usize) -> io::Result<()>,
    {
        // The strategy here is to walk the byte buffer in chunks of VECTOR_SIZE (16)
        // bytes at a time starting at the given offset. For each chunk, we compute
        // a bitmask indicating whether the corresponding byte is a HTML special byte.
        // We then iterate over all the 1 bits in this mask and call the callback function
        // with the corresponding index in the buffer.
        // When the number of HTML special bytes in the buffer is relatively low, this
        // allows us to quickly go through the buffer without a lookup for every
        // single byte.

        debug_assert!(bytes.len() >= VECTOR_SIZE);
        let upperbound = bytes.len() - VECTOR_SIZE;
        while offset < upperbound {
            let mut mask = compute_mask(bytes, offset);
            while mask != 0 {
                let ix = mask.trailing_zeros();
                callback(offset + ix as usize)?;
                // Clear the lowest set bit.
                mask ^= mask & -mask;
            }
            offset += VECTOR_SIZE;
        }

        // Final iteration. We align the read with the end of the slice and
        // shift off the bytes at start we have already scanned.
        let mut mask = compute_mask(bytes, upperbound);
        mask >>= offset - upperbound;
        while mask != 0 {
            let ix = mask.trailing_zeros();
            callback(offset + ix as usize)?;
            // Clear the lowest set bit.
            mask ^= mask & -mask;
        }
        Ok(())
    }

    #[cfg(test)]
    mod html_scan_tests {
        #[test]
        fn multichunk() {
            let mut vec = Vec::new();
            unsafe {
                super::foreach_special_simd("&aXaaaa.a'aa9a<>aab&".as_bytes(), 0, |ix| {
                    #[allow(clippy::unit_arg)]
                    Ok(vec.push(ix))
                })
                .unwrap();
            }
            assert_eq!(vec, vec![0, 9, 14, 15, 19]);
        }

        // only match these bytes, and when we match them, match them VECTOR_SIZE times
        #[test]
        fn only_right_bytes_matched() {
            // Inclusive range so that byte 255 is checked as well.
            for b in 0..=255u8 {
                let right_byte = b == b'&' || b == b'<' || b == b'>' || b == b'"' || b == b'\'';
                let vek = vec![b; super::VECTOR_SIZE];
                let mut match_count = 0;
                unsafe {
                    super::foreach_special_simd(&vek, 0, |_| {
                        match_count += 1;
                        Ok(())
                    })
                    .unwrap();
                }
                assert!((match_count > 0) == (match_count == super::VECTOR_SIZE));
                assert_eq!(
                    (match_count == super::VECTOR_SIZE),
                    right_byte,
                    "match_count: {}, byte: {:?}",
                    match_count,
                    b as char
                );
            }
        }
    }
}
412 | | |
#[cfg(test)]
mod test {
    pub use super::{escape_href, escape_html, escape_html_body_text};

    // `&` becomes an entity in hrefs; `^` and `_` pass through unchanged.
    #[test]
    fn check_href_escape() {
        let mut s = String::new();
        escape_href(&mut s, "&^_").unwrap();
        assert_eq!(s.as_str(), "&amp;^_");
    }

    // Attribute escaping also replaces both quote characters.
    #[test]
    fn check_attr_escape() {
        let mut s = String::new();
        escape_html(&mut s, r##"&^"'_"##).unwrap();
        assert_eq!(s.as_str(), "&amp;^&quot;&#39;_");
    }

    // Body-text escaping leaves quote characters untouched.
    #[test]
    fn check_body_escape() {
        let mut s = String::new();
        escape_html_body_text(&mut s, r##"&^"'_"##).unwrap();
        assert_eq!(s.as_str(), r##"&amp;^"'_"##);
    }
}