/src/html5ever/markup5ever/util/buffer_queue.rs
Line | Count | Source |
1 | | // Copyright 2014-2017 The html5ever Project Developers. See the |
2 | | // COPYRIGHT file at the top-level directory of this distribution. |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | | // option. This file may not be copied, modified, or distributed |
8 | | // except according to those terms. |
9 | | |
10 | | //! The `BufferQueue` struct and helper types. |
11 | | //! |
12 | | //! This type is designed for the efficient parsing of string data, especially where many |
13 | | //! significant characters are from the ascii range 0-63. This includes, for example, important |
14 | | //! characters in xml/html parsing. |
15 | | //! |
16 | | //! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero |
17 | | //! copy). |
18 | | //! |
19 | | //! [`BufferQueue`]: struct.BufferQueue.html |
20 | | |
21 | | use std::{ |
22 | | cell::{RefCell, RefMut}, |
23 | | collections::VecDeque, |
24 | | mem, |
25 | | }; |
26 | | |
27 | | use tendril::StrTendril; |
28 | | |
29 | | pub use self::SetResult::{FromSet, NotFromSet}; |
30 | | use crate::util::smallcharset::SmallCharSet; |
31 | | |
32 | | /// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a |
33 | | /// string buffer of characters not from the set. |
34 | | /// |
35 | | /// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from |
36 | | /// [`SmallCharSet`]: ../struct.SmallCharSet.html |
37 | | #[derive(PartialEq, Eq, Debug)] |
38 | | pub enum SetResult { |
39 | | /// A character from the `SmallCharSet`. |
40 | | FromSet(char), |
41 | | /// A string buffer containing no characters from the `SmallCharSet`. |
42 | | NotFromSet(StrTendril), |
43 | | } |
44 | | |
45 | | /// A queue of owned string buffers, which supports incrementally consuming characters. |
46 | | /// |
47 | | /// Internally it uses [`VecDeque`] and has the same complexity properties. |
48 | | /// |
49 | | /// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html |
50 | | #[derive(Clone, Debug)] |
51 | | pub struct BufferQueue { |
52 | | /// Buffers to process. |
53 | | buffers: RefCell<VecDeque<StrTendril>>, |
54 | | } |
55 | | |
56 | | impl Default for BufferQueue { |
57 | | /// Create an empty BufferQueue. |
58 | | #[inline] |
59 | 24.7k | fn default() -> Self { |
60 | 24.7k | Self { |
61 | 24.7k | buffers: RefCell::new(VecDeque::with_capacity(16)), |
62 | 24.7k | } |
63 | 24.7k | } |
64 | | } |
65 | | |
66 | | impl BufferQueue { |
67 | | /// Returns whether the queue is empty. |
68 | | #[inline] |
69 | 36.9M | pub fn is_empty(&self) -> bool { |
70 | 36.9M | self.buffers.borrow().is_empty() |
71 | 36.9M | } |
72 | | |
73 | | /// Get the buffer at the beginning of the queue. |
74 | | #[inline] |
75 | | pub fn pop_front(&self) -> Option<StrTendril> { |
76 | | self.buffers.borrow_mut().pop_front() |
77 | | } |
78 | | |
79 | | /// Add a buffer to the beginning of the queue. |
80 | | /// |
81 | | /// If the buffer is empty, it will be skipped. |
82 | 11.0M | pub fn push_front(&self, buf: StrTendril) { |
83 | 11.0M | if buf.len32() == 0 { |
84 | 10.8M | return; |
85 | 201k | } |
86 | 201k | self.buffers.borrow_mut().push_front(buf); |
87 | 11.0M | } |
88 | | |
89 | | /// Add a buffer to the end of the queue. |
90 | | /// |
91 | | /// If the buffer is empty, it will be skipped. |
92 | 36.9M | pub fn push_back(&self, buf: StrTendril) { |
93 | 36.9M | if buf.len32() == 0 { |
94 | 0 | return; |
95 | 36.9M | } |
96 | 36.9M | self.buffers.borrow_mut().push_back(buf); |
97 | 36.9M | } |
98 | | |
99 | | /// Look at the next available character without removing it, if the queue is not empty. |
100 | 37.8M | pub fn peek(&self) -> Option<char> { |
101 | 37.8M | debug_assert!( |
102 | 0 | !self.buffers.borrow().iter().any(|el| el.len32() == 0), |
103 | 0 | "invariant \"all buffers in the queue are non-empty\" failed" |
104 | | ); |
105 | 37.8M | self.buffers |
106 | 37.8M | .borrow() |
107 | 37.8M | .front() |
108 | 37.8M | .map(|b| b.chars().next().unwrap()) |
109 | 37.8M | } |
110 | | |
111 | | /// Pops and returns either a single character from the given set, or |
112 | | /// a buffer of characters none of which are in the set. |
113 | | /// |
114 | | /// # Examples |
115 | | /// |
116 | | /// ``` |
117 | | /// # #[macro_use] extern crate markup5ever; |
118 | | /// # #[macro_use] extern crate tendril; |
119 | | /// # fn main() { |
120 | | /// use markup5ever::buffer_queue::{BufferQueue, SetResult}; |
121 | | /// |
122 | | /// let mut queue = BufferQueue::default(); |
123 | | /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#)); |
124 | | /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/'); |
125 | | /// let tag = format_tendril!("some_tag"); |
126 | | /// let attr = format_tendril!("attr"); |
127 | | /// let attr_val = format_tendril!("text"); |
128 | | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<'))); |
129 | | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag))); |
130 | | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' '))); |
131 | | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr))); |
132 | | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('='))); |
133 | | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"'))); |
134 | | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val))); |
135 | | /// // ... |
136 | | /// # } |
137 | | /// ``` |
138 | 43.4M | pub fn pop_except_from(&self, set: SmallCharSet) -> Option<SetResult> { |
139 | 43.4M | let (result, now_empty) = match self.buffers.borrow_mut().front_mut() { |
140 | 15.6M | None => (None, false), |
141 | 27.7M | Some(buf) => { |
142 | 27.7M | let n = set.nonmember_prefix_len(buf); |
143 | 27.7M | if n > 0 { |
144 | | let out; |
145 | 19.8M | unsafe { |
146 | 19.8M | out = buf.unsafe_subtendril(0, n); |
147 | 19.8M | buf.unsafe_pop_front(n); |
148 | 19.8M | } |
149 | 19.8M | (Some(NotFromSet(out)), buf.is_empty()) |
150 | | } else { |
151 | 7.96M | let c = buf.pop_front_char().expect("empty buffer in queue"); |
152 | 7.96M | (Some(FromSet(c)), buf.is_empty()) |
153 | | } |
154 | | }, |
155 | | }; |
156 | | |
157 | | // Unborrow self for this part. |
158 | 43.4M | if now_empty { |
159 | 16.0M | self.buffers.borrow_mut().pop_front(); |
160 | 27.4M | } |
161 | | |
162 | 43.4M | result |
163 | 43.4M | } |
164 | | |
165 | | /// Consume bytes matching the pattern, using a custom comparison function `eq`. |
166 | | /// |
167 | | /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if |
168 | | /// it wasn't possible to know (more data is needed). |
169 | | /// |
170 | | /// The custom comparison function is used elsewhere to compare ascii-case-insensitively. |
171 | | /// |
172 | | /// # Examples |
173 | | /// |
174 | | /// ``` |
175 | | /// # extern crate markup5ever; |
176 | | /// # #[macro_use] extern crate tendril; |
177 | | /// # fn main() { |
178 | | /// use markup5ever::buffer_queue::BufferQueue; |
179 | | /// |
180 | | /// let mut queue = BufferQueue::default(); |
181 | | /// queue.push_back(format_tendril!("testtext")); |
182 | | /// let test_str = "test"; |
183 | | /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true)); |
184 | | /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true)); |
185 | | /// assert!(queue.is_empty()); |
186 | | /// # } |
187 | | /// ``` |
188 | 10.8M | pub fn eat<F: Fn(&u8, &u8) -> bool>(&self, pat: &str, eq: F) -> Option<bool> { |
189 | 10.8M | let mut buffers_exhausted = 0; |
190 | 10.8M | let mut consumed_from_last = 0; |
191 | | |
192 | 10.8M | self.buffers.borrow().front()?; |
193 | | |
194 | 13.0M | for pattern_byte in pat.bytes() { |
195 | 13.0M | if buffers_exhausted >= self.buffers.borrow().len() { |
196 | 6.29k | return None; |
197 | 13.0M | } |
198 | 13.0M | let buf = &self.buffers.borrow()[buffers_exhausted]; |
199 | | |
200 | 13.0M | if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) { |
201 | 10.4M | return Some(false); |
202 | 2.63M | } |
203 | | |
204 | 2.63M | consumed_from_last += 1; |
205 | 2.63M | if consumed_from_last >= buf.len() { |
206 | 150k | buffers_exhausted += 1; |
207 | 150k | consumed_from_last = 0; |
208 | 2.48M | } |
209 | | } |
210 | | |
211 | | // We have a match. Commit changes to the BufferQueue. |
212 | 403k | for _ in 0..buffers_exhausted { |
213 | 138k | self.buffers.borrow_mut().pop_front(); |
214 | 138k | } |
215 | | |
216 | 403k | match self.buffers.borrow_mut().front_mut() { |
217 | 129k | None => assert_eq!(consumed_from_last, 0), |
218 | 273k | Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), |
219 | | } |
220 | | |
221 | 403k | Some(true) |
222 | 10.8M | } |
223 | | |
224 | | /// Get the next character if one is available, removing it from the queue. |
225 | | /// |
226 | | /// This function manages the buffers, removing them as they become empty. |
227 | 91.9M | pub fn next(&self) -> Option<char> { |
228 | 91.9M | let (result, now_empty) = match self.buffers.borrow_mut().front_mut() { |
229 | 20.2M | None => (None, false), |
230 | 71.6M | Some(buf) => { |
231 | 71.6M | let c = buf.pop_front_char().expect("empty buffer in queue"); |
232 | 71.6M | (Some(c), buf.is_empty()) |
233 | | }, |
234 | | }; |
235 | | |
236 | 91.9M | if now_empty { |
237 | 20.7M | self.buffers.borrow_mut().pop_front(); |
238 | 71.2M | } |
239 | | |
240 | 91.9M | result |
241 | 91.9M | } |
242 | | |
243 | 0 | pub fn replace_with(&self, other: BufferQueue) { |
244 | 0 | let _ = mem::replace(&mut *self.buffers.borrow_mut(), other.buffers.take()); |
245 | 0 | } |
246 | | |
247 | 0 | pub fn swap_with(&self, other: &BufferQueue) { |
248 | 0 | mem::swap( |
249 | 0 | &mut *self.buffers.borrow_mut(), |
250 | 0 | &mut *other.buffers.borrow_mut(), |
251 | | ); |
252 | 0 | } |
253 | | |
254 | | /// Return a mutable reference to the first tendril in the queue. |
255 | 0 | pub fn peek_front_chunk_mut(&self) -> Option<RefMut<'_, StrTendril>> { |
256 | 0 | let buffers = self.buffers.borrow_mut(); |
257 | 0 | if buffers.is_empty() { |
258 | 0 | return None; |
259 | 0 | } |
260 | | |
261 | 0 | let front_buffer = RefMut::map(buffers, |buffers| { |
262 | 0 | buffers.front_mut().expect("there is at least one buffer") |
263 | 0 | }); |
264 | 0 | Some(front_buffer) |
265 | 0 | } |
266 | | } |
267 | | |
268 | | #[cfg(test)] |
269 | | #[allow(non_snake_case)] |
270 | | mod test { |
271 | | use tendril::SliceExt; |
272 | | |
273 | | use super::BufferQueue; |
274 | | use super::SetResult::{FromSet, NotFromSet}; |
275 | | |
276 | | #[test] |
277 | | fn smoke_test() { |
278 | | let bq = BufferQueue::default(); |
279 | | assert_eq!(bq.peek(), None); |
280 | | assert_eq!(bq.next(), None); |
281 | | |
282 | | bq.push_back("abc".to_tendril()); |
283 | | assert_eq!(bq.peek(), Some('a')); |
284 | | assert_eq!(bq.next(), Some('a')); |
285 | | assert_eq!(bq.peek(), Some('b')); |
286 | | assert_eq!(bq.peek(), Some('b')); |
287 | | assert_eq!(bq.next(), Some('b')); |
288 | | assert_eq!(bq.peek(), Some('c')); |
289 | | assert_eq!(bq.next(), Some('c')); |
290 | | assert_eq!(bq.peek(), None); |
291 | | assert_eq!(bq.next(), None); |
292 | | } |
293 | | |
294 | | #[test] |
295 | | fn can_unconsume() { |
296 | | let bq = BufferQueue::default(); |
297 | | bq.push_back("abc".to_tendril()); |
298 | | assert_eq!(bq.next(), Some('a')); |
299 | | |
300 | | bq.push_front("xy".to_tendril()); |
301 | | assert_eq!(bq.next(), Some('x')); |
302 | | assert_eq!(bq.next(), Some('y')); |
303 | | assert_eq!(bq.next(), Some('b')); |
304 | | assert_eq!(bq.next(), Some('c')); |
305 | | assert_eq!(bq.next(), None); |
306 | | } |
307 | | |
308 | | #[test] |
309 | | fn can_pop_except_set() { |
310 | | let bq = BufferQueue::default(); |
311 | | bq.push_back("abc&def".to_tendril()); |
312 | | let pop = || bq.pop_except_from(small_char_set!('&')); |
313 | | assert_eq!(pop(), Some(NotFromSet("abc".to_tendril()))); |
314 | | assert_eq!(pop(), Some(FromSet('&'))); |
315 | | assert_eq!(pop(), Some(NotFromSet("def".to_tendril()))); |
316 | | assert_eq!(pop(), None); |
317 | | } |
318 | | |
319 | | #[test] |
320 | | fn can_eat() { |
321 | | // This is not very comprehensive. We rely on the tokenizer |
322 | | // integration tests for more thorough testing with many |
323 | | // different input buffer splits. |
324 | | let bq = BufferQueue::default(); |
325 | | bq.push_back("a".to_tendril()); |
326 | | bq.push_back("bc".to_tendril()); |
327 | | assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None); |
328 | | assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false)); |
329 | | assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true)); |
330 | | assert_eq!(bq.next(), Some('c')); |
331 | | assert_eq!(bq.next(), None); |
332 | | } |
333 | | } |