/rust/registry/src/index.crates.io-1949cf8c6b5b557f/tendril-0.4.3/src/stream.rs
Line | Count | Source |
1 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
2 | | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
3 | | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
4 | | // option. This file may not be copied, modified, or distributed |
5 | | // except according to those terms. |
6 | | |
7 | | //! Streams of tendrils. |
8 | | |
9 | | use fmt; |
10 | | use tendril::{Atomicity, NonAtomic, Tendril}; |
11 | | |
12 | | use std::borrow::Cow; |
13 | | use std::fs::File; |
14 | | use std::io; |
15 | | use std::marker::PhantomData; |
16 | | use std::path::Path; |
17 | | |
18 | | #[cfg(feature = "encoding")] |
19 | | use encoding; |
20 | | #[cfg(feature = "encoding_rs")] |
21 | | use encoding_rs::{self, DecoderResult}; |
22 | | use utf8; |
23 | | |
24 | | /// Trait for types that can process a tendril. |
25 | | /// |
26 | | /// This is a "push" interface, unlike the "pull" interface of |
27 | | /// `Iterator<Item=Tendril<F>>`. The push interface matches |
28 | | /// [html5ever][] and other incremental parsers with a similar |
29 | | /// architecture. |
30 | | /// |
31 | | /// [html5ever]: https://github.com/servo/html5ever |
32 | | pub trait TendrilSink<F, A = NonAtomic> |
33 | | where |
34 | | F: fmt::Format, |
35 | | A: Atomicity, |
36 | | { |
37 | | /// Process this tendril. |
38 | | fn process(&mut self, t: Tendril<F, A>); |
39 | | |
40 | | /// Indicates that an error has occurred. |
41 | | fn error(&mut self, desc: Cow<'static, str>); |
42 | | |
43 | | /// What the overall result of processing is. |
44 | | type Output; |
45 | | |
46 | | /// Indicates the end of the stream. |
47 | | fn finish(self) -> Self::Output; |
48 | | |
49 | | /// Process one tendril and finish. |
50 | 0 | fn one<T>(mut self, t: T) -> Self::Output |
51 | 0 | where |
52 | 0 | Self: Sized, |
53 | 0 | T: Into<Tendril<F, A>>, |
54 | | { |
55 | 0 | self.process(t.into()); |
56 | 0 | self.finish() |
57 | 0 | } Unexecuted instantiation: <html5ever::driver::Parser<ammonia::rcdom::RcDom> as tendril::stream::TendrilSink<tendril::fmt::UTF8>>::one::<&str> Unexecuted instantiation: <_ as tendril::stream::TendrilSink<_, _>>::one::<_> |
58 | | |
59 | | /// Consume an iterator of tendrils, processing each item, then finish. |
60 | 0 | fn from_iter<I>(mut self, i: I) -> Self::Output |
61 | 0 | where |
62 | 0 | Self: Sized, |
63 | 0 | I: IntoIterator, |
64 | 0 | I::Item: Into<Tendril<F, A>>, |
65 | | { |
66 | 0 | for t in i { |
67 | 0 | self.process(t.into()) |
68 | | } |
69 | 0 | self.finish() |
70 | 0 | } |
71 | | |
72 | | /// Read from the given stream of bytes until exhaustion and process incrementally, |
73 | | /// then finish. Return `Err` at the first I/O error. |
74 | 0 | fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output> |
75 | 0 | where |
76 | 0 | Self: Sized, |
77 | 0 | R: io::Read, |
78 | 0 | F: fmt::SliceFormat<Slice = [u8]>, |
79 | | { |
80 | | const BUFFER_SIZE: u32 = 4 * 1024; |
81 | | loop { |
82 | 0 | let mut tendril = Tendril::<F, A>::new(); |
83 | | // FIXME: this exposes uninitialized bytes to a generic R type |
84 | | // this is fine for R=File which never reads these bytes, |
85 | | // but user-defined types might. |
86 | | // The standard library pushes zeros to `Vec<u8>` for that reason. |
87 | 0 | unsafe { |
88 | 0 | tendril.push_uninitialized(BUFFER_SIZE); |
89 | 0 | } |
90 | | loop { |
91 | 0 | match r.read(&mut tendril) { |
92 | 0 | Ok(0) => return Ok(self.finish()), |
93 | 0 | Ok(n) => { |
94 | 0 | tendril.pop_back(BUFFER_SIZE - n as u32); |
95 | 0 | self.process(tendril); |
96 | 0 | break; |
97 | | } |
98 | 0 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} |
99 | 0 | Err(e) => return Err(e), |
100 | | } |
101 | | } |
102 | | } |
103 | 0 | } |
104 | | |
105 | | /// Read from the file at the given path and process incrementally, |
106 | | /// then finish. Return `Err` at the first I/O error. |
107 | 0 | fn from_file<P>(self, path: P) -> io::Result<Self::Output> |
108 | 0 | where |
109 | 0 | Self: Sized, |
110 | 0 | P: AsRef<Path>, |
111 | 0 | F: fmt::SliceFormat<Slice = [u8]>, |
112 | | { |
113 | 0 | self.read_from(&mut File::open(path)?) |
114 | 0 | } |
115 | | } |
116 | | |
/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8,
/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters,
/// and emits Unicode (`StrTendril`).
///
/// This does not allocate memory: the output is either subtendrils on the input,
/// or inline tendrils for a single code point.
pub struct Utf8LossyDecoder<Sink, A = NonAtomic>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// The downstream sink that receives the decoded text and error reports.
    pub inner_sink: Sink,
    /// Bytes of a code point that was cut off at the end of the previous
    /// chunk, kept until the next chunk (or the end of the stream) arrives.
    incomplete: Option<utf8::Incomplete>,
    /// Ties the atomicity parameter `A` to this type without storing a value.
    marker: PhantomData<A>,
}
132 | | |
133 | | impl<Sink, A> Utf8LossyDecoder<Sink, A> |
134 | | where |
135 | | Sink: TendrilSink<fmt::UTF8, A>, |
136 | | A: Atomicity, |
137 | | { |
138 | | /// Create a new incremental UTF-8 decoder. |
139 | | #[inline] |
140 | 0 | pub fn new(inner_sink: Sink) -> Self { |
141 | 0 | Utf8LossyDecoder { |
142 | 0 | inner_sink: inner_sink, |
143 | 0 | incomplete: None, |
144 | 0 | marker: PhantomData, |
145 | 0 | } |
146 | 0 | } |
147 | | } |
148 | | |
149 | | impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A> |
150 | | where |
151 | | Sink: TendrilSink<fmt::UTF8, A>, |
152 | | A: Atomicity, |
153 | | { |
154 | | #[inline] |
155 | 0 | fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) { |
156 | | // FIXME: remove take() and map() when non-lexical borrows are stable. |
157 | 0 | if let Some(mut incomplete) = self.incomplete.take() { |
158 | 0 | let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { |
159 | 0 | match result { |
160 | 0 | Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), |
161 | 0 | Err(_) => { |
162 | 0 | self.inner_sink.error("invalid byte sequence".into()); |
163 | 0 | self.inner_sink |
164 | 0 | .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); |
165 | 0 | } |
166 | | } |
167 | 0 | t.len() - rest.len() |
168 | 0 | }); |
169 | 0 | match resume_at { |
170 | | None => { |
171 | 0 | self.incomplete = Some(incomplete); |
172 | 0 | return; |
173 | | } |
174 | 0 | Some(resume_at) => t.pop_front(resume_at as u32), |
175 | | } |
176 | 0 | } |
177 | 0 | while !t.is_empty() { |
178 | 0 | let unborrowed_result = match utf8::decode(&t) { |
179 | 0 | Ok(s) => { |
180 | 0 | debug_assert!(s.as_ptr() == t.as_ptr()); |
181 | 0 | debug_assert!(s.len() == t.len()); |
182 | 0 | Ok(()) |
183 | | } |
184 | | Err(utf8::DecodeError::Invalid { |
185 | 0 | valid_prefix, |
186 | 0 | invalid_sequence, |
187 | | .. |
188 | | }) => { |
189 | 0 | debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); |
190 | 0 | debug_assert!(valid_prefix.len() <= t.len()); |
191 | 0 | Err(( |
192 | 0 | valid_prefix.len(), |
193 | 0 | Err(valid_prefix.len() + invalid_sequence.len()), |
194 | 0 | )) |
195 | | } |
196 | | Err(utf8::DecodeError::Incomplete { |
197 | 0 | valid_prefix, |
198 | 0 | incomplete_suffix, |
199 | | }) => { |
200 | 0 | debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); |
201 | 0 | debug_assert!(valid_prefix.len() <= t.len()); |
202 | 0 | Err((valid_prefix.len(), Ok(incomplete_suffix))) |
203 | | } |
204 | | }; |
205 | 0 | match unborrowed_result { |
206 | | Ok(()) => { |
207 | 0 | unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } |
208 | 0 | return; |
209 | | } |
210 | 0 | Err((valid_len, and_then)) => { |
211 | 0 | if valid_len > 0 { |
212 | 0 | let subtendril = t.subtendril(0, valid_len as u32); |
213 | | unsafe { |
214 | 0 | self.inner_sink |
215 | 0 | .process(subtendril.reinterpret_without_validating()) |
216 | | } |
217 | 0 | } |
218 | 0 | match and_then { |
219 | 0 | Ok(incomplete) => { |
220 | 0 | self.incomplete = Some(incomplete); |
221 | 0 | return; |
222 | | } |
223 | 0 | Err(offset) => { |
224 | 0 | self.inner_sink.error("invalid byte sequence".into()); |
225 | 0 | self.inner_sink |
226 | 0 | .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); |
227 | 0 | t.pop_front(offset as u32); |
228 | 0 | } |
229 | | } |
230 | | } |
231 | | } |
232 | | } |
233 | 0 | } |
234 | | |
235 | | #[inline] |
236 | 0 | fn error(&mut self, desc: Cow<'static, str>) { |
237 | 0 | self.inner_sink.error(desc); |
238 | 0 | } |
239 | | |
240 | | type Output = Sink::Output; |
241 | | |
242 | | #[inline] |
243 | 0 | fn finish(mut self) -> Sink::Output { |
244 | 0 | if self.incomplete.is_some() { |
245 | 0 | self.inner_sink |
246 | 0 | .error("incomplete byte sequence at end of stream".into()); |
247 | 0 | self.inner_sink |
248 | 0 | .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); |
249 | 0 | } |
250 | 0 | self.inner_sink.finish() |
251 | 0 | } |
252 | | } |
253 | | |
/// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding,
/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters,
/// and emits Unicode (`StrTendril`).
///
/// This allocates new tendrils for encodings other than UTF-8.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
pub struct LossyDecoder<Sink, A = NonAtomic>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// The concrete decoding backend together with the downstream sink.
    inner: LossyDecoderInner<Sink, A>,
}
267 | | |
/// The decoding backend behind a `LossyDecoder`, paired with the sink that
/// receives its output.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
enum LossyDecoderInner<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// UTF-8 input, handled by `Utf8LossyDecoder` (which owns the sink).
    Utf8(Utf8LossyDecoder<Sink, A>),
    /// A decoder from the `encoding` crate plus the sink it feeds.
    #[cfg(feature = "encoding")]
    Encoding(Box<encoding::RawDecoder>, Sink),
    /// A decoder from the `encoding_rs` crate plus the sink it feeds.
    #[cfg(feature = "encoding_rs")]
    EncodingRs(encoding_rs::Decoder, Sink),
}
280 | | |
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Create a new incremental decoder using the encoding crate.
    ///
    /// An encoding whose name is `"utf-8"` is routed to the dedicated UTF-8
    /// decoder instead of the encoding crate's raw decoder.
    #[cfg(feature = "encoding")]
    #[inline]
    pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self {
        if encoding.name() == "utf-8" {
            return Self::utf8(sink);
        }
        let raw = encoding.raw_decoder();
        LossyDecoder {
            inner: LossyDecoderInner::Encoding(raw, sink),
        }
    }

    /// Create a new incremental decoder using the encoding_rs crate.
    ///
    /// `encoding_rs::UTF_8` is routed to the dedicated UTF-8 decoder.
    #[cfg(feature = "encoding_rs")]
    #[inline]
    pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self {
        if encoding == encoding_rs::UTF_8 {
            Self::utf8(sink)
        } else {
            let decoder = encoding.new_decoder();
            Self {
                inner: LossyDecoderInner::EncodingRs(decoder, sink),
            }
        }
    }

    /// Create a new incremental decoder for the UTF-8 encoding.
    ///
    /// This is useful for content that is known at run-time to be UTF-8
    /// (whereas `Utf8LossyDecoder` requires knowing at compile-time.)
    #[inline]
    pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> {
        let decoder = Utf8LossyDecoder::new(sink);
        LossyDecoder {
            inner: LossyDecoderInner::Utf8(decoder),
        }
    }

    /// Give a reference to the inner sink.
    pub fn inner_sink(&self) -> &Sink {
        match &self.inner {
            LossyDecoderInner::Utf8(decoder) => &decoder.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, sink) => sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, sink) => sink,
        }
    }

    /// Give a mutable reference to the inner sink.
    pub fn inner_sink_mut(&mut self) -> &mut Sink {
        match &mut self.inner {
            LossyDecoderInner::Utf8(decoder) => &mut decoder.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, sink) => sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, sink) => sink,
        }
    }
}
345 | | |
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    type Output = Sink::Output;

    /// Decode one chunk of bytes with the selected backend and forward the
    /// resulting text to the inner sink.
    #[inline]
    fn process(&mut self, t: Tendril<fmt::Bytes, A>) {
        match &mut self.inner {
            LossyDecoderInner::Utf8(utf8) => utf8.process(t),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(decoder, sink) => {
                let mut remaining = t;
                let mut decoded: Tendril<fmt::UTF8, A> = Tendril::new();
                // Each reported error yields one U+FFFD; decoding then
                // resumes on the remainder of the chunk after `err.upto`.
                while let (_, Some(err)) = decoder.raw_feed(&*remaining, &mut decoded) {
                    decoded.push_char('\u{fffd}');
                    sink.error(err.cause);
                    debug_assert!(err.upto >= 0);
                    remaining.pop_front(err.upto as u32);
                }
                // Nothing is forwarded when the chunk produced no text.
                if !decoded.is_empty() {
                    sink.process(decoded);
                }
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(decoder, sink) => {
                // Empty chunks are skipped without touching the decoder.
                if !t.is_empty() {
                    decode_to_sink(t, decoder, sink, false);
                }
            }
        }
    }

    /// Pass an error description through to the inner sink.
    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        match &mut self.inner {
            LossyDecoderInner::Utf8(utf8) => utf8.error(desc),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, sink) => sink.error(desc),
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, sink) => sink.error(desc),
        }
    }

    /// Flush the decoder, forward whatever it still produces, and finish the
    /// inner sink.
    #[inline]
    fn finish(self) -> Sink::Output {
        match self.inner {
            LossyDecoderInner::Utf8(utf8) => utf8.finish(),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(mut decoder, mut sink) => {
                let mut flushed: Tendril<fmt::UTF8, A> = Tendril::new();
                if let Some(err) = decoder.raw_finish(&mut flushed) {
                    flushed.push_char('\u{fffd}');
                    sink.error(err.cause);
                }
                if !flushed.is_empty() {
                    sink.process(flushed);
                }
                sink.finish()
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(mut decoder, mut sink) => {
                // An empty final chunk with `last = true` flushes the decoder.
                decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true);
                sink.finish()
            }
        }
    }
}
423 | | |
/// Decode `t` with an `encoding_rs` decoder and push the resulting text to `sink`.
///
/// The input is decoded in rounds, each writing into a fresh scratch tendril
/// of at most 8192 bytes. Text produced by a round is forwarded before that
/// round's outcome is handled. A malformed sequence is reported through
/// `sink.error` and replaced by a U+FFFD tendril.
///
/// `last` is passed through to the decoder; callers in this file set it to
/// `true` only for the final, empty chunk at the end of the stream.
#[cfg(feature = "encoding_rs")]
fn decode_to_sink<Sink, A>(
    mut t: Tendril<fmt::Bytes, A>,
    decoder: &mut encoding_rs::Decoder,
    sink: &mut Sink,
    last: bool,
) where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    // Upper bound on the scratch buffer allocated per decoding round.
    const MAX_CHUNK: usize = 8192;
    loop {
        let mut out = <Tendril<fmt::Bytes, A>>::new();
        let max_len = decoder
            .max_utf8_buffer_length_without_replacement(t.len())
            .unwrap_or(MAX_CHUNK);
        // Clamp in `usize` *before* narrowing to `u32`: narrowing first would
        // wrap for estimates of 2^32 bytes or more and could yield a tiny or
        // empty buffer, on which the decoder cannot make progress.
        let capacity = std::cmp::min(max_len, MAX_CHUNK) as u32;
        // NOTE(review): this relies on the decoder only writing to `out`
        // and never reading from it before writing - confirm against encoding_rs.
        unsafe {
            out.push_uninitialized(capacity);
        }
        let (result, bytes_read, bytes_written) =
            decoder.decode_to_utf8_without_replacement(&t, &mut out, last);
        if bytes_written > 0 {
            // Only the first `bytes_written` bytes were filled in by the
            // decoder; they are forwarded as UTF-8 without re-validation.
            sink.process(unsafe {
                out.subtendril(0, bytes_written as u32)
                    .reinterpret_without_validating()
            });
        }
        match result {
            DecoderResult::InputEmpty => return,
            // More output is pending: go around again with a fresh buffer.
            DecoderResult::OutputFull => {}
            DecoderResult::Malformed(_, _) => {
                sink.error(Cow::Borrowed("invalid sequence"));
                sink.process("\u{FFFD}".into());
            }
        }
        // Drop what the decoder consumed and stop once nothing is left.
        t.pop_front(bytes_read as u32);
        if t.is_empty() {
            return;
        }
    }
}
464 | | |
// Unit tests for the decoding sinks defined in this module.
#[cfg(test)]
mod test {
    use super::{TendrilSink, Utf8LossyDecoder};
    use fmt;
    use std::borrow::Cow;
    use tendril::{Atomicity, NonAtomic, Tendril};

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use super::LossyDecoder;
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use tendril::SliceExt;

    #[cfg(feature = "encoding")]
    use encoding::all as enc;
    #[cfg(feature = "encoding_rs")]
    use encoding_rs as enc_rs;

    // Test sink that records every tendril and every error message it receives.
    struct Accumulate<A>
    where
        A: Atomicity,
    {
        tendrils: Vec<Tendril<fmt::UTF8, A>>,
        errors: Vec<String>,
    }

    impl<A> Accumulate<A>
    where
        A: Atomicity,
    {
        // Start with nothing recorded.
        fn new() -> Accumulate<A> {
            Accumulate {
                tendrils: vec![],
                errors: vec![],
            }
        }
    }

    impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A>
    where
        A: Atomicity,
    {
        fn process(&mut self, t: Tendril<fmt::UTF8, A>) {
            self.tendrils.push(t);
        }

        fn error(&mut self, desc: Cow<'static, str>) {
            self.errors.push(desc.into_owned());
        }

        // The recorded tendrils and error messages, in arrival order.
        type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>);

        fn finish(self) -> Self::Output {
            (self.tendrils, self.errors)
        }
    }

    // Feed `input` chunk by chunk through a `Utf8LossyDecoder` and check both
    // the exact sequence of emitted tendrils and the number of errors.
    fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) {
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let (tendrils, errors) = decoder.from_iter(input.iter().cloned());
        assert_eq!(
            expected,
            &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>()
        );
        assert_eq!(errs, errors.len());
    }

    #[test]
    fn utf8() {
        check_utf8(&[], &[], 0);
        check_utf8(&[b""], &[], 0);
        check_utf8(&[b"xyz"], &["xyz"], 0);
        check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0);

        // A three-byte code point split across chunk boundaries.
        check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0);
        check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(
            &[b"xy\xEA", b"\x99", b"\xAEzw"],
            &["xy", "\u{a66e}z", "w"],
            0,
        );
        check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0);
        check_utf8(
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            &["\u{a66e}"],
            0,
        );

        // Ill-formed bytes interrupting a split code point.
        check_utf8(
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"],
            4,
        );
        check_utf8(
            &[b"xy\xEA\x99", b"\xFFz"],
            &["xy", "\u{fffd}", "\u{fffd}", "z"],
            2,
        );

        check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0);
        check_utf8(
            &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"],
            &["ő", "\u{fffd}", "\u{fffd}", "ő"],
            2,
        );

        // incomplete char at end of input
        check_utf8(&[b"\xC0"], &["\u{fffd}"], 1);
        check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1);
    }

    // Feed `input` chunk by chunk through `decoder`, concatenate everything
    // the sink received, and compare the text and the number of errors.
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    fn check_decode(
        mut decoder: LossyDecoder<Accumulate<NonAtomic>>,
        input: &[&[u8]],
        expected: &str,
        errs: usize,
    ) {
        for x in input {
            decoder.process(x.to_tendril());
        }
        let (tendrils, errors) = decoder.finish();
        let mut tendril: Tendril<fmt::UTF8> = Tendril::new();
        for t in tendrils {
            tendril.push_tendril(&t);
        }
        assert_eq!(expected, &*tendril);
        assert_eq!(errs, errors.len());
    }

    // A table of test cases: (input chunks, expected decoded text, expected error count).
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)];

    #[cfg(any(feature = "encoding"))]
    const ASCII: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"xy", b"", b"", b"z"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xFF"], "\u{fffd}", 1),
        (&[b"x\xC0yz"], "x\u{fffd}yz", 1),
        (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1),
        (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_ascii() {
        for &(input, expected, errs) in ASCII {
            let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const UTF_8: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xEA\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0),
        (
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            "\u{a66e}",
            0,
        ),
        (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0),
        (
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z",
            4,
        ),
        (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2),
        // incomplete char at end of input
        (&[b"\xC0"], "\u{fffd}", 1),
        (&[b"\xEA\x99"], "\u{fffd}", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_utf8() {
        for &(input, expected, errs) in UTF_8 {
            let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_utf8_encoding_rs() {
        for &(input, expected, errs) in UTF_8 {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const KOI8_U: Tests = &[
        (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0),
        (
            &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""],
            "Энергия",
            0,
        ),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_koi8_u() {
        for &(input, expected, errs) in KOI8_U {
            let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_koi8_u_encoding_rs() {
        for &(input, expected, errs) in KOI8_U {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const WINDOWS_949: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0),
        (
            &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"],
            "안녕하세요",
            0,
        ),
        (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1),
        (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1),
        (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_windows_949() {
        for &(input, expected, errs) in WINDOWS_949 {
            let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    // The WINDOWS_949 table is run against `enc_rs::EUC_KR` here.
    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_windows_949_encoding_rs() {
        for &(input, expected, errs) in WINDOWS_949 {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    // `read_from` over an in-memory byte slice containing one ill-formed byte.
    #[test]
    fn read_from() {
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let mut bytes: &[u8] = b"foo\xffbar";
        let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap();
        assert_eq!(
            &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>(),
            &["foo", "\u{FFFD}", "bar"]
        );
        assert_eq!(errors, &["invalid byte sequence"]);
    }
}