Coverage Report

Created: 2026-03-23 07:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/tendril-0.4.3/src/stream.rs
Line
Count
Source
1
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4
// option. This file may not be copied, modified, or distributed
5
// except according to those terms.
6
7
//! Streams of tendrils.
8
9
use fmt;
10
use tendril::{Atomicity, NonAtomic, Tendril};
11
12
use std::borrow::Cow;
13
use std::fs::File;
14
use std::io;
15
use std::marker::PhantomData;
16
use std::path::Path;
17
18
#[cfg(feature = "encoding")]
19
use encoding;
20
#[cfg(feature = "encoding_rs")]
21
use encoding_rs::{self, DecoderResult};
22
use utf8;
23
24
/// Trait for types that can process a tendril.
25
///
26
/// This is a "push" interface, unlike the "pull" interface of
27
/// `Iterator<Item=Tendril<F>>`. The push interface matches
28
/// [html5ever][] and other incremental parsers with a similar
29
/// architecture.
30
///
31
/// [html5ever]: https://github.com/servo/html5ever
32
pub trait TendrilSink<F, A = NonAtomic>
33
where
34
    F: fmt::Format,
35
    A: Atomicity,
36
{
37
    /// Process this tendril.
38
    fn process(&mut self, t: Tendril<F, A>);
39
40
    /// Indicates that an error has occurred.
41
    fn error(&mut self, desc: Cow<'static, str>);
42
43
    /// What the overall result of processing is.
44
    type Output;
45
46
    /// Indicates the end of the stream.
47
    fn finish(self) -> Self::Output;
48
49
    /// Process one tendril and finish.
50
0
    fn one<T>(mut self, t: T) -> Self::Output
51
0
    where
52
0
        Self: Sized,
53
0
        T: Into<Tendril<F, A>>,
54
    {
55
0
        self.process(t.into());
56
0
        self.finish()
57
0
    }
Unexecuted instantiation: <html5ever::driver::Parser<ammonia::rcdom::RcDom> as tendril::stream::TendrilSink<tendril::fmt::UTF8>>::one::<&str>
Unexecuted instantiation: <_ as tendril::stream::TendrilSink<_, _>>::one::<_>
58
59
    /// Consume an iterator of tendrils, processing each item, then finish.
60
0
    fn from_iter<I>(mut self, i: I) -> Self::Output
61
0
    where
62
0
        Self: Sized,
63
0
        I: IntoIterator,
64
0
        I::Item: Into<Tendril<F, A>>,
65
    {
66
0
        for t in i {
67
0
            self.process(t.into())
68
        }
69
0
        self.finish()
70
0
    }
71
72
    /// Read from the given stream of bytes until exhaustion and process incrementally,
73
    /// then finish. Return `Err` at the first I/O error.
74
0
    fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output>
75
0
    where
76
0
        Self: Sized,
77
0
        R: io::Read,
78
0
        F: fmt::SliceFormat<Slice = [u8]>,
79
    {
80
        const BUFFER_SIZE: u32 = 4 * 1024;
81
        loop {
82
0
            let mut tendril = Tendril::<F, A>::new();
83
            // FIXME: this exposes uninitialized bytes to a generic R type
84
            // this is fine for R=File which never reads these bytes,
85
            // but user-defined types might.
86
            // The standard library pushes zeros to `Vec<u8>` for that reason.
87
0
            unsafe {
88
0
                tendril.push_uninitialized(BUFFER_SIZE);
89
0
            }
90
            loop {
91
0
                match r.read(&mut tendril) {
92
0
                    Ok(0) => return Ok(self.finish()),
93
0
                    Ok(n) => {
94
0
                        tendril.pop_back(BUFFER_SIZE - n as u32);
95
0
                        self.process(tendril);
96
0
                        break;
97
                    }
98
0
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
99
0
                    Err(e) => return Err(e),
100
                }
101
            }
102
        }
103
0
    }
104
105
    /// Read from the file at the given path and process incrementally,
106
    /// then finish. Return `Err` at the first I/O error.
107
0
    fn from_file<P>(self, path: P) -> io::Result<Self::Output>
108
0
    where
109
0
        Self: Sized,
110
0
        P: AsRef<Path>,
111
0
        F: fmt::SliceFormat<Slice = [u8]>,
112
    {
113
0
        self.read_from(&mut File::open(path)?)
114
0
    }
115
}
116
117
/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8,
118
/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters,
119
/// and emits Unicode (`StrTendril`).
120
///
121
/// This does not allocate memory: the output is either subtendrils on the input,
122
/// or inline tendrils for a single code point.
123
pub struct Utf8LossyDecoder<Sink, A = NonAtomic>
124
where
125
    Sink: TendrilSink<fmt::UTF8, A>,
126
    A: Atomicity,
127
{
128
    pub inner_sink: Sink,
129
    incomplete: Option<utf8::Incomplete>,
130
    marker: PhantomData<A>,
131
}
132
133
impl<Sink, A> Utf8LossyDecoder<Sink, A>
134
where
135
    Sink: TendrilSink<fmt::UTF8, A>,
136
    A: Atomicity,
137
{
138
    /// Create a new incremental UTF-8 decoder.
139
    #[inline]
140
0
    pub fn new(inner_sink: Sink) -> Self {
141
0
        Utf8LossyDecoder {
142
0
            inner_sink: inner_sink,
143
0
            incomplete: None,
144
0
            marker: PhantomData,
145
0
        }
146
0
    }
147
}
148
149
impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A>
150
where
151
    Sink: TendrilSink<fmt::UTF8, A>,
152
    A: Atomicity,
153
{
154
    #[inline]
155
0
    fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) {
156
        // FIXME: remove take() and map() when non-lexical borrows are stable.
157
0
        if let Some(mut incomplete) = self.incomplete.take() {
158
0
            let resume_at = incomplete.try_complete(&t).map(|(result, rest)| {
159
0
                match result {
160
0
                    Ok(s) => self.inner_sink.process(Tendril::from_slice(s)),
161
0
                    Err(_) => {
162
0
                        self.inner_sink.error("invalid byte sequence".into());
163
0
                        self.inner_sink
164
0
                            .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
165
0
                    }
166
                }
167
0
                t.len() - rest.len()
168
0
            });
169
0
            match resume_at {
170
                None => {
171
0
                    self.incomplete = Some(incomplete);
172
0
                    return;
173
                }
174
0
                Some(resume_at) => t.pop_front(resume_at as u32),
175
            }
176
0
        }
177
0
        while !t.is_empty() {
178
0
            let unborrowed_result = match utf8::decode(&t) {
179
0
                Ok(s) => {
180
0
                    debug_assert!(s.as_ptr() == t.as_ptr());
181
0
                    debug_assert!(s.len() == t.len());
182
0
                    Ok(())
183
                }
184
                Err(utf8::DecodeError::Invalid {
185
0
                    valid_prefix,
186
0
                    invalid_sequence,
187
                    ..
188
                }) => {
189
0
                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
190
0
                    debug_assert!(valid_prefix.len() <= t.len());
191
0
                    Err((
192
0
                        valid_prefix.len(),
193
0
                        Err(valid_prefix.len() + invalid_sequence.len()),
194
0
                    ))
195
                }
196
                Err(utf8::DecodeError::Incomplete {
197
0
                    valid_prefix,
198
0
                    incomplete_suffix,
199
                }) => {
200
0
                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
201
0
                    debug_assert!(valid_prefix.len() <= t.len());
202
0
                    Err((valid_prefix.len(), Ok(incomplete_suffix)))
203
                }
204
            };
205
0
            match unborrowed_result {
206
                Ok(()) => {
207
0
                    unsafe { self.inner_sink.process(t.reinterpret_without_validating()) }
208
0
                    return;
209
                }
210
0
                Err((valid_len, and_then)) => {
211
0
                    if valid_len > 0 {
212
0
                        let subtendril = t.subtendril(0, valid_len as u32);
213
                        unsafe {
214
0
                            self.inner_sink
215
0
                                .process(subtendril.reinterpret_without_validating())
216
                        }
217
0
                    }
218
0
                    match and_then {
219
0
                        Ok(incomplete) => {
220
0
                            self.incomplete = Some(incomplete);
221
0
                            return;
222
                        }
223
0
                        Err(offset) => {
224
0
                            self.inner_sink.error("invalid byte sequence".into());
225
0
                            self.inner_sink
226
0
                                .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
227
0
                            t.pop_front(offset as u32);
228
0
                        }
229
                    }
230
                }
231
            }
232
        }
233
0
    }
234
235
    #[inline]
236
0
    fn error(&mut self, desc: Cow<'static, str>) {
237
0
        self.inner_sink.error(desc);
238
0
    }
239
240
    type Output = Sink::Output;
241
242
    #[inline]
243
0
    fn finish(mut self) -> Sink::Output {
244
0
        if self.incomplete.is_some() {
245
0
            self.inner_sink
246
0
                .error("incomplete byte sequence at end of stream".into());
247
0
            self.inner_sink
248
0
                .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
249
0
        }
250
0
        self.inner_sink.finish()
251
0
    }
252
}
253
254
/// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding,
255
/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters,
256
/// and emits Unicode (`StrTendril`).
257
///
258
/// This allocates new tendrils for encodings other than UTF-8.
259
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
260
pub struct LossyDecoder<Sink, A = NonAtomic>
261
where
262
    Sink: TendrilSink<fmt::UTF8, A>,
263
    A: Atomicity,
264
{
265
    inner: LossyDecoderInner<Sink, A>,
266
}
267
268
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
269
enum LossyDecoderInner<Sink, A>
270
where
271
    Sink: TendrilSink<fmt::UTF8, A>,
272
    A: Atomicity,
273
{
274
    Utf8(Utf8LossyDecoder<Sink, A>),
275
    #[cfg(feature = "encoding")]
276
    Encoding(Box<encoding::RawDecoder>, Sink),
277
    #[cfg(feature = "encoding_rs")]
278
    EncodingRs(encoding_rs::Decoder, Sink),
279
}
280
281
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
282
impl<Sink, A> LossyDecoder<Sink, A>
283
where
284
    Sink: TendrilSink<fmt::UTF8, A>,
285
    A: Atomicity,
286
{
287
    /// Create a new incremental decoder using the encoding crate.
288
    #[cfg(feature = "encoding")]
289
    #[inline]
290
    pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self {
291
        if encoding.name() == "utf-8" {
292
            LossyDecoder::utf8(sink)
293
        } else {
294
            LossyDecoder {
295
                inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink),
296
            }
297
        }
298
    }
299
300
    /// Create a new incremental decoder using the encoding_rs crate.
301
    #[cfg(feature = "encoding_rs")]
302
    #[inline]
303
    pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self {
304
        if encoding == encoding_rs::UTF_8 {
305
            return Self::utf8(sink);
306
        }
307
        Self {
308
            inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink),
309
        }
310
    }
311
312
    /// Create a new incremental decoder for the UTF-8 encoding.
313
    ///
314
    /// This is useful for content that is known at run-time to be UTF-8
315
    /// (whereas `Utf8LossyDecoder` requires knowing at compile-time.)
316
    #[inline]
317
    pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> {
318
        LossyDecoder {
319
            inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)),
320
        }
321
    }
322
323
    /// Give a reference to the inner sink.
324
    pub fn inner_sink(&self) -> &Sink {
325
        match self.inner {
326
            LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink,
327
            #[cfg(feature = "encoding")]
328
            LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink,
329
            #[cfg(feature = "encoding_rs")]
330
            LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink,
331
        }
332
    }
333
334
    /// Give a mutable reference to the inner sink.
335
    pub fn inner_sink_mut(&mut self) -> &mut Sink {
336
        match self.inner {
337
            LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink,
338
            #[cfg(feature = "encoding")]
339
            LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink,
340
            #[cfg(feature = "encoding_rs")]
341
            LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink,
342
        }
343
    }
344
}
345
346
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
347
impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A>
348
where
349
    Sink: TendrilSink<fmt::UTF8, A>,
350
    A: Atomicity,
351
{
352
    #[inline]
353
    fn process(&mut self, t: Tendril<fmt::Bytes, A>) {
354
        match self.inner {
355
            LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t),
356
            #[cfg(feature = "encoding")]
357
            LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => {
358
                let mut out = Tendril::new();
359
                let mut t = t;
360
                loop {
361
                    match decoder.raw_feed(&*t, &mut out) {
362
                        (_, Some(err)) => {
363
                            out.push_char('\u{fffd}');
364
                            sink.error(err.cause);
365
                            debug_assert!(err.upto >= 0);
366
                            t.pop_front(err.upto as u32);
367
                            // continue loop and process remainder of t
368
                        }
369
                        (_, None) => break,
370
                    }
371
                }
372
                if out.len() > 0 {
373
                    sink.process(out);
374
                }
375
            }
376
            #[cfg(feature = "encoding_rs")]
377
            LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => {
378
                if t.is_empty() {
379
                    return;
380
                }
381
                decode_to_sink(t, decoder, sink, false);
382
            }
383
        }
384
    }
385
386
    #[inline]
387
    fn error(&mut self, desc: Cow<'static, str>) {
388
        match self.inner {
389
            LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc),
390
            #[cfg(feature = "encoding")]
391
            LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc),
392
            #[cfg(feature = "encoding_rs")]
393
            LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc),
394
        }
395
    }
396
397
    type Output = Sink::Output;
398
399
    #[inline]
400
    fn finish(self) -> Sink::Output {
401
        match self.inner {
402
            LossyDecoderInner::Utf8(utf8) => return utf8.finish(),
403
            #[cfg(feature = "encoding")]
404
            LossyDecoderInner::Encoding(mut decoder, mut sink) => {
405
                let mut out = Tendril::new();
406
                if let Some(err) = decoder.raw_finish(&mut out) {
407
                    out.push_char('\u{fffd}');
408
                    sink.error(err.cause);
409
                }
410
                if out.len() > 0 {
411
                    sink.process(out);
412
                }
413
                sink.finish()
414
            }
415
            #[cfg(feature = "encoding_rs")]
416
            LossyDecoderInner::EncodingRs(mut decoder, mut sink) => {
417
                decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true);
418
                sink.finish()
419
            }
420
        }
421
    }
422
}
423
424
#[cfg(feature = "encoding_rs")]
425
fn decode_to_sink<Sink, A>(
426
    mut t: Tendril<fmt::Bytes, A>,
427
    decoder: &mut encoding_rs::Decoder,
428
    sink: &mut Sink,
429
    last: bool,
430
) where
431
    Sink: TendrilSink<fmt::UTF8, A>,
432
    A: Atomicity,
433
{
434
    loop {
435
        let mut out = <Tendril<fmt::Bytes, A>>::new();
436
        let max_len = decoder
437
            .max_utf8_buffer_length_without_replacement(t.len())
438
            .unwrap_or(8192);
439
        unsafe {
440
            out.push_uninitialized(std::cmp::min(max_len as u32, 8192));
441
        }
442
        let (result, bytes_read, bytes_written) =
443
            decoder.decode_to_utf8_without_replacement(&t, &mut out, last);
444
        if bytes_written > 0 {
445
            sink.process(unsafe {
446
                out.subtendril(0, bytes_written as u32)
447
                    .reinterpret_without_validating()
448
            });
449
        }
450
        match result {
451
            DecoderResult::InputEmpty => return,
452
            DecoderResult::OutputFull => {}
453
            DecoderResult::Malformed(_, _) => {
454
                sink.error(Cow::Borrowed("invalid sequence"));
455
                sink.process("\u{FFFD}".into());
456
            }
457
        }
458
        t.pop_front(bytes_read as u32);
459
        if t.is_empty() {
460
            return;
461
        }
462
    }
463
}
464
465
#[cfg(test)]
466
mod test {
467
    use super::{TendrilSink, Utf8LossyDecoder};
468
    use fmt;
469
    use std::borrow::Cow;
470
    use tendril::{Atomicity, NonAtomic, Tendril};
471
472
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
473
    use super::LossyDecoder;
474
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
475
    use tendril::SliceExt;
476
477
    #[cfg(feature = "encoding")]
478
    use encoding::all as enc;
479
    #[cfg(feature = "encoding_rs")]
480
    use encoding_rs as enc_rs;
481
482
    struct Accumulate<A>
483
    where
484
        A: Atomicity,
485
    {
486
        tendrils: Vec<Tendril<fmt::UTF8, A>>,
487
        errors: Vec<String>,
488
    }
489
490
    impl<A> Accumulate<A>
491
    where
492
        A: Atomicity,
493
    {
494
        fn new() -> Accumulate<A> {
495
            Accumulate {
496
                tendrils: vec![],
497
                errors: vec![],
498
            }
499
        }
500
    }
501
502
    impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A>
503
    where
504
        A: Atomicity,
505
    {
506
        fn process(&mut self, t: Tendril<fmt::UTF8, A>) {
507
            self.tendrils.push(t);
508
        }
509
510
        fn error(&mut self, desc: Cow<'static, str>) {
511
            self.errors.push(desc.into_owned());
512
        }
513
514
        type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>);
515
516
        fn finish(self) -> Self::Output {
517
            (self.tendrils, self.errors)
518
        }
519
    }
520
521
    fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) {
522
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
523
        let (tendrils, errors) = decoder.from_iter(input.iter().cloned());
524
        assert_eq!(
525
            expected,
526
            &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>()
527
        );
528
        assert_eq!(errs, errors.len());
529
    }
530
531
    #[test]
532
    fn utf8() {
533
        check_utf8(&[], &[], 0);
534
        check_utf8(&[b""], &[], 0);
535
        check_utf8(&[b"xyz"], &["xyz"], 0);
536
        check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0);
537
538
        check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0);
539
        check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
540
        check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
541
        check_utf8(
542
            &[b"xy\xEA", b"\x99", b"\xAEzw"],
543
            &["xy", "\u{a66e}z", "w"],
544
            0,
545
        );
546
        check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0);
547
        check_utf8(
548
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
549
            &["\u{a66e}"],
550
            0,
551
        );
552
553
        check_utf8(
554
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
555
            &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"],
556
            4,
557
        );
558
        check_utf8(
559
            &[b"xy\xEA\x99", b"\xFFz"],
560
            &["xy", "\u{fffd}", "\u{fffd}", "z"],
561
            2,
562
        );
563
564
        check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0);
565
        check_utf8(
566
            &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"],
567
            &["ő", "ő", "ő"],
568
            0,
569
        );
570
        check_utf8(
571
            &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"],
572
            &["ő", "ő", "ő"],
573
            0,
574
        );
575
        check_utf8(
576
            &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"],
577
            &["ő", "\u{fffd}", "\u{fffd}", "ő"],
578
            2,
579
        );
580
581
        // incomplete char at end of input
582
        check_utf8(&[b"\xC0"], &["\u{fffd}"], 1);
583
        check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1);
584
    }
585
586
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
587
    fn check_decode(
588
        mut decoder: LossyDecoder<Accumulate<NonAtomic>>,
589
        input: &[&[u8]],
590
        expected: &str,
591
        errs: usize,
592
    ) {
593
        for x in input {
594
            decoder.process(x.to_tendril());
595
        }
596
        let (tendrils, errors) = decoder.finish();
597
        let mut tendril: Tendril<fmt::UTF8> = Tendril::new();
598
        for t in tendrils {
599
            tendril.push_tendril(&t);
600
        }
601
        assert_eq!(expected, &*tendril);
602
        assert_eq!(errs, errors.len());
603
    }
604
605
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
606
    pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)];
607
608
    #[cfg(any(feature = "encoding"))]
609
    const ASCII: Tests = &[
610
        (&[], "", 0),
611
        (&[b""], "", 0),
612
        (&[b"xyz"], "xyz", 0),
613
        (&[b"xy", b"", b"", b"z"], "xyz", 0),
614
        (&[b"x", b"y", b"z"], "xyz", 0),
615
        (&[b"\xFF"], "\u{fffd}", 1),
616
        (&[b"x\xC0yz"], "x\u{fffd}yz", 1),
617
        (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1),
618
        (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3),
619
    ];
620
621
    #[cfg(feature = "encoding")]
622
    #[test]
623
    fn decode_ascii() {
624
        for &(input, expected, errs) in ASCII {
625
            let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new());
626
            check_decode(decoder, input, expected, errs);
627
        }
628
    }
629
630
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
631
    const UTF_8: Tests = &[
632
        (&[], "", 0),
633
        (&[b""], "", 0),
634
        (&[b"xyz"], "xyz", 0),
635
        (&[b"x", b"y", b"z"], "xyz", 0),
636
        (&[b"\xEA\x99\xAE"], "\u{a66e}", 0),
637
        (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0),
638
        (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0),
639
        (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0),
640
        (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0),
641
        (
642
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
643
            "\u{a66e}",
644
            0,
645
        ),
646
        (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0),
647
        (
648
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
649
            "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z",
650
            4,
651
        ),
652
        (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2),
653
        // incomplete char at end of input
654
        (&[b"\xC0"], "\u{fffd}", 1),
655
        (&[b"\xEA\x99"], "\u{fffd}", 1),
656
    ];
657
658
    #[cfg(feature = "encoding")]
659
    #[test]
660
    fn decode_utf8() {
661
        for &(input, expected, errs) in UTF_8 {
662
            let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new());
663
            check_decode(decoder, input, expected, errs);
664
        }
665
    }
666
667
    #[cfg(feature = "encoding_rs")]
668
    #[test]
669
    fn decode_utf8_encoding_rs() {
670
        for &(input, expected, errs) in UTF_8 {
671
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new());
672
            check_decode(decoder, input, expected, errs);
673
        }
674
    }
675
676
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
677
    const KOI8_U: Tests = &[
678
        (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
679
        (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
680
        (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0),
681
        (
682
            &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""],
683
            "Энергия",
684
            0,
685
        ),
686
    ];
687
688
    #[cfg(feature = "encoding")]
689
    #[test]
690
    fn decode_koi8_u() {
691
        for &(input, expected, errs) in KOI8_U {
692
            let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new());
693
            check_decode(decoder, input, expected, errs);
694
        }
695
    }
696
697
    #[cfg(feature = "encoding_rs")]
698
    #[test]
699
    fn decode_koi8_u_encoding_rs() {
700
        for &(input, expected, errs) in KOI8_U {
701
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new());
702
            check_decode(decoder, input, expected, errs);
703
        }
704
    }
705
706
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
707
    const WINDOWS_949: Tests = &[
708
        (&[], "", 0),
709
        (&[b""], "", 0),
710
        (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0),
711
        (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0),
712
        (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0),
713
        (
714
            &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"],
715
            "안녕하세요",
716
            0,
717
        ),
718
        (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1),
719
        (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1),
720
        (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1),
721
    ];
722
723
    #[cfg(feature = "encoding")]
724
    #[test]
725
    fn decode_windows_949() {
726
        for &(input, expected, errs) in WINDOWS_949 {
727
            let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new());
728
            check_decode(decoder, input, expected, errs);
729
        }
730
    }
731
732
    #[cfg(feature = "encoding_rs")]
733
    #[test]
734
    fn decode_windows_949_encoding_rs() {
735
        for &(input, expected, errs) in WINDOWS_949 {
736
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new());
737
            check_decode(decoder, input, expected, errs);
738
        }
739
    }
740
741
    #[test]
742
    fn read_from() {
743
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
744
        let mut bytes: &[u8] = b"foo\xffbar";
745
        let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap();
746
        assert_eq!(
747
            &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>(),
748
            &["foo", "\u{FFFD}", "bar"]
749
        );
750
        assert_eq!(errors, &["invalid byte sequence"]);
751
    }
752
}