/rust/registry/src/index.crates.io-1949cf8c6b5b557f/icu_capi-1.5.1/src/utf.rs
Line | Count | Source |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | use alloc::borrow::Cow; |
6 | | |
7 | | use core::fmt::Write; |
8 | | use writeable::{LengthHint, Part, TryWriteable, Writeable}; |
9 | | |
10 | | #[allow(dead_code)] |
11 | | pub(crate) struct LossyWrap<T>(pub T); |
12 | | |
13 | | impl<T: TryWriteable> Writeable for LossyWrap<T> { |
14 | 0 | fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result { |
15 | 0 | let _ = self.0.try_write_to(sink)?; |
16 | 0 | Ok(()) |
17 | 0 | } Unexecuted instantiation: <icu_capi::utf::LossyWrap<icu_capi::utf::PotentiallyInvalidUtf8> as writeable::Writeable>::write_to::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut diplomat_runtime::writeable::DiplomatWriteable>> Unexecuted instantiation: <icu_capi::utf::LossyWrap<icu_capi::utf::PotentiallyInvalidUtf8> as writeable::Writeable>::write_to::<<regex_automata::dfa::sparse::DFA<_> as icu_list::lazy_automaton::LazyAutomaton>::matches_earliest_fwd_lazy::DFAStepper> Unexecuted instantiation: <icu_capi::utf::LossyWrap<icu_capi::utf::PotentiallyInvalidUtf16> as writeable::Writeable>::write_to::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut diplomat_runtime::writeable::DiplomatWriteable>> Unexecuted instantiation: <icu_capi::utf::LossyWrap<icu_capi::utf::PotentiallyInvalidUtf16> as writeable::Writeable>::write_to::<<regex_automata::dfa::sparse::DFA<_> as icu_list::lazy_automaton::LazyAutomaton>::matches_earliest_fwd_lazy::DFAStepper> |
18 | | |
19 | 0 | fn writeable_length_hint(&self) -> LengthHint { |
20 | 0 | self.0.writeable_length_hint() |
21 | 0 | } |
22 | | } |
23 | | |
24 | | use core::{char::DecodeUtf16Error, fmt, str::Utf8Error}; |
25 | | |
26 | | /// Implements [`Writeable`] for [`&[u8]`] according to the [WHATWG Encoding Standard]( |
27 | | /// https://encoding.spec.whatwg.org/#utf-8-decoder). |
28 | | #[derive(Debug)] |
29 | | #[allow(clippy::exhaustive_structs)] // newtype |
30 | | pub struct PotentiallyInvalidUtf8<'a>(pub &'a [u8]); |
31 | | |
32 | | impl TryWriteable for PotentiallyInvalidUtf8<'_> { |
33 | | type Error = Utf8Error; |
34 | | |
35 | 0 | fn try_write_to_parts<S: writeable::PartsWrite + ?Sized>( |
36 | 0 | &self, |
37 | 0 | sink: &mut S, |
38 | 0 | ) -> Result<Result<(), Self::Error>, fmt::Error> { |
39 | 0 | let mut remaining = self.0; |
40 | 0 | let mut r = Ok(()); |
41 | | loop { |
42 | 0 | match core::str::from_utf8(remaining) { |
43 | 0 | Ok(valid) => { |
44 | 0 | sink.write_str(valid)?; |
45 | 0 | return Ok(r); |
46 | | } |
47 | 0 | Err(e) => { |
48 | | // SAFETY: By Utf8Error invariants |
49 | 0 | let valid = unsafe { |
50 | 0 | core::str::from_utf8_unchecked(remaining.get_unchecked(..e.valid_up_to())) |
51 | | }; |
52 | 0 | sink.write_str(valid)?; |
53 | 0 | sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?; Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf8 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut diplomat_runtime::writeable::DiplomatWriteable>>>::{closure#0}Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf8 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut alloc::string::String>>::{closure#0}Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf8 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut <regex_automata::dfa::sparse::DFA<_> as icu_list::lazy_automaton::LazyAutomaton>::matches_earliest_fwd_lazy::DFAStepper>>::{closure#0} |
54 | 0 | if r.is_ok() { |
55 | 0 | r = Err(e); |
56 | 0 | } |
57 | 0 | let Some(error_len) = e.error_len() else { |
58 | 0 | return Ok(r); // end of string |
59 | | }; |
60 | | // SAFETY: By Utf8Error invariants |
61 | 0 | remaining = unsafe { remaining.get_unchecked(e.valid_up_to() + error_len..) } |
62 | | } |
63 | | } |
64 | | } |
65 | 0 | } Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf8 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut diplomat_runtime::writeable::DiplomatWriteable>>> Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf8 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut alloc::string::String>> Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf8 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut <regex_automata::dfa::sparse::DFA<_> as icu_list::lazy_automaton::LazyAutomaton>::matches_earliest_fwd_lazy::DFAStepper>> |
66 | | |
67 | 0 | fn writeable_length_hint(&self) -> writeable::LengthHint { |
68 | | // Lower bound is all valid UTF-8, upper bound is all bytes with the high bit, which become replacement characters. |
69 | 0 | LengthHint::between(self.0.len(), self.0.len() * 3) |
70 | 0 | } |
71 | | |
72 | 0 | fn try_write_to_string(&self) -> Result<Cow<str>, (Self::Error, Cow<str>)> { |
73 | 0 | match core::str::from_utf8(self.0) { |
74 | 0 | Ok(valid) => Ok(Cow::Borrowed(valid)), |
75 | 0 | Err(e) => { |
76 | | // SAFETY: By Utf8Error invariants |
77 | 0 | let valid = unsafe { |
78 | 0 | core::str::from_utf8_unchecked(self.0.get_unchecked(..e.valid_up_to())) |
79 | | }; |
80 | | |
81 | | // Let's assume this is the only error |
82 | 0 | let mut out = alloc::string::String::with_capacity( |
83 | 0 | self.0.len() + char::REPLACEMENT_CHARACTER.len_utf8() |
84 | 0 | - e.error_len().unwrap_or(0), |
85 | | ); |
86 | | |
87 | 0 | out.push_str(valid); |
88 | 0 | out.push(char::REPLACEMENT_CHARACTER); |
89 | | |
90 | | // If there's more, we can use `try_write_to` |
91 | 0 | if let Some(error_len) = e.error_len() { |
92 | 0 | // SAFETY: By Utf8Error invariants |
93 | 0 | let remaining = unsafe { self.0.get_unchecked(e.valid_up_to() + error_len..) }; |
94 | 0 | let _discard = Self(remaining).try_write_to(&mut out); |
95 | 0 | } |
96 | | |
97 | 0 | Err((e, Cow::Owned(out))) |
98 | | } |
99 | | } |
100 | 0 | } |
101 | | } |
102 | | |
103 | | /// Implements [`Writeable`] for [`&[u16]`] according to the [WHATWG Encoding Standard]( |
104 | | /// https://encoding.spec.whatwg.org/#shared-utf-16-decoder). |
105 | | #[derive(Debug)] |
106 | | #[allow(clippy::exhaustive_structs)] // newtype |
107 | | pub struct PotentiallyInvalidUtf16<'a>(pub &'a [u16]); |
108 | | |
109 | | impl TryWriteable for PotentiallyInvalidUtf16<'_> { |
110 | | type Error = DecodeUtf16Error; |
111 | | |
112 | 0 | fn try_write_to_parts<S: writeable::PartsWrite + ?Sized>( |
113 | 0 | &self, |
114 | 0 | sink: &mut S, |
115 | 0 | ) -> Result<Result<(), Self::Error>, fmt::Error> { |
116 | 0 | let mut r = Ok(()); |
117 | 0 | for c in core::char::decode_utf16(self.0.iter().copied()) { |
118 | 0 | match c { |
119 | 0 | Ok(c) => sink.write_char(c)?, |
120 | 0 | Err(e) => { |
121 | 0 | if r.is_ok() { |
122 | 0 | r = Err(e); |
123 | 0 | } |
124 | 0 | sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?; Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf16 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut diplomat_runtime::writeable::DiplomatWriteable>>>::{closure#0}Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf16 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut <regex_automata::dfa::sparse::DFA<_> as icu_list::lazy_automaton::LazyAutomaton>::matches_earliest_fwd_lazy::DFAStepper>>::{closure#0} |
125 | | } |
126 | | } |
127 | | } |
128 | 0 | Ok(r) |
129 | 0 | } Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf16 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut diplomat_runtime::writeable::DiplomatWriteable>>> Unexecuted instantiation: <icu_capi::utf::PotentiallyInvalidUtf16 as writeable::try_writeable::TryWriteable>::try_write_to_parts::<writeable::parts_write_adapter::CoreWriteAsPartsWrite<&mut <regex_automata::dfa::sparse::DFA<_> as icu_list::lazy_automaton::LazyAutomaton>::matches_earliest_fwd_lazy::DFAStepper>> |
130 | | |
131 | 0 | fn writeable_length_hint(&self) -> LengthHint { |
132 | | // Lower bound is all ASCII, upper bound is all 3-byte code points (including replacement character) |
133 | 0 | LengthHint::between(self.0.len(), self.0.len() * 3) |
134 | 0 | } |
135 | | } |
136 | | |
#[cfg(test)]
mod test {
    #![allow(invalid_from_utf8)] // only way to construct the error
    use super::*;
    use writeable::assert_try_writeable_parts_eq;

    /// Checks UTF-8 output, returned error, and `Part::ERROR` ranges for valid input,
    /// one invalid byte, and two invalid bytes.
    #[test]
    fn test_utf8() {
        // Entirely valid: text passes through, no error, no parts.
        let clean: &[u8] = b"Foo Bar";
        assert_try_writeable_parts_eq!(PotentiallyInvalidUtf8(clean), "Foo Bar", Ok(()), []);

        // One invalid byte in the middle.
        let one_bad: &[u8] = b"Foo\xFDBar";
        let one_bad_err = core::str::from_utf8(one_bad).unwrap_err();
        assert_try_writeable_parts_eq!(
            PotentiallyInvalidUtf8(one_bad),
            "Foo�Bar",
            Err(one_bad_err),
            [(3, 6, Part::ERROR)]
        );

        // Two invalid bytes; the reported error is the one from the whole input.
        let two_bad: &[u8] = b"Foo\xFDBar\xff";
        let two_bad_err = core::str::from_utf8(two_bad).unwrap_err();
        assert_try_writeable_parts_eq!(
            PotentiallyInvalidUtf8(two_bad),
            "Foo�Bar�",
            Err(two_bad_err),
            [(3, 6, Part::ERROR), (9, 12, Part::ERROR)],
        );
    }

    /// Checks UTF-16 output, returned error, and `Part::ERROR` ranges for a valid
    /// surrogate pair and for a pair split by another code unit.
    #[test]
    fn test_utf16() {
        // A well-formed surrogate pair decodes to a single character.
        let paired: &[u16] = &[0xD83E, 0xDD73];
        assert_try_writeable_parts_eq!(PotentiallyInvalidUtf16(paired), "🥳", Ok(()), []);

        // The same surrogates separated by a space are both unpaired.
        let split: &[u16] = &[0xD83E, 0x20, 0xDD73];
        let lone_high_err = core::char::decode_utf16(core::iter::once(0xD83E))
            .next()
            .unwrap()
            .unwrap_err();
        assert_try_writeable_parts_eq!(
            PotentiallyInvalidUtf16(split),
            "� �",
            Err(lone_high_err),
            [(0, 3, Part::ERROR), (4, 7, Part::ERROR)]
        );
    }
}