/rust/registry/src/index.crates.io-1949cf8c6b5b557f/potential_utf-0.1.3/src/uchar.rs
Line | Count | Source |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | use core::cmp::Ordering; |
6 | | use core::fmt; |
7 | | |
/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// The value is stored as three bytes in little-endian order, so any 24-bit pattern
/// (including surrogates and values above `0x10FFFF`) can be held by this type.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('👋')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct PotentialCodePoint([u8; 3]);
36 | | |
37 | | impl PotentialCodePoint { |
38 | | /// Create a [`PotentialCodePoint`] from a `char`. |
39 | | /// |
40 | | /// # Examples |
41 | | /// |
42 | | /// ``` |
43 | | /// use potential_utf::PotentialCodePoint; |
44 | | /// |
45 | | /// let a = PotentialCodePoint::from_char('a'); |
46 | | /// assert_eq!(a.try_to_char().unwrap(), 'a'); |
47 | | /// ``` |
48 | | #[inline] |
49 | 0 | pub const fn from_char(c: char) -> Self { |
50 | 0 | let [u0, u1, u2, _u3] = (c as u32).to_le_bytes(); |
51 | 0 | Self([u0, u1, u2]) |
52 | 0 | } |
53 | | |
54 | | /// Create [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits. |
55 | | #[inline] |
56 | 0 | pub const fn from_u24(c: u32) -> Self { |
57 | 0 | let [u0, u1, u2, _u3] = c.to_le_bytes(); |
58 | 0 | Self([u0, u1, u2]) |
59 | 0 | } |
60 | | |
61 | | /// Attempt to convert a [`PotentialCodePoint`] to a `char`. |
62 | | /// |
63 | | /// # Examples |
64 | | /// |
65 | | /// ``` |
66 | | /// use potential_utf::PotentialCodePoint; |
67 | | /// use zerovec::ule::AsULE; |
68 | | /// |
69 | | /// let a = PotentialCodePoint::from_char('a'); |
70 | | /// assert_eq!(a.try_to_char(), Ok('a')); |
71 | | /// |
72 | | /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into()); |
73 | | /// assert!(matches!(b.try_to_char(), Err(_))); |
74 | | /// ``` |
75 | | #[inline] |
76 | 0 | pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> { |
77 | 0 | char::try_from(u32::from(self)) |
78 | 0 | } |
79 | | |
80 | | /// Convert a [`PotentialCodePoint`] to a `char', returning [`char::REPLACEMENT_CHARACTER`] |
81 | | /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value. |
82 | | /// |
83 | | /// # Examples |
84 | | /// |
85 | | /// ``` |
86 | | /// use potential_utf::PotentialCodePoint; |
87 | | /// use zerovec::ule::AsULE; |
88 | | /// |
89 | | /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into()); |
90 | | /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER); |
91 | | /// ``` |
92 | | #[inline] |
93 | 0 | pub fn to_char_lossy(self) -> char { |
94 | 0 | self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER) |
95 | 0 | } |
96 | | |
97 | | /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is |
98 | | /// a valid Unicode scalar value. |
99 | | /// |
100 | | /// # Safety |
101 | | /// |
102 | | /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order. |
103 | | /// |
104 | | /// # Examples |
105 | | /// |
106 | | /// ``` |
107 | | /// use potential_utf::PotentialCodePoint; |
108 | | /// |
109 | | /// let a = PotentialCodePoint::from_char('a'); |
110 | | /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a'); |
111 | | /// ``` |
112 | | #[inline] |
113 | 0 | pub unsafe fn to_char_unchecked(self) -> char { |
114 | 0 | char::from_u32_unchecked(u32::from(self)) |
115 | 0 | } |
116 | | |
117 | | /// For converting to the ULE type in a const context |
118 | | /// |
119 | | /// Can be removed once const traits are a thing |
120 | | #[inline] |
121 | | #[cfg(feature = "zerovec")] |
122 | 0 | pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> { |
123 | 0 | zerovec::ule::RawBytesULE(self.0) |
124 | 0 | } |
125 | | } |
126 | | |
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    type ULE = zerovec::ule::RawBytesULE<3>;

    /// Wraps the three stored bytes, unchanged, in the unaligned representation.
    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        let Self(bytes) = self;
        zerovec::ule::RawBytesULE(bytes)
    }

    /// Rebuilds the value from the three bytes of its unaligned representation.
    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        let zerovec::ule::RawBytesULE(bytes) = unaligned;
        Self(bytes)
    }
}
142 | | |
// Safety: PotentialCodePoint is a `repr(transparent)` wrapper around the three
// little-endian bytes of its value (which need not be a valid `char`), and its
// `AsULE` conversions move those bytes unchanged into and out of `RawBytesULE<3>`.
// NOTE(review): this relies on `RawBytesULE<3>` having the same layout as `[u8; 3]` —
// confirm against the zerovec documentation.
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}
148 | | |
149 | | impl fmt::Debug for PotentialCodePoint { |
150 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
151 | | // Debug as a char if possible |
152 | 0 | match self.try_to_char() { |
153 | 0 | Ok(c) => fmt::Debug::fmt(&c, f), |
154 | 0 | Err(_) => fmt::Debug::fmt(&self.0, f), |
155 | | } |
156 | 0 | } |
157 | | } |
158 | | |
impl PartialOrd for PotentialCodePoint {
    /// Delegates to [`Ord::cmp`], so the partial order always agrees with the total order.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
164 | | |
165 | | impl PartialEq<char> for PotentialCodePoint { |
166 | 0 | fn eq(&self, other: &char) -> bool { |
167 | 0 | self.eq(&Self::from_char(*other)) |
168 | 0 | } |
169 | | } |
170 | | |
171 | | impl PartialOrd<char> for PotentialCodePoint { |
172 | 0 | fn partial_cmp(&self, other: &char) -> Option<Ordering> { |
173 | 0 | self.partial_cmp(&Self::from_char(*other)) |
174 | 0 | } |
175 | | } |
176 | | |
177 | | impl PartialEq<PotentialCodePoint> for char { |
178 | 0 | fn eq(&self, other: &PotentialCodePoint) -> bool { |
179 | 0 | PotentialCodePoint::from_char(*self).eq(other) |
180 | 0 | } |
181 | | } |
182 | | |
183 | | impl PartialOrd<PotentialCodePoint> for char { |
184 | 0 | fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> { |
185 | 0 | PotentialCodePoint::from_char(*self).partial_cmp(other) |
186 | 0 | } |
187 | | } |
188 | | |
189 | | impl Ord for PotentialCodePoint { |
190 | | // custom implementation, as derived Ord would compare lexicographically |
191 | 0 | fn cmp(&self, other: &Self) -> Ordering { |
192 | 0 | let a = u32::from(*self); |
193 | 0 | let b = u32::from(*other); |
194 | 0 | a.cmp(&b) |
195 | 0 | } |
196 | | } |
197 | | |
198 | | impl From<PotentialCodePoint> for u32 { |
199 | 0 | fn from(x: PotentialCodePoint) -> Self { |
200 | 0 | let [a0, a1, a2] = x.0; |
201 | 0 | u32::from_le_bytes([a0, a1, a2, 0]) |
202 | 0 | } |
203 | | } |
204 | | |
205 | | impl TryFrom<u32> for PotentialCodePoint { |
206 | | type Error = (); |
207 | 0 | fn try_from(x: u32) -> Result<Self, ()> { |
208 | 0 | let [u0, u1, u2, u3] = x.to_le_bytes(); |
209 | 0 | if u3 != 0 { |
210 | 0 | return Err(()); |
211 | 0 | } |
212 | 0 | Ok(Self([u0, u1, u2])) |
213 | 0 | } |
214 | | } |
215 | | |
216 | | impl From<char> for PotentialCodePoint { |
217 | | #[inline] |
218 | 0 | fn from(value: char) -> Self { |
219 | 0 | Self::from_char(value) |
220 | 0 | } |
221 | | } |
222 | | |
223 | | impl TryFrom<PotentialCodePoint> for char { |
224 | | type Error = core::char::CharTryFromError; |
225 | | |
226 | | #[inline] |
227 | 0 | fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> { |
228 | 0 | value.try_to_char() |
229 | 0 | } |
230 | | } |
231 | | |
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialCodePoint {
    /// Serializes as a `char` for human-readable formats and as the three raw bytes
    /// otherwise. Values that are not valid scalar values are rejected in both cases.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        // Validation happens before the format is consulted, so it applies to every format.
        match self.try_to_char() {
            Ok(c) if serializer.is_human_readable() => serializer.serialize_char(c),
            Ok(_) => self.0.serialize(serializer),
            Err(_) => Err(S::Error::custom(
                "invalid Unicode scalar value in PotentialCodePoint",
            )),
        }
    }
}
250 | | |
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for PotentialCodePoint {
    /// Reads a `char` from human-readable formats and three raw bytes from other formats.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            // The raw bytes are stored as-is; no scalar-value validation is performed here.
            return <[u8; 3]>::deserialize(deserializer).map(PotentialCodePoint);
        }
        <char>::deserialize(deserializer).map(PotentialCodePoint::from_char)
    }
}
267 | | |
/// This impl requires enabling the optional `databake` Cargo feature
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    /// Emits a `from_char` constructor call for valid scalar values and a `from_u24`
    /// call for everything else.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both output forms reference this crate, so register it up front.
        env.insert("potential_utf");
        if let Ok(ch) = self.try_to_char() {
            let ch = ch.bake(env);
            return databake::quote! {
                potential_utf::PotentialCodePoint::from_char(#ch)
            };
        }
        let [b0, b1, b2] = self.0;
        let u24 = u32::from_le_bytes([b0, b1, b2, 0]);
        databake::quote! {
            potential_utf::PotentialCodePoint::from_u24(#u24)
        }
    }
}
290 | | |
#[cfg(test)]
mod test {
    use super::*;
    use zerovec::ZeroVec;

    /// Invalid scalar values must be rejected by both text and binary serializers.
    #[test]
    fn test_serde_fail() {
        let uc = PotentialCodePoint([0xFF; 3]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }

    /// Human-readable formats round-trip the value as a character.
    #[test]
    fn test_serde_json() {
        let uc = PotentialCodePoint::from_char('🙃');

        let json_ser = serde_json::to_string(&uc).unwrap();
        assert_eq!(json_ser, r#""🙃""#);

        let json_de: PotentialCodePoint = serde_json::from_str(&json_ser).unwrap();
        assert_eq!(uc, json_de);
    }

    /// Binary formats round-trip the value as its three little-endian bytes.
    #[test]
    fn test_serde_bincode() {
        let uc = PotentialCodePoint::from_char('🙃');

        let bytes_ser = bincode::serialize(&uc).unwrap();
        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);

        let bytes_de: PotentialCodePoint = bincode::deserialize(&bytes_ser).unwrap();
        assert_eq!(uc, bytes_de);
    }

    /// The in-memory bytes of a slice of values must match the ULE byte representation.
    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];

        // backed by [PotentialCodePoint]
        let uvchars: Vec<PotentialCodePoint> = chars
            .iter()
            .map(|&c| PotentialCodePoint::from_char(c))
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.iter().copied().collect();

        let ule_bytes = zvec.as_bytes();
        // View the same elements as raw bytes, using the ULE buffer's length as the byte count.
        let uvbytes =
            unsafe { core::slice::from_raw_parts(uvchars.as_ptr().cast::<u8>(), ule_bytes.len()) };

        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);

        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    /// Baking emits `from_char` for valid scalar values and `from_u24` otherwise.
    #[test]
    fn test_char_bake() {
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}