Coverage Report

Created: 2025-10-13 06:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/potential_utf-0.1.3/src/uchar.rs
Line
Count
Source
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5
use core::cmp::Ordering;
6
use core::fmt;
7
8
/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not
9
/// validated as such.
10
///
11
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
12
/// Unicode scalar values, but you want control over when or if you validate that assumption.
13
///
14
/// # Examples
15
///
16
/// ```
17
/// use potential_utf::PotentialCodePoint;
18
///
19
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
20
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
21
/// assert_eq!(
22
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
23
///     Ok('👋')
24
/// );
25
///
26
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
27
/// assert_eq!(
28
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
29
///     char::REPLACEMENT_CHARACTER
30
/// );
31
/// ```
32
#[repr(transparent)]
33
#[allow(clippy::exhaustive_structs)] // transparent newtype
34
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
35
pub struct PotentialCodePoint([u8; 3]);
36
37
impl PotentialCodePoint {
38
    /// Create a [`PotentialCodePoint`] from a `char`.
39
    ///
40
    /// # Examples
41
    ///
42
    /// ```
43
    /// use potential_utf::PotentialCodePoint;
44
    ///
45
    /// let a = PotentialCodePoint::from_char('a');
46
    /// assert_eq!(a.try_to_char().unwrap(), 'a');
47
    /// ```
48
    #[inline]
49
0
    pub const fn from_char(c: char) -> Self {
50
0
        let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
51
0
        Self([u0, u1, u2])
52
0
    }
53
54
    /// Create a [`PotentialCodePoint`] from a `u32` value, ignoring the most significant 8 bits.
55
    #[inline]
56
0
    pub const fn from_u24(c: u32) -> Self {
57
0
        let [u0, u1, u2, _u3] = c.to_le_bytes();
58
0
        Self([u0, u1, u2])
59
0
    }
60
61
    /// Attempt to convert a [`PotentialCodePoint`] to a `char`.
62
    ///
63
    /// # Examples
64
    ///
65
    /// ```
66
    /// use potential_utf::PotentialCodePoint;
67
    /// use zerovec::ule::AsULE;
68
    ///
69
    /// let a = PotentialCodePoint::from_char('a');
70
    /// assert_eq!(a.try_to_char(), Ok('a'));
71
    ///
72
    /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
73
    /// assert!(matches!(b.try_to_char(), Err(_)));
74
    /// ```
75
    #[inline]
76
0
    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
77
0
        char::try_from(u32::from(self))
78
0
    }
79
80
    /// Convert a [`PotentialCodePoint`] to a `char`, returning [`char::REPLACEMENT_CHARACTER`]
81
    /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value.
82
    ///
83
    /// # Examples
84
    ///
85
    /// ```
86
    /// use potential_utf::PotentialCodePoint;
87
    /// use zerovec::ule::AsULE;
88
    ///
89
    /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
90
    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
91
    /// ```
92
    #[inline]
93
0
    pub fn to_char_lossy(self) -> char {
94
0
        self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
95
0
    }
96
97
    /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is
98
    /// a valid Unicode scalar value.
99
    ///
100
    /// # Safety
101
    ///
102
    /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order.
103
    ///
104
    /// # Examples
105
    ///
106
    /// ```
107
    /// use potential_utf::PotentialCodePoint;
108
    ///
109
    /// let a = PotentialCodePoint::from_char('a');
110
    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
111
    /// ```
112
    #[inline]
113
0
    pub unsafe fn to_char_unchecked(self) -> char {
114
0
        char::from_u32_unchecked(u32::from(self))
115
0
    }
116
117
    /// For converting to the ULE type in a const context
118
    ///
119
    /// Can be removed once const traits are a thing
120
    #[inline]
121
    #[cfg(feature = "zerovec")]
122
0
    pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> {
123
0
        zerovec::ule::RawBytesULE(self.0)
124
0
    }
125
}
126
127
/// This impl requires enabling the optional `zerovec` Cargo feature
128
#[cfg(feature = "zerovec")]
129
impl zerovec::ule::AsULE for PotentialCodePoint {
130
    type ULE = zerovec::ule::RawBytesULE<3>;
131
132
    #[inline]
133
0
    fn to_unaligned(self) -> Self::ULE {
134
0
        zerovec::ule::RawBytesULE(self.0)
135
0
    }
136
137
    #[inline]
138
0
    fn from_unaligned(unaligned: Self::ULE) -> Self {
139
0
        Self(unaligned.0)
140
0
    }
Unexecuted instantiation: <potential_utf::uchar::PotentialCodePoint as zerovec::ule::AsULE>::from_unaligned
Unexecuted instantiation: <potential_utf::uchar::PotentialCodePoint as zerovec::ule::AsULE>::from_unaligned
141
}
142
143
// Safety: PotentialCodePoint always stores the three little-endian bytes of a 24-bit value,
144
// which corresponds to its AsULE::ULE type
145
/// This impl requires enabling the optional `zerovec` Cargo feature
146
#[cfg(feature = "zerovec")]
147
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}
148
149
impl fmt::Debug for PotentialCodePoint {
150
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
151
        // Debug as a char if possible
152
0
        match self.try_to_char() {
153
0
            Ok(c) => fmt::Debug::fmt(&c, f),
154
0
            Err(_) => fmt::Debug::fmt(&self.0, f),
155
        }
156
0
    }
157
}
158
159
impl PartialOrd for PotentialCodePoint {
160
0
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
161
0
        Some(self.cmp(other))
162
0
    }
163
}
164
165
impl PartialEq<char> for PotentialCodePoint {
166
0
    fn eq(&self, other: &char) -> bool {
167
0
        self.eq(&Self::from_char(*other))
168
0
    }
169
}
170
171
impl PartialOrd<char> for PotentialCodePoint {
172
0
    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
173
0
        self.partial_cmp(&Self::from_char(*other))
174
0
    }
175
}
176
177
impl PartialEq<PotentialCodePoint> for char {
178
0
    fn eq(&self, other: &PotentialCodePoint) -> bool {
179
0
        PotentialCodePoint::from_char(*self).eq(other)
180
0
    }
181
}
182
183
impl PartialOrd<PotentialCodePoint> for char {
184
0
    fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> {
185
0
        PotentialCodePoint::from_char(*self).partial_cmp(other)
186
0
    }
187
}
188
189
impl Ord for PotentialCodePoint {
190
    // Custom implementation: a derived `Ord` would compare the little-endian bytes lexicographically, which does not match numeric order.
191
0
    fn cmp(&self, other: &Self) -> Ordering {
192
0
        let a = u32::from(*self);
193
0
        let b = u32::from(*other);
194
0
        a.cmp(&b)
195
0
    }
196
}
197
198
impl From<PotentialCodePoint> for u32 {
199
0
    fn from(x: PotentialCodePoint) -> Self {
200
0
        let [a0, a1, a2] = x.0;
201
0
        u32::from_le_bytes([a0, a1, a2, 0])
202
0
    }
203
}
204
205
impl TryFrom<u32> for PotentialCodePoint {
206
    type Error = ();
207
0
    fn try_from(x: u32) -> Result<Self, ()> {
208
0
        let [u0, u1, u2, u3] = x.to_le_bytes();
209
0
        if u3 != 0 {
210
0
            return Err(());
211
0
        }
212
0
        Ok(Self([u0, u1, u2]))
213
0
    }
214
}
215
216
impl From<char> for PotentialCodePoint {
217
    #[inline]
218
0
    fn from(value: char) -> Self {
219
0
        Self::from_char(value)
220
0
    }
221
}
222
223
impl TryFrom<PotentialCodePoint> for char {
224
    type Error = core::char::CharTryFromError;
225
226
    #[inline]
227
0
    fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> {
228
0
        value.try_to_char()
229
0
    }
230
}
231
232
/// This impl requires enabling the optional `serde` Cargo feature
233
#[cfg(feature = "serde")]
234
impl serde::Serialize for PotentialCodePoint {
235
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
236
    where
237
        S: serde::Serializer,
238
    {
239
        use serde::ser::Error;
240
        let c = self
241
            .try_to_char()
242
            .map_err(|_| S::Error::custom("invalid Unicode scalar value in PotentialCodePoint"))?;
243
        if serializer.is_human_readable() {
244
            serializer.serialize_char(c)
245
        } else {
246
            self.0.serialize(serializer)
247
        }
248
    }
249
}
250
251
/// This impl requires enabling the optional `serde` Cargo feature
252
#[cfg(feature = "serde")]
253
impl<'de> serde::Deserialize<'de> for PotentialCodePoint {
254
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
255
    where
256
        D: serde::Deserializer<'de>,
257
    {
258
        if deserializer.is_human_readable() {
259
            let c = <char>::deserialize(deserializer)?;
260
            Ok(PotentialCodePoint::from_char(c))
261
        } else {
262
            let bytes = <[u8; 3]>::deserialize(deserializer)?;
263
            Ok(PotentialCodePoint(bytes))
264
        }
265
    }
266
}
267
268
/// This impl requires enabling the optional `databake` Cargo feature
269
#[cfg(feature = "databake")]
270
impl databake::Bake for PotentialCodePoint {
271
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
272
        match self.try_to_char() {
273
            Ok(ch) => {
274
                env.insert("potential_utf");
275
                let ch = ch.bake(env);
276
                databake::quote! {
277
                    potential_utf::PotentialCodePoint::from_char(#ch)
278
                }
279
            }
280
            Err(_) => {
281
                env.insert("potential_utf");
282
                let u24 = u32::from_le_bytes([self.0[0], self.0[1], self.0[2], 0]);
283
                databake::quote! {
284
                    potential_utf::PotentialCodePoint::from_u24(#u24)
285
                }
286
            }
287
        }
288
    }
289
}
290
291
#[cfg(test)]
292
mod test {
293
    use super::*;
294
    use zerovec::ZeroVec;
295
296
    #[test]
297
    fn test_serde_fail() {
298
        let uc = PotentialCodePoint([0xFF, 0xFF, 0xFF]);
299
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
300
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
301
    }
302
303
    #[test]
304
    fn test_serde_json() {
305
        let c = '🙃';
306
        let uc = PotentialCodePoint::from_char(c);
307
        let json_ser = serde_json::to_string(&uc).unwrap();
308
309
        assert_eq!(json_ser, r#""🙃""#);
310
311
        let json_de: PotentialCodePoint = serde_json::from_str(&json_ser).unwrap();
312
313
        assert_eq!(uc, json_de);
314
    }
315
316
    #[test]
317
    fn test_serde_bincode() {
318
        let c = '🙃';
319
        let uc = PotentialCodePoint::from_char(c);
320
        let bytes_ser = bincode::serialize(&uc).unwrap();
321
322
        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);
323
324
        let bytes_de: PotentialCodePoint = bincode::deserialize(&bytes_ser).unwrap();
325
326
        assert_eq!(uc, bytes_de);
327
    }
328
329
    #[test]
330
    fn test_representation() {
331
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];
332
333
        // backed by [PotentialCodePoint]
334
        let uvchars: Vec<_> = chars
335
            .iter()
336
            .copied()
337
            .map(PotentialCodePoint::from_char)
338
            .collect();
339
        // backed by [RawBytesULE<3>]
340
        let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect();
341
342
        let ule_bytes = zvec.as_bytes();
343
        let uvbytes;
344
        unsafe {
345
            let ptr = &uvchars[..] as *const _ as *const u8;
346
            uvbytes = core::slice::from_raw_parts(ptr, ule_bytes.len());
347
        }
348
349
        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
350
        // also asserts that to_unaligned/from_unaligned are no-ops
351
        assert_eq!(uvbytes, ule_bytes);
352
353
        assert_eq!(
354
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
355
            ule_bytes
356
        );
357
    }
358
359
    #[test]
360
    fn test_char_bake() {
361
        databake::test_bake!(
362
            PotentialCodePoint,
363
            const,
364
            crate::PotentialCodePoint::from_char('b'),
365
            potential_utf
366
        );
367
        // surrogate code point
368
        databake::test_bake!(
369
            PotentialCodePoint,
370
            const,
371
            crate::PotentialCodePoint::from_u24(55296u32),
372
            potential_utf
373
        );
374
    }
375
}