/rust/registry/src/index.crates.io-6f17d22bba15001f/icu_casemap-1.5.1/src/casemapper.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | use crate::internals::{CaseMapLocale, FoldOptions, FullCaseWriteable, StringAndWriteable}; |
6 | | use crate::provider::data::MappingKind; |
7 | | use crate::provider::CaseMapV1; |
8 | | use crate::provider::CaseMapV1Marker; |
9 | | use crate::set::ClosureSink; |
10 | | use crate::titlecase::{LeadingAdjustment, TitlecaseOptions, TrailingCase}; |
11 | | use alloc::string::String; |
12 | | use icu_locid::LanguageIdentifier; |
13 | | use icu_provider::prelude::*; |
14 | | use writeable::Writeable; |
15 | | |
16 | | /// A struct with the ability to convert characters and strings to uppercase or lowercase, |
17 | | /// or fold them to a normalized form for case-insensitive comparison. |
18 | | /// |
19 | | /// # Examples |
20 | | /// |
21 | | /// ```rust |
22 | | /// use icu::casemap::CaseMapper; |
23 | | /// use icu::locid::langid; |
24 | | /// |
25 | | /// let cm = CaseMapper::new(); |
26 | | /// |
27 | | /// assert_eq!( |
28 | | /// cm.uppercase_to_string("hello world", &langid!("und")), |
29 | | /// "HELLO WORLD" |
30 | | /// ); |
31 | | /// assert_eq!( |
32 | | /// cm.lowercase_to_string("Γειά σου Κόσμε", &langid!("und")), |
33 | | /// "γειά σου κόσμε" |
34 | | /// ); |
35 | | /// ``` |
36 | | #[derive(Clone, Debug)] |
37 | | pub struct CaseMapper { |
38 | | pub(crate) data: DataPayload<CaseMapV1Marker>, |
39 | | } |
40 | | |
41 | | #[cfg(feature = "compiled_data")] |
42 | | impl Default for CaseMapper { |
43 | 0 | fn default() -> Self { |
44 | 0 | Self::new() |
45 | 0 | } |
46 | | } |
47 | | |
48 | | impl AsRef<CaseMapper> for CaseMapper { |
49 | 0 | fn as_ref(&self) -> &CaseMapper { |
50 | 0 | self |
51 | 0 | } |
52 | | } |
53 | | |
54 | | impl CaseMapper { |
55 | | /// Creates a [`CaseMapper`] using compiled data. |
56 | | /// |
57 | | /// ✨ *Enabled with the `compiled_data` Cargo feature.* |
58 | | /// |
59 | | /// [📚 Help choosing a constructor](icu_provider::constructors) |
60 | | /// |
61 | | /// # Examples |
62 | | /// |
63 | | /// ```rust |
64 | | /// use icu::casemap::CaseMapper; |
65 | | /// use icu::locid::langid; |
66 | | /// |
67 | | /// let cm = CaseMapper::new(); |
68 | | /// |
69 | | /// assert_eq!( |
70 | | /// cm.uppercase_to_string("hello world", &langid!("und")), |
71 | | /// "HELLO WORLD" |
72 | | /// ); |
73 | | /// ``` |
74 | | #[cfg(feature = "compiled_data")] |
75 | 0 | pub const fn new() -> Self { |
76 | 0 | Self { |
77 | 0 | data: DataPayload::from_static_ref(crate::provider::Baked::SINGLETON_PROPS_CASEMAP_V1), |
78 | 0 | } |
79 | 0 | } |
80 | | |
81 | | icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: DataError, |
82 | | #[cfg(skip)] |
83 | | functions: [ |
84 | | new, |
85 | | try_new_with_any_provider, |
86 | | try_new_with_buffer_provider, |
87 | | try_new_unstable, |
88 | | Self, |
89 | | ]); |
90 | | |
91 | | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)] |
92 | 0 | pub fn try_new_unstable<P>(provider: &P) -> Result<CaseMapper, DataError> |
93 | 0 | where |
94 | 0 | P: DataProvider<CaseMapV1Marker> + ?Sized, |
95 | 0 | { |
96 | 0 | let data = provider.load(Default::default())?.take_payload()?; |
97 | 0 | Ok(Self { data }) |
98 | 0 | } Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::try_new_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::try_new_unstable::<_> |
99 | | |
100 | | /// Returns the full lowercase mapping of the given string as a [`Writeable`]. |
101 | | /// This function is context and language sensitive. Callers should pass the text's language |
102 | | /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or |
103 | | /// `Default::default()` for the root locale. |
104 | | /// |
105 | | /// See [`Self::lowercase_to_string()`] for the equivalent convenience function that returns a String, |
106 | | /// as well as for an example. |
107 | 0 | pub fn lowercase<'a>( |
108 | 0 | &'a self, |
109 | 0 | src: &'a str, |
110 | 0 | langid: &LanguageIdentifier, |
111 | 0 | ) -> impl Writeable + 'a { |
112 | 0 | self.data.get().full_helper_writeable::<false>( |
113 | 0 | src, |
114 | 0 | CaseMapLocale::from_langid(langid), |
115 | 0 | MappingKind::Lower, |
116 | 0 | TrailingCase::default(), |
117 | 0 | ) |
118 | 0 | } |
119 | | |
120 | | /// Returns the full uppercase mapping of the given string as a [`Writeable`]. |
121 | | /// This function is context and language sensitive. Callers should pass the text's language |
122 | | /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or |
123 | | /// `Default::default()` for the root locale. |
124 | | /// |
125 | | /// See [`Self::uppercase_to_string()`] for the equivalent convenience function that returns a String, |
126 | | /// as well as for an example. |
127 | 0 | pub fn uppercase<'a>( |
128 | 0 | &'a self, |
129 | 0 | src: &'a str, |
130 | 0 | langid: &LanguageIdentifier, |
131 | 0 | ) -> impl Writeable + 'a { |
132 | 0 | self.data.get().full_helper_writeable::<false>( |
133 | 0 | src, |
134 | 0 | CaseMapLocale::from_langid(langid), |
135 | 0 | MappingKind::Upper, |
136 | 0 | TrailingCase::default(), |
137 | 0 | ) |
138 | 0 | } |
139 | | |
140 | | /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating |
141 | | /// the string as a single segment (and thus only titlecasing the beginning of it). Performs |
142 | | /// the specified leading adjustment behavior from the options without loading additional data. |
143 | | /// |
144 | | /// This should typically be used as a lower-level helper to construct the titlecasing operation desired |
145 | | /// by the application, for example one can titlecase on a per-word basis by mixing this with |
146 | | /// a `WordSegmenter`. |
147 | | /// |
148 | | /// This function is context and language sensitive. Callers should pass the text's language |
149 | | /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or |
150 | | /// `Default::default()` for the root locale. |
151 | | /// |
152 | | /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`] |
153 | | /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load |
154 | | /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See |
155 | | /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between |
156 | | /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the leading adjustment mode |
157 | | /// is [`LeadingAdjustment::None`]. |
158 | | /// |
159 | | /// See [`Self::titlecase_segment_with_only_case_data_to_string()`] for the equivalent convenience function that returns a String, |
160 | | /// as well as for an example. |
161 | | /// |
162 | | /// [`TitlecaseMapper`]: crate::TitlecaseMapper |
163 | 0 | pub fn titlecase_segment_with_only_case_data<'a>( |
164 | 0 | &'a self, |
165 | 0 | src: &'a str, |
166 | 0 | langid: &LanguageIdentifier, |
167 | 0 | options: TitlecaseOptions, |
168 | 0 | ) -> impl Writeable + 'a { |
169 | 0 | self.titlecase_segment_with_adjustment(src, langid, options, |data, ch| data.is_cased(ch)) |
170 | 0 | } |
171 | | |
172 | | /// Helper to support different leading adjustment behaviors. |
173 | | /// `char_is_lead` is a function that returns true for a character that is allowed to be the |
174 | | /// first relevant character in a titlecasing string, when `leading_adjustment != None`. |
175 | | /// |
176 | | /// We return a concrete type instead of `impl Trait` so the return value can be mixed with that of other calls |
177 | | /// to this function with different closures |
178 | 0 | pub(crate) fn titlecase_segment_with_adjustment<'a>( |
179 | 0 | &'a self, |
180 | 0 | src: &'a str, |
181 | 0 | langid: &LanguageIdentifier, |
182 | 0 | options: TitlecaseOptions, |
183 | 0 | char_is_lead: impl Fn(&CaseMapV1, char) -> bool, |
184 | 0 | ) -> StringAndWriteable<FullCaseWriteable<'a, true>> { |
185 | 0 | let data = self.data.get(); |
186 | 0 | let (head, rest) = match options.leading_adjustment { |
187 | | LeadingAdjustment::Auto | LeadingAdjustment::ToCased => { |
188 | 0 | let first_cased = src.char_indices().find(|(_i, ch)| char_is_lead(data, *ch)); Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::titlecase_segment_with_adjustment::<<icu_casemap::titlecase::TitlecaseMapper<icu_casemap::casemapper::CaseMapper>>::titlecase_segment::{closure#0}>::{closure#0} Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::titlecase_segment_with_adjustment::<<icu_casemap::titlecase::TitlecaseMapper<icu_casemap::casemapper::CaseMapper>>::titlecase_segment::{closure#1}>::{closure#0} Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::titlecase_segment_with_adjustment::<<icu_casemap::casemapper::CaseMapper>::titlecase_segment_with_only_case_data::{closure#0}>::{closure#0} |
189 | 0 | if let Some((first_cased, _ch)) = first_cased { |
190 | 0 | ( |
191 | 0 | src.get(..first_cased).unwrap_or(""), |
192 | 0 | src.get(first_cased..).unwrap_or(""), |
193 | 0 | ) |
194 | | } else { |
195 | 0 | (src, "") |
196 | | } |
197 | | } |
198 | 0 | LeadingAdjustment::None => ("", src), |
199 | | }; |
200 | 0 | let writeable = data.full_helper_writeable::<true>( |
201 | 0 | rest, |
202 | 0 | CaseMapLocale::from_langid(langid), |
203 | 0 | MappingKind::Title, |
204 | 0 | options.trailing_case, |
205 | 0 | ); |
206 | 0 | StringAndWriteable { |
207 | 0 | string: head, |
208 | 0 | writeable, |
209 | 0 | } |
210 | 0 | } Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::titlecase_segment_with_adjustment::<<icu_casemap::titlecase::TitlecaseMapper<icu_casemap::casemapper::CaseMapper>>::titlecase_segment::{closure#0}> Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::titlecase_segment_with_adjustment::<<icu_casemap::titlecase::TitlecaseMapper<icu_casemap::casemapper::CaseMapper>>::titlecase_segment::{closure#1}> Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::titlecase_segment_with_adjustment::<<icu_casemap::casemapper::CaseMapper>::titlecase_segment_with_only_case_data::{closure#0}> |
211 | | /// Case-folds the characters in the given string as a [`Writeable`]. |
212 | | /// This function is locale-independent and context-insensitive. |
213 | | /// |
214 | | /// Can be used to test if two strings are case-insensitively equivalent. |
215 | | /// |
216 | | /// See [`Self::fold_string()`] for the equivalent convenience function that returns a String, |
217 | | /// as well as for an example. |
218 | 0 | pub fn fold<'a>(&'a self, src: &'a str) -> impl Writeable + 'a { |
219 | 0 | self.data.get().full_helper_writeable::<false>( |
220 | 0 | src, |
221 | 0 | CaseMapLocale::Root, |
222 | 0 | MappingKind::Fold, |
223 | 0 | TrailingCase::default(), |
224 | 0 | ) |
225 | 0 | } |
226 | | |
227 | | /// Case-folds the characters in the given string as a [`Writeable`], |
228 | | /// using Turkic (T) mappings for dotted/dotless I. |
229 | | /// This function is locale-independent and context-insensitive. |
230 | | /// |
231 | | /// Can be used to test if two strings are case-insensitively equivalent. |
232 | | /// |
233 | | /// See [`Self::fold_turkic_string()`] for the equivalent convenience function that returns a String, |
234 | | /// as well as for an example. |
235 | 0 | pub fn fold_turkic<'a>(&'a self, src: &'a str) -> impl Writeable + 'a { |
236 | 0 | self.data.get().full_helper_writeable::<false>( |
237 | 0 | src, |
238 | 0 | CaseMapLocale::Turkish, |
239 | 0 | MappingKind::Fold, |
240 | 0 | TrailingCase::default(), |
241 | 0 | ) |
242 | 0 | } |
243 | | |
244 | | /// Returns the full lowercase mapping of the given string as a String. |
245 | | /// |
246 | | /// This function is context and language sensitive. Callers should pass the text's language |
247 | | /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or |
248 | | /// `Default::default()` for the root locale. |
249 | | /// |
250 | | /// See [`Self::lowercase()`] for the equivalent lower-level function that returns a [`Writeable`]. |
251 | | /// |
252 | | /// # Examples |
253 | | /// |
254 | | /// ```rust |
255 | | /// use icu::casemap::CaseMapper; |
256 | | /// use icu::locid::langid; |
257 | | /// |
258 | | /// let cm = CaseMapper::new(); |
259 | | /// let root = langid!("und"); |
260 | | /// |
261 | | /// assert_eq!(cm.lowercase_to_string("hEllO WorLd", &root), "hello world"); |
262 | | /// assert_eq!(cm.lowercase_to_string("Γειά σου Κόσμε", &root), "γειά σου κόσμε"); |
263 | | /// assert_eq!(cm.lowercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया"); |
264 | | /// assert_eq!(cm.lowercase_to_string("Привет мир", &root), "привет мир"); |
265 | | /// |
266 | | /// // Some behavior is language-sensitive |
267 | | /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &root), "constantinople"); |
268 | | /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &langid!("tr")), "constantınople"); |
269 | | /// ``` |
270 | 0 | pub fn lowercase_to_string(&self, src: &str, langid: &LanguageIdentifier) -> String { |
271 | 0 | self.lowercase(src, langid).write_to_string().into_owned() |
272 | 0 | } |
273 | | |
274 | | /// Returns the full uppercase mapping of the given string as a String. |
275 | | /// |
276 | | /// This function is context and language sensitive. Callers should pass the text's language |
277 | | /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or |
278 | | /// `Default::default()` for the root locale. |
279 | | /// |
280 | | /// See [`Self::uppercase()`] for the equivalent lower-level function that returns a [`Writeable`]. |
281 | | /// |
282 | | /// # Examples |
283 | | /// |
284 | | /// ```rust |
285 | | /// use icu::casemap::CaseMapper; |
286 | | /// use icu::locid::langid; |
287 | | /// |
288 | | /// let cm = CaseMapper::new(); |
289 | | /// let root = langid!("und"); |
290 | | /// |
291 | | /// assert_eq!(cm.uppercase_to_string("hEllO WorLd", &root), "HELLO WORLD"); |
292 | | /// assert_eq!(cm.uppercase_to_string("Γειά σου Κόσμε", &root), "ΓΕΙΆ ΣΟΥ ΚΌΣΜΕ"); |
293 | | /// assert_eq!(cm.uppercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया"); |
294 | | /// assert_eq!(cm.uppercase_to_string("Привет мир", &root), "ПРИВЕТ МИР"); |
295 | | /// |
296 | | /// // Some behavior is language-sensitive |
297 | | /// assert_eq!(cm.uppercase_to_string("istanbul", &root), "ISTANBUL"); |
298 | | /// assert_eq!(cm.uppercase_to_string("istanbul", &langid!("tr")), "İSTANBUL"); // Turkish dotted i |
299 | | /// |
300 | | /// assert_eq!(cm.uppercase_to_string("և Երևանի", &root), "ԵՒ ԵՐԵՒԱՆԻ"); |
301 | | /// assert_eq!(cm.uppercase_to_string("և Երևանի", &langid!("hy")), "ԵՎ ԵՐԵՎԱՆԻ"); // Eastern Armenian ech-yiwn ligature |
302 | | /// ``` |
303 | 0 | pub fn uppercase_to_string(&self, src: &str, langid: &LanguageIdentifier) -> String { |
304 | 0 | self.uppercase(src, langid).write_to_string().into_owned() |
305 | 0 | } |
306 | | |
307 | | /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating |
308 | | /// the string as a single segment (and thus only titlecasing the beginning of it). Performs |
309 | | /// the specified leading adjustment behavior from the options without loading additional data. |
310 | | /// |
311 | | /// Note that [`TitlecaseMapper`] has better behavior, most users should consider using |
312 | | /// it instead. This method primarily exists for people who care about the amount of data being loaded. |
313 | | /// |
314 | | /// This should typically be used as a lower-level helper to construct the titlecasing operation desired |
315 | | /// by the application, for example one can titlecase on a per-word basis by mixing this with |
316 | | /// a `WordSegmenter`. |
317 | | /// |
318 | | /// This function is context and language sensitive. Callers should pass the text's language |
319 | | /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or |
320 | | /// `Default::default()` for the root locale. |
321 | | /// |
322 | | /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`] |
323 | | /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load |
324 | | /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See |
325 | | /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between |
326 | | /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the leading adjustment mode |
327 | | /// is [`LeadingAdjustment::None`]. |
328 | | /// |
329 | | /// See [`Self::titlecase_segment_with_only_case_data()`] for the equivalent lower-level function that returns a [`Writeable`]. |
330 | | /// |
331 | | /// # Examples |
332 | | /// |
333 | | /// ```rust |
334 | | /// use icu::casemap::CaseMapper; |
335 | | /// use icu::locid::langid; |
336 | | /// |
337 | | /// let cm = CaseMapper::new(); |
338 | | /// let root = langid!("und"); |
339 | | /// |
340 | | /// let default_options = Default::default(); |
341 | | /// |
342 | | /// // note that the subsequent words are not titlecased, this function assumes |
343 | | /// // that the entire string is a single segment and only titlecases at the beginning. |
344 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("hEllO WorLd", &root, default_options), "Hello world"); |
345 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε"); |
346 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया"); |
347 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Привет мир", &root, default_options), "Привет мир"); |
348 | | /// |
349 | | /// // Some behavior is language-sensitive |
350 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &root, default_options), "Istanbul"); |
351 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i |
352 | | /// |
353 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &root, default_options), "Եւ երևանի"); |
354 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature |
355 | | /// |
356 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &root, default_options), "Ijkdijk"); |
357 | | /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph |
358 | | /// ``` |
359 | | /// |
360 | | /// [`TitlecaseMapper`]: crate::TitlecaseMapper |
361 | 0 | pub fn titlecase_segment_with_only_case_data_to_string( |
362 | 0 | &self, |
363 | 0 | src: &str, |
364 | 0 | langid: &LanguageIdentifier, |
365 | 0 | options: TitlecaseOptions, |
366 | 0 | ) -> String { |
367 | 0 | self.titlecase_segment_with_only_case_data(src, langid, options) |
368 | 0 | .write_to_string() |
369 | 0 | .into_owned() |
370 | 0 | } |
371 | | |
372 | | /// Case-folds the characters in the given string as a String. |
373 | | /// This function is locale-independent and context-insensitive. |
374 | | /// |
375 | | /// Can be used to test if two strings are case-insensitively equivalent. |
376 | | /// |
377 | | /// See [`Self::fold()`] for the equivalent lower-level function that returns a [`Writeable`]. |
378 | | /// |
379 | | /// # Examples |
380 | | /// |
381 | | /// ```rust |
382 | | /// use icu::casemap::CaseMapper; |
383 | | /// |
384 | | /// let cm = CaseMapper::new(); |
385 | | /// |
386 | | /// // Check if two strings are equivalent case insensitively |
387 | | /// assert_eq!(cm.fold_string("hEllO WorLd"), cm.fold_string("HELLO worlD")); |
388 | | /// |
389 | | /// assert_eq!(cm.fold_string("hEllO WorLd"), "hello world"); |
390 | | /// assert_eq!(cm.fold_string("Γειά σου Κόσμε"), "γειά σου κόσμε"); |
391 | | /// assert_eq!(cm.fold_string("नमस्ते दुनिया"), "नमस्ते दुनिया"); |
392 | | /// assert_eq!(cm.fold_string("Привет мир"), "привет мир"); |
393 | | /// ``` |
394 | 0 | pub fn fold_string(&self, src: &str) -> String { |
395 | 0 | self.fold(src).write_to_string().into_owned() |
396 | 0 | } |
397 | | |
398 | | /// Case-folds the characters in the given string as a String, |
399 | | /// using Turkic (T) mappings for dotted/dotless I. |
400 | | /// This function is locale-independent and context-insensitive. |
401 | | /// |
402 | | /// Can be used to test if two strings are case-insensitively equivalent. |
403 | | /// |
404 | | /// See [`Self::fold_turkic()`] for the equivalent lower-level function that returns a [`Writeable`]. |
405 | | /// |
406 | | /// # Examples |
407 | | /// |
408 | | /// ```rust |
409 | | /// use icu::casemap::CaseMapper; |
410 | | /// |
411 | | /// let cm = CaseMapper::new(); |
412 | | /// |
413 | | /// // Check if two strings are equivalent case insensitively |
414 | | /// assert_eq!(cm.fold_turkic_string("İstanbul"), cm.fold_turkic_string("iSTANBUL")); |
415 | | /// |
416 | | /// assert_eq!(cm.fold_turkic_string("İstanbul not Constantinople"), "istanbul not constantinople"); |
417 | | /// assert_eq!(cm.fold_turkic_string("Istanbul not Constantınople"), "ıstanbul not constantınople"); |
418 | | /// |
419 | | /// assert_eq!(cm.fold_turkic_string("hEllO WorLd"), "hello world"); |
420 | | /// assert_eq!(cm.fold_turkic_string("Γειά σου Κόσμε"), "γειά σου κόσμε"); |
421 | | /// assert_eq!(cm.fold_turkic_string("नमस्ते दुनिया"), "नमस्ते दुनिया"); |
422 | | /// assert_eq!(cm.fold_turkic_string("Привет мир"), "привет мир"); |
423 | | /// ``` |
424 | 0 | pub fn fold_turkic_string(&self, src: &str) -> String { |
425 | 0 | self.fold_turkic(src).write_to_string().into_owned() |
426 | 0 | } |
427 | | |
428 | | /// Adds all simple case mappings and the full case folding for `c` to `set`. |
429 | | /// Also adds special case closure mappings. |
430 | | /// |
431 | | /// Identical to [`CaseMapCloser::add_case_closure_to()`], see docs there for more information. |
432 | | /// This method is duplicated so that one does not need to load extra unfold data |
433 | | /// if they only need this and not also [`CaseMapCloser::add_string_case_closure_to()`]. |
434 | | /// |
435 | | /// |
436 | | /// # Examples |
437 | | /// |
438 | | /// ```rust |
439 | | /// use icu::casemap::CaseMapper; |
440 | | /// use icu::collections::codepointinvlist::CodePointInversionListBuilder; |
441 | | /// |
442 | | /// let cm = CaseMapper::new(); |
443 | | /// let mut builder = CodePointInversionListBuilder::new(); |
444 | | /// cm.add_case_closure_to('s', &mut builder); |
445 | | /// |
446 | | /// let set = builder.build(); |
447 | | /// |
448 | | /// assert!(set.contains('S')); |
449 | | /// assert!(set.contains('ſ')); |
450 | | /// assert!(!set.contains('s')); // does not contain itself |
451 | | /// ``` |
452 | | /// |
453 | | /// [`CaseMapCloser::add_case_closure_to()`]: crate::CaseMapCloser::add_case_closure_to |
454 | | /// [`CaseMapCloser::add_string_case_closure_to()`]: crate::CaseMapCloser::add_string_case_closure_to |
455 | 0 | pub fn add_case_closure_to<S: ClosureSink>(&self, c: char, set: &mut S) { |
456 | 0 | self.data.get().add_case_closure_to(c, set); |
457 | 0 | } Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::add_case_closure_to::<icu_collections::codepointinvlist::builder::CodePointInversionListBuilder> Unexecuted instantiation: <icu_casemap::casemapper::CaseMapper>::add_case_closure_to::<_> |
458 | | |
459 | | /// Returns the lowercase mapping of the given `char`. |
460 | | /// This function only implements simple and common mappings. Full mappings, |
461 | | /// which can map one `char` to a string, are not included. |
462 | | /// For full mappings, use [`CaseMapper::lowercase`]. |
463 | | /// |
464 | | /// # Examples |
465 | | /// |
466 | | /// ```rust |
467 | | /// use icu::casemap::CaseMapper; |
468 | | /// |
469 | | /// let cm = CaseMapper::new(); |
470 | | /// |
471 | | /// assert_eq!(cm.simple_lowercase('C'), 'c'); |
472 | | /// assert_eq!(cm.simple_lowercase('c'), 'c'); |
473 | | /// assert_eq!(cm.simple_lowercase('Ć'), 'ć'); |
474 | | /// assert_eq!(cm.simple_lowercase('Γ'), 'γ'); |
475 | | /// ``` |
476 | 0 | pub fn simple_lowercase(&self, c: char) -> char { |
477 | 0 | self.data.get().simple_lower(c) |
478 | 0 | } |
479 | | |
480 | | /// Returns the uppercase mapping of the given `char`. |
481 | | /// This function only implements simple and common mappings. Full mappings, |
482 | | /// which can map one `char` to a string, are not included. |
483 | | /// For full mappings, use [`CaseMapper::uppercase`]. |
484 | | /// |
485 | | /// # Examples |
486 | | /// |
487 | | /// ```rust |
488 | | /// use icu::casemap::CaseMapper; |
489 | | /// |
490 | | /// let cm = CaseMapper::new(); |
491 | | /// |
492 | | /// assert_eq!(cm.simple_uppercase('c'), 'C'); |
493 | | /// assert_eq!(cm.simple_uppercase('C'), 'C'); |
494 | | /// assert_eq!(cm.simple_uppercase('ć'), 'Ć'); |
495 | | /// assert_eq!(cm.simple_uppercase('γ'), 'Γ'); |
496 | | /// |
497 | | /// assert_eq!(cm.simple_uppercase('dz'), 'DZ'); |
498 | | /// ``` |
499 | 0 | pub fn simple_uppercase(&self, c: char) -> char { |
500 | 0 | self.data.get().simple_upper(c) |
501 | 0 | } |
502 | | |
503 | | /// Returns the titlecase mapping of the given `char`. |
504 | | /// This function only implements simple and common mappings. Full mappings, |
505 | | /// which can map one `char` to a string, are not included. |
506 | | /// |
507 | | /// # Examples |
508 | | /// |
509 | | /// ```rust |
510 | | /// use icu::casemap::CaseMapper; |
511 | | /// |
512 | | /// let cm = CaseMapper::new(); |
513 | | /// |
514 | | /// assert_eq!(cm.simple_titlecase('dz'), 'Dz'); |
515 | | /// |
516 | | /// assert_eq!(cm.simple_titlecase('c'), 'C'); |
517 | | /// assert_eq!(cm.simple_titlecase('C'), 'C'); |
518 | | /// assert_eq!(cm.simple_titlecase('ć'), 'Ć'); |
519 | | /// assert_eq!(cm.simple_titlecase('γ'), 'Γ'); |
520 | | /// ``` |
521 | 0 | pub fn simple_titlecase(&self, c: char) -> char { |
522 | 0 | self.data.get().simple_title(c) |
523 | 0 | } |
524 | | |
525 | | /// Returns the simple case folding of the given char. |
526 | | /// For full mappings, use [`CaseMapper::fold`]. |
527 | | /// |
528 | | /// This function can be used to perform caseless matches on |
529 | | /// individual characters. |
530 | | /// > *Note:* With Unicode 15.0 data, there are three |
531 | | /// > pairs of characters for which equivalence under this |
532 | | /// > function is inconsistent with equivalence of the |
533 | | /// > one-character strings under [`CaseMapper::fold`]. |
534 | | /// > This is resolved in Unicode 15.1 and later. |
535 | | /// |
536 | | /// For compatibility applications where simple case folding |
537 | | /// of strings is required, this function can be applied to |
538 | | /// each character of a string. Note that the resulting |
539 | | /// equivalence relation is different from that obtained |
540 | | /// by [`CaseMapper::fold`]: |
541 | | /// The strings "Straße" and "STRASSE" are distinct |
542 | | /// under simple case folding, but are equivalent under |
543 | | /// default (full) case folding. |
544 | | /// |
545 | | /// # Examples |
546 | | /// |
547 | | /// ```rust |
548 | | /// use icu::casemap::CaseMapper; |
549 | | /// |
550 | | /// let cm = CaseMapper::new(); |
551 | | /// |
552 | | /// // perform case insensitive checks |
553 | | /// assert_eq!(cm.simple_fold('σ'), cm.simple_fold('ς')); |
554 | | /// assert_eq!(cm.simple_fold('Σ'), cm.simple_fold('ς')); |
555 | | /// |
556 | | /// assert_eq!(cm.simple_fold('c'), 'c'); |
557 | | /// assert_eq!(cm.simple_fold('Ć'), 'ć'); |
558 | | /// assert_eq!(cm.simple_fold('Γ'), 'γ'); |
559 | | /// assert_eq!(cm.simple_fold('ς'), 'σ'); |
560 | | /// |
561 | | /// assert_eq!(cm.simple_fold('ß'), 'ß'); |
562 | | /// assert_eq!(cm.simple_fold('I'), 'i'); |
563 | | /// assert_eq!(cm.simple_fold('İ'), 'İ'); |
564 | | /// assert_eq!(cm.simple_fold('ı'), 'ı'); |
565 | | /// ``` |
566 | 0 | pub fn simple_fold(&self, c: char) -> char { |
567 | 0 | self.data.get().simple_fold(c, FoldOptions::default()) |
568 | 0 | } |
569 | | |
570 | | /// Returns the simple case folding of the given char, using Turkic (T) mappings for |
571 | | /// dotted/dotless i. This function does not fold `i` and `I` to the same character. Instead, |
572 | | /// `I` will fold to `ı`, and `İ` will fold to `i`. Otherwise, this is the same as |
573 | | /// [`CaseMapper::simple_fold()`]. |
574 | | /// |
575 | | /// You can use the case folding to perform Turkic caseless matches on characters |
576 | | /// provided they don't full-casefold to strings. To avoid that situation, |
577 | | /// convert to a string and use [`CaseMapper::fold_turkic`]. |
578 | | /// |
579 | | /// |
580 | | /// # Examples |
581 | | /// |
582 | | /// ```rust |
583 | | /// use icu::casemap::CaseMapper; |
584 | | /// |
585 | | /// let cm = CaseMapper::new(); |
586 | | /// |
587 | | /// assert_eq!(cm.simple_fold_turkic('I'), 'ı'); |
588 | | /// assert_eq!(cm.simple_fold_turkic('İ'), 'i'); |
589 | | /// ``` |
590 | 0 | pub fn simple_fold_turkic(&self, c: char) -> char { |
591 | 0 | self.data |
592 | 0 | .get() |
593 | 0 | .simple_fold(c, FoldOptions::with_turkic_mappings()) |
594 | 0 | } |
595 | | } |
596 | | |
#[cfg(test)]
mod tests {
    use super::*;
    use icu_locid::langid;

    /// Tests for SpecialCasing.txt. Some of the special cases are data-driven, some are
    /// code-driven.
    ///
    /// Inputs that are compatibility ligatures / digraphs (U+FB00, U+FB05, U+0149) and expected
    /// values that contain combining marks are written as `\u{…}` escapes. A plain "ff", "ſt"
    /// or "ʼn" would still uppercase to the same text through ordinary per-character mappings,
    /// so a literal mangled by an editor or text pipeline would silently stop testing the
    /// special-casing entries.
    #[test]
    fn test_special_cases() {
        let cm = CaseMapper::new();
        let root = langid!("und");
        let default_options = TitlecaseOptions::default();

        // Shorthand for the titlecase call under test; every assertion uses default options.
        let titlecase = |src: &str, locale: &LanguageIdentifier| {
            cm.titlecase_segment_with_only_case_data_to_string(src, locale, default_options)
        };

        // Ligatures

        // U+FB00 LATIN SMALL LIGATURE FF
        assert_eq!(cm.uppercase_to_string("\u{FB00}", &root), "FF");
        // U+FB05 LATIN SMALL LIGATURE LONG S T
        assert_eq!(cm.uppercase_to_string("\u{FB05}", &root), "ST");

        // No corresponding uppercased character

        // U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
        // uppercases to U+02BC MODIFIER LETTER APOSTROPHE followed by 'N'.
        assert_eq!(cm.uppercase_to_string("\u{0149}", &root), "\u{02BC}N");

        // U+1F50 GREEK SMALL LETTER UPSILON WITH PSILI
        assert_eq!(cm.uppercase_to_string("ὐ", &root), "Υ\u{0313}");
        // U+1FF6 GREEK SMALL LETTER OMEGA WITH PERISPOMENI
        assert_eq!(cm.uppercase_to_string("ῶ", &root), "Ω\u{0342}");

        // YPOGEGRAMMENI / PROSGEGRAMMENI special cases

        // E.g. <alpha><psili><iota_subscript> is uppercased to <ALPHA><psili><IOTA>
        // (U+0313 is the psili, U+0345 the iota subscript).
        assert_eq!(
            cm.uppercase_to_string("α\u{0313}\u{0345}", &root),
            "Α\u{0313}Ι"
        );
        // but the YPOGEGRAMMENI should not titlecase
        assert_eq!(
            titlecase("α\u{0313}\u{0345}", &root),
            "Α\u{0313}\u{0345}"
        );

        // U+1F80 GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
        assert_eq!(titlecase("ᾀ", &root), "ᾈ");
        assert_eq!(cm.uppercase_to_string("ᾀ", &root), "ἈΙ");

        // U+1FFC GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
        assert_eq!(cm.lowercase_to_string("ῼ", &root), "ῳ");
        assert_eq!(titlecase("ῼ", &root), "ῼ");
        assert_eq!(cm.uppercase_to_string("ῼ", &root), "ΩΙ");

        // U+1F98 GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
        assert_eq!(cm.lowercase_to_string("ᾘ", &root), "ᾐ");
        assert_eq!(titlecase("ᾘ", &root), "ᾘ");
        assert_eq!(cm.uppercase_to_string("ᾘ", &root), "ἨΙ");

        // U+1FB2 GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
        assert_eq!(cm.lowercase_to_string("ᾲ", &root), "ᾲ");
        assert_eq!(titlecase("ᾲ", &root), "Ὰ\u{345}");
        assert_eq!(cm.uppercase_to_string("ᾲ", &root), "ᾺΙ");

        // Final sigma test
        // U+03A3 GREEK CAPITAL LETTER SIGMA in Final_Sigma context
        assert_eq!(cm.lowercase_to_string("ΙΙΙΣ", &root), "ιιις");

        // Turkish / Azeri: both locales are expected to behave identically, so every
        // assertion runs once per locale; the locale is reported if an assertion fails.
        let turkic_locales = [langid!("tr"), langid!("az")];
        for locale in &turkic_locales {
            // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
            assert_eq!(
                cm.lowercase_to_string("İ", locale),
                "i",
                "locale: {:?}",
                locale
            );
            assert_eq!(titlecase("İ", locale), "İ", "locale: {:?}", locale);
            assert_eq!(
                cm.uppercase_to_string("İ", locale),
                "İ",
                "locale: {:?}",
                locale
            );

            // U+0049 LATIN CAPITAL LETTER I and U+0307 COMBINING DOT ABOVE
            assert_eq!(
                cm.lowercase_to_string("I\u{0307}", locale),
                "i",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                titlecase("I\u{0307}", locale),
                "I\u{0307}",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                cm.uppercase_to_string("I\u{0307}", locale),
                "I\u{0307}",
                "locale: {:?}",
                locale
            );

            // U+0049 LATIN CAPITAL LETTER I
            assert_eq!(
                cm.lowercase_to_string("I", locale),
                "ı",
                "locale: {:?}",
                locale
            );
            assert_eq!(titlecase("I", locale), "I", "locale: {:?}", locale);
            assert_eq!(
                cm.uppercase_to_string("I", locale),
                "I",
                "locale: {:?}",
                locale
            );

            // U+0069 LATIN SMALL LETTER I
            assert_eq!(
                cm.lowercase_to_string("i", locale),
                "i",
                "locale: {:?}",
                locale
            );
            assert_eq!(titlecase("i", locale), "İ", "locale: {:?}", locale);
            assert_eq!(
                cm.uppercase_to_string("i", locale),
                "İ",
                "locale: {:?}",
                locale
            );
        }
    }

    /// Cherokee folds towards the uppercase letter: the uppercase form maps to itself and the
    /// lowercase form maps to the uppercase one, for both the character and string APIs and
    /// for both the default and Turkic fold variants.
    #[test]
    fn test_cherokee_case_folding() {
        let case_mapping = CaseMapper::new();

        // Single-character folding.
        assert_eq!(case_mapping.simple_fold('Ꭰ'), 'Ꭰ');
        assert_eq!(case_mapping.simple_fold('ꭰ'), 'Ꭰ');
        assert_eq!(case_mapping.simple_fold_turkic('Ꭰ'), 'Ꭰ');
        assert_eq!(case_mapping.simple_fold_turkic('ꭰ'), 'Ꭰ');

        // String folding.
        assert_eq!(case_mapping.fold_string("Ꭰ"), "Ꭰ");
        assert_eq!(case_mapping.fold_string("ꭰ"), "Ꭰ");
        assert_eq!(case_mapping.fold_turkic_string("Ꭰ"), "Ꭰ");
        assert_eq!(case_mapping.fold_turkic_string("ꭰ"), "Ꭰ");
    }
}