/rust/registry/src/index.crates.io-1949cf8c6b5b557f/icu_casemap-1.5.1/src/internals.rs
Line | Count | Source |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | //! This module contains most of the actual algorithms for case mapping. |
6 | | //! |
7 | | //! Primarily, it implements methods on `CaseMapV1`, which contains the data model. |
8 | | |
9 | | use crate::greek_to_me::{ |
10 | | self, GreekCombiningCharacterSequenceDiacritics, GreekDiacritics, GreekPrecomposedLetterData, |
11 | | GreekVowel, |
12 | | }; |
13 | | use crate::provider::data::{DotType, MappingKind}; |
14 | | use crate::provider::exception_helpers::ExceptionSlot; |
15 | | use crate::provider::{CaseMapUnfoldV1, CaseMapV1}; |
16 | | use crate::set::ClosureSink; |
17 | | use crate::titlecase::TrailingCase; |
18 | | use core::fmt; |
19 | | use icu_locid::LanguageIdentifier; |
20 | | use writeable::Writeable; |
21 | | |
/// U+0301 COMBINING ACUTE ACCENT.
const ACUTE: char = '\u{301}';
23 | | |
/// Used to control the behavior of `CaseMapper::fold`.
///
/// Currently only used to decide whether to use Turkic (T) mappings for dotted/dotless i.
/// The `Default` value selects the default (non-Turkic) mappings.
#[derive(Copy, Clone, Debug, Default)]
pub struct FoldOptions {
    /// When `true`, simple case folding uses the Turkic (T) mappings for
    /// U+0049 and U+0130 instead of the default ones.
    exclude_special_i: bool,
}

impl FoldOptions {
    /// Returns options that select the Turkic (T) case folding mappings.
    #[must_use]
    pub const fn with_turkic_mappings() -> Self {
        Self {
            exclude_special_i: true,
        }
    }
}
38 | | |
39 | | /// Helper type that wraps a writeable in a prefix string |
40 | | pub(crate) struct StringAndWriteable<'a, W> { |
41 | | pub string: &'a str, |
42 | | pub writeable: W, |
43 | | } |
44 | | |
45 | | impl<'a, Wr: Writeable> Writeable for StringAndWriteable<'a, Wr> { |
46 | 0 | fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result { |
47 | 0 | sink.write_str(self.string)?; |
48 | 0 | self.writeable.write_to(sink) |
49 | 0 | } Unexecuted instantiation: <icu_casemap::internals::StringAndWriteable<icu_casemap::internals::FullCaseWriteable<true>> as writeable::Writeable>::write_to::<diplomat_runtime::writeable::DiplomatWriteable> Unexecuted instantiation: <icu_casemap::internals::StringAndWriteable<icu_casemap::internals::FullCaseWriteable<true>> as writeable::Writeable>::write_to::<alloc::string::String> |
50 | 0 | fn writeable_length_hint(&self) -> writeable::LengthHint { |
51 | 0 | writeable::LengthHint::exact(self.string.len()) + self.writeable.writeable_length_hint() |
52 | 0 | } |
53 | | } |
54 | | |
55 | | pub(crate) struct FullCaseWriteable<'a, const IS_TITLE_CONTEXT: bool> { |
56 | | data: &'a CaseMapV1<'a>, |
57 | | src: &'a str, |
58 | | locale: CaseMapLocale, |
59 | | mapping: MappingKind, |
60 | | titlecase_tail_casing: TrailingCase, |
61 | | } |
62 | | |
63 | | impl<'a, const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'a, IS_TITLE_CONTEXT> { |
64 | | #[allow(clippy::indexing_slicing)] // last_uncopied_index and i are known to be in bounds |
65 | 0 | fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result { |
66 | 0 | let src = self.src; |
67 | 0 | let mut mapping = self.mapping; |
68 | 0 | let mut iter = src.char_indices(); |
69 | 0 | for (i, c) in &mut iter { |
70 | 0 | let context = ContextIterator::new(&src[..i], &src[i..]); |
71 | 0 | self.data |
72 | 0 | .full_helper::<IS_TITLE_CONTEXT, W>(c, context, self.locale, mapping, sink)?; |
73 | 0 | if IS_TITLE_CONTEXT { |
74 | 0 | if self.titlecase_tail_casing == TrailingCase::Lower { |
75 | 0 | mapping = MappingKind::Lower; |
76 | 0 | } else { |
77 | 0 | break; |
78 | | } |
79 | 0 | } |
80 | | } |
81 | | // Write the rest of the string |
82 | 0 | if IS_TITLE_CONTEXT && self.titlecase_tail_casing == TrailingCase::Unchanged { |
83 | 0 | sink.write_str(iter.as_str())?; |
84 | 0 | } |
85 | 0 | Ok(()) |
86 | 0 | } Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<false> as writeable::Writeable>::write_to::<diplomat_runtime::writeable::DiplomatWriteable> Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<true> as writeable::Writeable>::write_to::<diplomat_runtime::writeable::DiplomatWriteable> Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<false> as writeable::Writeable>::write_to::<alloc::string::String> Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<true> as writeable::Writeable>::write_to::<alloc::string::String> |
87 | 0 | fn writeable_length_hint(&self) -> writeable::LengthHint { |
88 | 0 | writeable::LengthHint::at_least(self.src.len()) |
89 | 0 | } Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<false> as writeable::Writeable>::writeable_length_hint Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<true> as writeable::Writeable>::writeable_length_hint |
90 | | } |
91 | | |
92 | | impl<'data> CaseMapV1<'data> { |
93 | 0 | fn simple_helper(&self, c: char, kind: MappingKind) -> char { |
94 | 0 | let data = self.lookup_data(c); |
95 | 0 | if !data.has_exception() { |
96 | 0 | if data.is_relevant_to(kind) { |
97 | 0 | let folded = c as i32 + data.delta() as i32; |
98 | | // GIGO: delta should be valid |
99 | 0 | char::from_u32(folded as u32).unwrap_or(c) |
100 | | } else { |
101 | 0 | c |
102 | | } |
103 | | } else { |
104 | 0 | let idx = data.exception_index(); |
105 | 0 | let exception = self.exceptions.get(idx); |
106 | 0 | if data.is_relevant_to(kind) { |
107 | 0 | if let Some(simple) = exception.get_simple_case_slot_for(c) { |
108 | 0 | return simple; |
109 | 0 | } |
110 | 0 | } |
111 | 0 | exception.slot_char_for_kind(kind).unwrap_or(c) |
112 | | } |
113 | 0 | } |
114 | | |
/// Returns the simple lowercase mapping of the given `char`
/// (delegates to `simple_helper` with `MappingKind::Lower`).
#[inline]
pub(crate) fn simple_lower(&self, c: char) -> char {
    self.simple_helper(c, MappingKind::Lower)
}

/// Returns the simple uppercase mapping of the given `char`
/// (delegates to `simple_helper` with `MappingKind::Upper`).
#[inline]
pub(crate) fn simple_upper(&self, c: char) -> char {
    self.simple_helper(c, MappingKind::Upper)
}

/// Returns the simple titlecase mapping of the given `char`
/// (delegates to `simple_helper` with `MappingKind::Title`).
#[inline]
pub(crate) fn simple_title(&self, c: char) -> char {
    self.simple_helper(c, MappingKind::Title)
}
132 | | |
133 | | // Return the simple case folding mapping of the given char. |
134 | | #[inline] |
135 | 0 | pub(crate) fn simple_fold(&self, c: char, options: FoldOptions) -> char { |
136 | 0 | let data = self.lookup_data(c); |
137 | 0 | if !data.has_exception() { |
138 | 0 | if data.is_upper_or_title() { |
139 | 0 | let folded = c as i32 + data.delta() as i32; |
140 | | // GIGO: delta should be valid |
141 | 0 | char::from_u32(folded as u32).unwrap_or(c) |
142 | | } else { |
143 | 0 | c |
144 | | } |
145 | | } else { |
146 | | // TODO: if we move conditional fold and no_simple_case_folding into |
147 | | // simple_helper, this function can just call simple_helper. |
148 | 0 | let idx = data.exception_index(); |
149 | 0 | let exception = self.exceptions.get(idx); |
150 | 0 | if exception.bits.has_conditional_fold() { |
151 | 0 | self.simple_fold_special_case(c, options) |
152 | 0 | } else if exception.bits.no_simple_case_folding() { |
153 | 0 | c |
154 | 0 | } else if data.is_upper_or_title() && exception.has_slot(ExceptionSlot::Delta) { |
155 | | // unwrap_or case should never happen but best to avoid panics |
156 | 0 | exception.get_simple_case_slot_for(c).unwrap_or('\0') |
157 | 0 | } else if let Some(slot_char) = exception.slot_char_for_kind(MappingKind::Fold) { |
158 | 0 | slot_char |
159 | | } else { |
160 | 0 | c |
161 | | } |
162 | | } |
163 | 0 | } |
164 | | |
165 | 0 | fn dot_type(&self, c: char) -> DotType { |
166 | 0 | let data = self.lookup_data(c); |
167 | 0 | if !data.has_exception() { |
168 | 0 | data.dot_type() |
169 | | } else { |
170 | 0 | let idx = data.exception_index(); |
171 | 0 | self.exceptions.get(idx).bits.dot_type() |
172 | | } |
173 | 0 | } |
174 | | |
175 | | // Returns true if this code point is is case-sensitive. |
176 | | // This is not currently exposed. |
177 | | #[allow(dead_code)] |
178 | 0 | fn is_case_sensitive(&self, c: char) -> bool { |
179 | 0 | let data = self.lookup_data(c); |
180 | 0 | if !data.has_exception() { |
181 | 0 | data.is_sensitive() |
182 | | } else { |
183 | 0 | let idx = data.exception_index(); |
184 | 0 | self.exceptions.get(idx).bits.is_sensitive() |
185 | | } |
186 | 0 | } |
187 | | |
188 | | /// Returns whether the character is cased |
189 | 0 | pub(crate) fn is_cased(&self, c: char) -> bool { |
190 | 0 | self.lookup_data(c).case_type().is_some() |
191 | 0 | } |
192 | | |
/// Writes the full case mapping of `c` for `kind` to `sink`.
///
/// `context` supplies the text before and after `c`; `locale` selects the hard-coded
/// language-specific behavior. Dutch IJ titlecasing and Greek uppercasing are handled
/// directly in this function; other conditional mappings are delegated to
/// `full_lower_special_case`, `full_fold_special_case` and
/// `full_upper_or_title_special_case` before the data-driven mappings are consulted.
#[inline(always)]
// IS_TITLE_CONTEXT must be true if kind is MappingKind::Title
// The kind may be a different kind with IS_TITLE_CONTEXT still true because
// titlecasing a segment involves switching to lowercase later
fn full_helper<const IS_TITLE_CONTEXT: bool, W: fmt::Write + ?Sized>(
    &self,
    c: char,
    context: ContextIterator,
    locale: CaseMapLocale,
    kind: MappingKind,
    sink: &mut W,
) -> fmt::Result {
    // If using a title mapping IS_TITLE_CONTEXT must be true
    debug_assert!(kind != MappingKind::Title || IS_TITLE_CONTEXT);
    // In a title context, kind MUST be Title or Lower
    debug_assert!(
        !IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower
    );

    // ICU4C's non-standard extension for Dutch IJ titlecasing
    // handled here instead of in full_lower_special_case because J does not have conditional
    // special casemapping.
    if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower {
        // When titlecasing, a J found immediately after an I at the beginning of the segment
        // should also uppercase. They are both allowed to have an acute accent but it must
        // be present on both letters or neither. They may not have any other combining marks.
        if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) {
            return sink.write_char('J');
        }
    }

    // ICU4C's non-standard extension for Greek uppercasing:
    // https://icu.unicode.org/design/case/greek-upper.
    // Effectively removes Greek accents from Greek vowels during uppercasing,
    // whilst attempting to preserve additional marks like the dialytika (diæresis)
    // and ypogegrammeni (combining small iota).
    if !IS_TITLE_CONTEXT && locale == CaseMapLocale::Greek && kind == MappingKind::Upper {
        // Remove all combining diacritics on a Greek letter.
        // Ypogegrammeni is not an accent mark and is handled by regular casemapping (it turns into
        // a capital iota).
        // The dialytika is removed here, but it may be added again when the base letter is being processed.
        if greek_to_me::is_greek_diacritic_except_ypogegrammeni(c)
            && context.preceded_by_greek_letter()
        {
            return Ok(());
        }
        let data = greek_to_me::get_data(c);
        // Check if the character is a Greek vowel
        match data {
            Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) => {
                // Get the diacritics on the character itself, and add any further combining diacritics
                // from the context.
                let mut diacritics = context.add_greek_diacritics(precomposed_diacritics);
                // If the previous vowel had an accent (which would be removed) but no dialytika,
                // and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate
                // the now-unaccented adjacent vowels from a digraph/diphthong.
                // Use a precomposed dialytika if the accent was precomposed, and a combining dialytika
                // if the accent was combining, so as to map NFD to NFD and NFC to NFC.
                if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ)
                {
                    if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() {
                        if !preceding_vowel.combining.dialytika
                            && !preceding_vowel.precomposed.dialytika
                        {
                            if preceding_vowel.combining.accented {
                                diacritics.dialytika = true;
                            } else {
                                precomposed_diacritics.dialytika =
                                    preceding_vowel.precomposed.accented;
                            }
                        }
                    }
                }
                // Write the base of the uppercased combining character sequence.
                // In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed.
                // In some branches the base has a precomposed diacritic.
                // In the case of the Greek disjunctive "or", a combining tonos may also be written.
                match vowel {
                    GreekVowel::Η => {
                        // The letter η (eta) is allowed to retain a tonos when it forms a single-letter word to distinguish
                        // the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή).
                        //
                        // A lone η with an accent other than the oxia/tonos is not expected,
                        // so there is no need to special-case the oxia/tonos.
                        // The ancient ᾖ (exist.PRS.SUBJ.3s) has a iota subscript as well as the circumflex,
                        // so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle
                        // (e.g. να είναι) since Byzantine times anyway.
                        if diacritics.accented
                            && !context.followed_by_cased_letter(self)
                            && !context.preceded_by_cased_letter(self)
                            && !diacritics.ypogegrammeni
                        {
                            if precomposed_diacritics.accented {
                                sink.write_char('Ή')?;
                            } else {
                                sink.write_char('Η')?;
                                sink.write_char(greek_to_me::TONOS)?;
                            }
                        } else {
                            sink.write_char('Η')?;
                        }
                    }
                    // For iota and upsilon a precomposed dialytika is written as part of the
                    // base letter, so the combining one below must not be written as well.
                    GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika {
                        diacritics.dialytika = false;
                        'Ϊ'
                    } else {
                        vowel.into()
                    })?,
                    GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika {
                        diacritics.dialytika = false;
                        'Ϋ'
                    } else {
                        vowel.into()
                    })?,
                    _ => sink.write_char(vowel.into())?,
                };
                if diacritics.dialytika {
                    sink.write_char(greek_to_me::DIALYTIKA)?;
                }
                // A precomposed ypogegrammeni is written out as a separate capital iota.
                if precomposed_diacritics.ypogegrammeni {
                    sink.write_char('Ι')?;
                }

                return Ok(());
            }
            // Rho might have breathing marks, we handle it specially
            // to remove them
            Some(GreekPrecomposedLetterData::Consonant(true)) => {
                sink.write_char(greek_to_me::CAPITAL_RHO)?;
                return Ok(());
            }
            _ => (),
        }
    }

    // Data-driven mapping: plain delta for characters without an exception entry.
    let data = self.lookup_data(c);
    if !data.has_exception() {
        if data.is_relevant_to(kind) {
            let mapped = c as i32 + data.delta() as i32;
            // GIGO: delta should be valid
            let mapped = char::from_u32(mapped as u32).unwrap_or(c);
            sink.write_char(mapped)
        } else {
            sink.write_char(c)
        }
    } else {
        let idx = data.exception_index();
        let exception = self.exceptions.get(idx);
        // Conditional (context- or locale-dependent) mappings take precedence.
        if exception.bits.has_conditional_special() {
            if let Some(special) = match kind {
                MappingKind::Lower => {
                    self.full_lower_special_case::<IS_TITLE_CONTEXT>(c, context, locale)
                }
                MappingKind::Fold => self.full_fold_special_case(c, context, locale),
                MappingKind::Upper | MappingKind::Title => self
                    .full_upper_or_title_special_case::<IS_TITLE_CONTEXT>(c, context, locale),
            } {
                return special.write_to(sink);
            }
        }
        // Next, a non-empty full (string) mapping for this kind.
        if let Some(mapped_string) = exception.get_fullmappings_slot_for_kind(kind) {
            if !mapped_string.is_empty() {
                return sink.write_str(mapped_string);
            }
        }

        if kind == MappingKind::Fold && exception.bits.no_simple_case_folding() {
            return sink.write_char(c);
        }

        if data.is_relevant_to(kind) {
            if let Some(simple) = exception.get_simple_case_slot_for(c) {
                return sink.write_char(simple);
            }
        }

        // Finally, the single-character slot for this kind, or the character unchanged.
        if let Some(slot_char) = exception.slot_char_for_kind(kind) {
            sink.write_char(slot_char)
        } else {
            sink.write_char(c)
        }
    }
}
376 | | |
// These constants are used for hardcoded locale-specific foldings.
// Each one is a base letter followed by U+0307 (combining dot above),
// optionally followed by one more combining accent.
/// `i` + U+0307.
const I_DOT: &'static str = "\u{69}\u{307}";
/// `j` + U+0307.
const J_DOT: &'static str = "\u{6a}\u{307}";
/// U+012F + U+0307.
const I_OGONEK_DOT: &'static str = "\u{12f}\u{307}";
/// `i` + U+0307 + U+0300.
const I_DOT_GRAVE: &'static str = "\u{69}\u{307}\u{300}";
/// `i` + U+0307 + U+0301.
const I_DOT_ACUTE: &'static str = "\u{69}\u{307}\u{301}";
/// `i` + U+0307 + U+0303.
const I_DOT_TILDE: &'static str = "\u{69}\u{307}\u{303}";
384 | | |
385 | | // Special case folding mappings, hardcoded. |
386 | | // This handles the special Turkic mappings for uppercase I and dotted uppercase I |
387 | | // For non-Turkic languages, this mapping is normally not used. |
388 | | // For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. |
389 | 0 | fn simple_fold_special_case(&self, c: char, options: FoldOptions) -> char { |
390 | 0 | debug_assert!(c == '\u{49}' || c == '\u{130}'); |
391 | 0 | let is_turkic = options.exclude_special_i; |
392 | 0 | match (c, is_turkic) { |
393 | | // Turkic mappings |
394 | 0 | ('\u{49}', true) => '\u{131}', // 0049; T; 0131; # LATIN CAPITAL LETTER I |
395 | 0 | ('\u{130}', true) => '\u{69}', /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
396 | | |
397 | | // Default mappings |
398 | 0 | ('\u{49}', false) => '\u{69}', // 0049; C; 0069; # LATIN CAPITAL LETTER I |
399 | | |
400 | | // There is no simple case folding for U+130. |
401 | 0 | (c, _) => c, |
402 | | } |
403 | 0 | } |
404 | | |
405 | 0 | fn full_lower_special_case<const IS_TITLE_CONTEXT: bool>( |
406 | 0 | &self, |
407 | 0 | c: char, |
408 | 0 | context: ContextIterator, |
409 | 0 | locale: CaseMapLocale, |
410 | 0 | ) -> Option<FullMappingResult> { |
411 | 0 | if locale == CaseMapLocale::Lithuanian { |
412 | | // Lithuanian retains the dot in a lowercase i when followed by accents. |
413 | | // Introduce an explicit dot above when lowercasing capital I's and J's |
414 | | // whenever there are more accents above (of the accents used in |
415 | | // Lithuanian: grave, acute, and tilde above). |
416 | | |
417 | | // Check for accents above I, J, and I-with-ogonek. |
418 | 0 | if c == 'I' && context.followed_by_more_above(self) { |
419 | 0 | return Some(FullMappingResult::String(Self::I_DOT)); |
420 | 0 | } else if c == 'J' && context.followed_by_more_above(self) { |
421 | 0 | return Some(FullMappingResult::String(Self::J_DOT)); |
422 | 0 | } else if c == '\u{12e}' && context.followed_by_more_above(self) { |
423 | 0 | return Some(FullMappingResult::String(Self::I_OGONEK_DOT)); |
424 | 0 | } |
425 | | |
426 | | // These characters are precomposed with accents above, so we don't |
427 | | // have to look at the context. |
428 | 0 | if c == '\u{cc}' { |
429 | 0 | return Some(FullMappingResult::String(Self::I_DOT_GRAVE)); |
430 | 0 | } else if c == '\u{cd}' { |
431 | 0 | return Some(FullMappingResult::String(Self::I_DOT_ACUTE)); |
432 | 0 | } else if c == '\u{128}' { |
433 | 0 | return Some(FullMappingResult::String(Self::I_DOT_TILDE)); |
434 | 0 | } |
435 | 0 | } |
436 | | |
437 | 0 | if locale == CaseMapLocale::Turkish { |
438 | 0 | if c == '\u{130}' { |
439 | | // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri |
440 | 0 | return Some(FullMappingResult::CodePoint('i')); |
441 | 0 | } else if c == '\u{307}' && context.preceded_by_capital_i::<IS_TITLE_CONTEXT>(self) { |
442 | | // When lowercasing, remove dot_above in the sequence I + dot_above, |
443 | | // which will turn into i. This matches the behaviour of the |
444 | | // canonically equivalent I-dot_above. |
445 | | // |
446 | | // In a titlecase context, we do not want to apply this behavior to cases where the I |
447 | | // was at the beginning of the string, as that I and its marks should be handled by the |
448 | | // uppercasing rules (which ignore it, see below) |
449 | | |
450 | 0 | return Some(FullMappingResult::Remove); |
451 | 0 | } else if c == 'I' && !context.followed_by_dot_above(self) { |
452 | | // When lowercasing, unless an I is before a dot_above, it turns |
453 | | // into a dotless i. |
454 | 0 | return Some(FullMappingResult::CodePoint('\u{131}')); |
455 | 0 | } |
456 | 0 | } |
457 | | |
458 | 0 | if c == '\u{130}' { |
459 | | // Preserve canonical equivalence for I with dot. Turkic is handled above. |
460 | 0 | return Some(FullMappingResult::String(Self::I_DOT)); |
461 | 0 | } |
462 | | |
463 | 0 | if c == '\u{3a3}' |
464 | 0 | && context.preceded_by_cased_letter(self) |
465 | 0 | && !context.followed_by_cased_letter(self) |
466 | | { |
467 | | // Greek capital sigman maps depending on surrounding cased letters. |
468 | 0 | return Some(FullMappingResult::CodePoint('\u{3c2}')); |
469 | 0 | } |
470 | | |
471 | | // No relevant special case mapping. Use a normal mapping. |
472 | 0 | None |
473 | 0 | } Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_lower_special_case::<false> Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_lower_special_case::<true> |
474 | | |
475 | 0 | fn full_upper_or_title_special_case<const IS_TITLE_CONTEXT: bool>( |
476 | 0 | &self, |
477 | 0 | c: char, |
478 | 0 | context: ContextIterator, |
479 | 0 | locale: CaseMapLocale, |
480 | 0 | ) -> Option<FullMappingResult> { |
481 | 0 | if locale == CaseMapLocale::Turkish && c == 'i' { |
482 | | // In Turkic languages, i turns into a dotted capital I. |
483 | 0 | return Some(FullMappingResult::CodePoint('\u{130}')); |
484 | 0 | } |
485 | 0 | if locale == CaseMapLocale::Lithuanian |
486 | 0 | && c == '\u{307}' |
487 | 0 | && context.preceded_by_soft_dotted(self) |
488 | | { |
489 | | // Lithuanian retains the dot in a lowercase i when followed by accents. |
490 | | // Remove dot_above after i with upper or titlecase. |
491 | 0 | return Some(FullMappingResult::Remove); |
492 | 0 | } |
493 | | // ICU4C's non-standard extension for Armenian ligature ech-yiwn. |
494 | 0 | if c == '\u{587}' { |
495 | 0 | return match (locale, IS_TITLE_CONTEXT) { |
496 | 0 | (CaseMapLocale::Armenian, false) => Some(FullMappingResult::String("ԵՎ")), |
497 | 0 | (CaseMapLocale::Armenian, true) => Some(FullMappingResult::String("Եվ")), |
498 | 0 | (_, false) => Some(FullMappingResult::String("ԵՒ")), |
499 | 0 | (_, true) => Some(FullMappingResult::String("Եւ")), |
500 | | }; |
501 | 0 | } |
502 | 0 | None |
503 | 0 | } Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_upper_or_title_special_case::<false> Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_upper_or_title_special_case::<true> |
504 | | |
505 | 0 | fn full_fold_special_case( |
506 | 0 | &self, |
507 | 0 | c: char, |
508 | 0 | _context: ContextIterator, |
509 | 0 | locale: CaseMapLocale, |
510 | 0 | ) -> Option<FullMappingResult> { |
511 | 0 | let is_turkic = locale == CaseMapLocale::Turkish; |
512 | 0 | match (c, is_turkic) { |
513 | | // Turkic mappings |
514 | 0 | ('\u{49}', true) => Some(FullMappingResult::CodePoint('\u{131}')), |
515 | 0 | ('\u{130}', true) => Some(FullMappingResult::CodePoint('\u{69}')), |
516 | | |
517 | | // Default mappings |
518 | 0 | ('\u{49}', false) => Some(FullMappingResult::CodePoint('\u{69}')), |
519 | 0 | ('\u{130}', false) => Some(FullMappingResult::String(Self::I_DOT)), |
520 | 0 | (_, _) => None, |
521 | | } |
522 | 0 | } |
/// Builds a [`FullCaseWriteable`] that case-maps `src` with this data when written.
///
/// IS_TITLE_CONTEXT is true iff the mapping is MappingKind::Title (checked with a
/// debug assertion); it primarily exists to avoid perf impacts on other more common
/// modes of operation.
///
/// titlecase_tail_casing is only read in IS_TITLE_CONTEXT
pub(crate) fn full_helper_writeable<'a: 'data, const IS_TITLE_CONTEXT: bool>(
    &'a self,
    src: &'a str,
    locale: CaseMapLocale,
    mapping: MappingKind,
    titlecase_tail_casing: TrailingCase,
) -> FullCaseWriteable<'a, IS_TITLE_CONTEXT> {
    // Ensure that they are either both true or both false, i.e. an XNOR operation
    debug_assert!(!(IS_TITLE_CONTEXT ^ (mapping == MappingKind::Title)));

    FullCaseWriteable::<IS_TITLE_CONTEXT> {
        data: self,
        src,
        locale,
        mapping,
        titlecase_tail_casing,
    }
}
545 | | |
546 | | /// Adds all simple case mappings and the full case folding for `c` to `set`. |
547 | | /// Also adds special case closure mappings. |
548 | | /// The character itself is not added. |
549 | | /// For example, the mappings |
550 | | /// - for s include long s |
551 | | /// - for sharp s include ss |
552 | | /// - for k include the Kelvin sign |
553 | 0 | pub(crate) fn add_case_closure_to<S: ClosureSink>(&self, c: char, set: &mut S) { |
554 | | // Hardcode the case closure of i and its relatives and ignore the |
555 | | // data file data for these characters. |
556 | | // The Turkic dotless i and dotted I with their case mapping conditions |
557 | | // and case folding option make the related characters behave specially. |
558 | | // This code matches their closure behavior to their case folding behavior. |
559 | 0 | match c { |
560 | | // Regular i and I are in one equivalence class. |
561 | | '\u{49}' => { |
562 | 0 | set.add_char('\u{69}'); |
563 | 0 | return; |
564 | | } |
565 | | '\u{69}' => { |
566 | 0 | set.add_char('\u{49}'); |
567 | 0 | return; |
568 | | } |
569 | | |
570 | | // Dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) |
571 | | '\u{130}' => { |
572 | 0 | set.add_string(Self::I_DOT); |
573 | 0 | return; |
574 | | } |
575 | | |
576 | | // Dotless i is in a class by itself |
577 | | '\u{131}' => { |
578 | 0 | return; |
579 | | } |
580 | | |
581 | 0 | _ => {} |
582 | | } |
583 | | |
584 | 0 | let data = self.lookup_data(c); |
585 | 0 | if !data.has_exception() { |
586 | 0 | if data.case_type().is_some() { |
587 | 0 | let delta = data.delta() as i32; |
588 | 0 | if delta != 0 { |
589 | 0 | // Add the one simple case mapping, no matter what type it is. |
590 | 0 | let codepoint = c as i32 + delta; |
591 | 0 | // GIGO: delta should be valid |
592 | 0 | let mapped = char::from_u32(codepoint as u32).unwrap_or(c); |
593 | 0 | set.add_char(mapped); |
594 | 0 | } |
595 | 0 | } |
596 | 0 | return; |
597 | 0 | } |
598 | | |
599 | | // c has exceptions, so there may be multiple simple and/or full case mappings. |
600 | 0 | let idx = data.exception_index(); |
601 | 0 | let exception = self.exceptions.get(idx); |
602 | | |
603 | | // Add all simple case mappings. |
604 | 0 | for slot in [ |
605 | 0 | ExceptionSlot::Lower, |
606 | 0 | ExceptionSlot::Fold, |
607 | 0 | ExceptionSlot::Upper, |
608 | 0 | ExceptionSlot::Title, |
609 | | ] { |
610 | 0 | if let Some(simple) = exception.get_char_slot(slot) { |
611 | 0 | set.add_char(simple); |
612 | 0 | } |
613 | | } |
614 | 0 | if let Some(simple) = exception.get_simple_case_slot_for(c) { |
615 | 0 | set.add_char(simple); |
616 | 0 | } |
617 | | |
618 | 0 | exception.add_full_and_closure_mappings(set); |
619 | 0 | } Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::add_case_closure_to::<icu_collections::codepointinvlist::builder::CodePointInversionListBuilder> Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::add_case_closure_to::<_> |
620 | | |
621 | | /// Maps the string to single code points and adds the associated case closure |
622 | | /// mappings. |
623 | | /// |
624 | | /// (see docs on CaseMapper::add_string_case_closure_to) |
625 | 0 | pub(crate) fn add_string_case_closure_to<S: ClosureSink>( |
626 | 0 | &self, |
627 | 0 | s: &str, |
628 | 0 | set: &mut S, |
629 | 0 | unfold_data: &CaseMapUnfoldV1, |
630 | 0 | ) -> bool { |
631 | 0 | if s.chars().count() <= 1 { |
632 | | // The string is too short to find any match. |
633 | 0 | return false; |
634 | 0 | } |
635 | 0 | match unfold_data.get(s) { |
636 | 0 | Some(closure_string) => { |
637 | 0 | for c in closure_string.chars() { |
638 | 0 | set.add_char(c); |
639 | 0 | self.add_case_closure_to(c, set); |
640 | 0 | } |
641 | 0 | true |
642 | | } |
643 | 0 | None => false, |
644 | | } |
645 | 0 | } Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::add_string_case_closure_to::<icu_collections::codepointinvlist::builder::CodePointInversionListBuilder> Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::add_string_case_closure_to::<_> |
646 | | } |
647 | | |
/// An internal representation of locale. Non-Root values of this
/// enumeration imply that hard-coded special cases exist for this
/// language.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum CaseMapLocale {
    /// No language-specific special cases.
    Root,
    /// Selected for both the `tr` and `az` language subtags.
    Turkish,
    /// Selected for the `lt` language subtag.
    Lithuanian,
    /// Selected for the `el` language subtag.
    Greek,
    /// Selected for the `nl` language subtag.
    Dutch,
    /// Selected for the `hy` language subtag.
    Armenian,
}
660 | | |
impl CaseMapLocale {
    /// Maps the language subtag of `langid` to the locale whose hard-coded special
    /// cases apply. Only `langid.language` is inspected; any language without
    /// special cases maps to `Root`.
    pub const fn from_langid(langid: &LanguageIdentifier) -> Self {
        use icu_locid::subtags::{language, Language};
        // Named constants so the language subtags can be used as match patterns.
        const TR: Language = language!("tr");
        const AZ: Language = language!("az");
        const LT: Language = language!("lt");
        const EL: Language = language!("el");
        const NL: Language = language!("nl");
        const HY: Language = language!("hy");
        match langid.language {
            TR | AZ => Self::Turkish,
            LT => Self::Lithuanian,
            EL => Self::Greek,
            NL => Self::Dutch,
            HY => Self::Armenian,
            _ => Self::Root,
        }
    }
}
680 | | |
681 | | pub enum FullMappingResult<'a> { |
682 | | Remove, |
683 | | CodePoint(char), |
684 | | String(&'a str), |
685 | | } |
686 | | |
687 | | impl<'a> FullMappingResult<'a> { |
688 | | #[allow(dead_code)] |
689 | 0 | fn add_to_set<S: ClosureSink>(&self, set: &mut S) { |
690 | 0 | match *self { |
691 | 0 | FullMappingResult::CodePoint(c) => set.add_char(c), |
692 | 0 | FullMappingResult::String(s) => set.add_string(s), |
693 | 0 | FullMappingResult::Remove => {} |
694 | | } |
695 | 0 | } |
696 | | } |
697 | | |
698 | | impl Writeable for FullMappingResult<'_> { |
699 | 0 | fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result { |
700 | 0 | match *self { |
701 | 0 | FullMappingResult::CodePoint(c) => sink.write_char(c), |
702 | 0 | FullMappingResult::String(s) => sink.write_str(s), |
703 | 0 | FullMappingResult::Remove => Ok(()), |
704 | | } |
705 | 0 | } Unexecuted instantiation: <icu_casemap::internals::FullMappingResult as writeable::Writeable>::write_to::<diplomat_runtime::writeable::DiplomatWriteable> Unexecuted instantiation: <icu_casemap::internals::FullMappingResult as writeable::Writeable>::write_to::<alloc::string::String> |
706 | | } |
707 | | |
/// The text surrounding a single character, used to answer
/// context-dependent questions about what precedes and follows it.
pub(crate) struct ContextIterator<'a> {
    /// Text that comes before the character under consideration.
    before: &'a str,
    /// Text that comes after the character under consideration
    /// (the character itself is not included).
    after: &'a str,
}
712 | | |
713 | | impl<'a> ContextIterator<'a> { |
714 | | // Returns a context iterator with the characters before |
715 | | // and after the character at a given index, given the preceding |
716 | | // string and the succeeding string including the character itself |
717 | 0 | pub fn new(before: &'a str, char_and_after: &'a str) -> Self { |
718 | 0 | let mut char_and_after = char_and_after.chars(); |
719 | 0 | char_and_after.next(); // skip the character itself |
720 | 0 | let after = char_and_after.as_str(); |
721 | 0 | Self { before, after } |
722 | 0 | } |
723 | | |
724 | 0 | fn add_greek_diacritics(&self, mut diacritics: GreekDiacritics) -> GreekDiacritics { |
725 | 0 | diacritics.consume_greek_diacritics(self.after); |
726 | 0 | diacritics |
727 | 0 | } |
728 | | |
729 | 0 | fn preceded_by_greek_letter(&self) -> bool { |
730 | 0 | greek_to_me::preceded_by_greek_letter(self.before) |
731 | 0 | } |
732 | | |
733 | 0 | fn preceding_greek_vowel_diacritics( |
734 | 0 | &self, |
735 | 0 | ) -> Option<GreekCombiningCharacterSequenceDiacritics> { |
736 | 0 | greek_to_me::preceding_greek_vowel_diacritics(self.before) |
737 | 0 | } |
738 | | |
739 | 0 | fn preceded_by_soft_dotted(&self, mapping: &CaseMapV1) -> bool { |
740 | 0 | for c in self.before.chars().rev() { |
741 | 0 | match mapping.dot_type(c) { |
742 | 0 | DotType::SoftDotted => return true, |
743 | 0 | DotType::OtherAccent => continue, |
744 | 0 | _ => return false, |
745 | | } |
746 | | } |
747 | 0 | false |
748 | 0 | } |
749 | | /// Checks if the preceding character is a capital I, allowing for non-Above combining characters in between. |
750 | | /// |
751 | | /// If I_MUST_NOT_START_STRING is true, additionally will require that the capital I does not start the string |
752 | 0 | fn preceded_by_capital_i<const I_MUST_NOT_START_STRING: bool>( |
753 | 0 | &self, |
754 | 0 | mapping: &CaseMapV1, |
755 | 0 | ) -> bool { |
756 | 0 | let mut iter = self.before.chars().rev(); |
757 | 0 | while let Some(c) = iter.next() { |
758 | 0 | if c == 'I' { |
759 | 0 | if I_MUST_NOT_START_STRING { |
760 | 0 | return iter.next().is_some(); |
761 | | } else { |
762 | 0 | return true; |
763 | | } |
764 | 0 | } |
765 | 0 | if mapping.dot_type(c) != DotType::OtherAccent { |
766 | 0 | break; |
767 | 0 | } |
768 | | } |
769 | 0 | false |
770 | 0 | } Unexecuted instantiation: <icu_casemap::internals::ContextIterator>::preceded_by_capital_i::<false> Unexecuted instantiation: <icu_casemap::internals::ContextIterator>::preceded_by_capital_i::<true> |
771 | 0 | fn preceded_by_cased_letter(&self, mapping: &CaseMapV1) -> bool { |
772 | 0 | for c in self.before.chars().rev() { |
773 | 0 | let data = mapping.lookup_data(c); |
774 | 0 | if !data.is_ignorable() { |
775 | 0 | return data.case_type().is_some(); |
776 | 0 | } |
777 | | } |
778 | 0 | false |
779 | 0 | } |
780 | 0 | fn followed_by_cased_letter(&self, mapping: &CaseMapV1) -> bool { |
781 | 0 | for c in self.after.chars() { |
782 | 0 | let data = mapping.lookup_data(c); |
783 | 0 | if !data.is_ignorable() { |
784 | 0 | return data.case_type().is_some(); |
785 | 0 | } |
786 | | } |
787 | 0 | false |
788 | 0 | } |
789 | 0 | fn followed_by_more_above(&self, mapping: &CaseMapV1) -> bool { |
790 | 0 | for c in self.after.chars() { |
791 | 0 | match mapping.dot_type(c) { |
792 | 0 | DotType::Above => return true, |
793 | 0 | DotType::OtherAccent => continue, |
794 | 0 | _ => return false, |
795 | | } |
796 | | } |
797 | 0 | false |
798 | 0 | } |
799 | 0 | fn followed_by_dot_above(&self, mapping: &CaseMapV1) -> bool { |
800 | 0 | for c in self.after.chars() { |
801 | 0 | if c == '\u{307}' { |
802 | 0 | return true; |
803 | 0 | } |
804 | 0 | if mapping.dot_type(c) != DotType::OtherAccent { |
805 | 0 | return false; |
806 | 0 | } |
807 | | } |
808 | 0 | false |
809 | 0 | } |
810 | | |
811 | | /// Checks the preceding and surrounding context of a j or J |
812 | | /// and returns true if it is preceded by an i or I at the start of the string. |
813 | | /// If one has an acute accent, |
814 | | /// both must have the accent for this to return true. No other accents are handled. |
815 | 0 | fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMapV1) -> bool { |
816 | 0 | let mut before = self.before.chars().rev(); |
817 | 0 | let mut i_has_acute = false; |
818 | | loop { |
819 | 0 | match before.next() { |
820 | 0 | Some('i') | Some('I') => break, |
821 | | Some('í') | Some('Í') => { |
822 | 0 | i_has_acute = true; |
823 | 0 | break; |
824 | | } |
825 | 0 | Some(ACUTE) => i_has_acute = true, |
826 | 0 | _ => return false, |
827 | | } |
828 | | } |
829 | | |
830 | 0 | if before.next().is_some() { |
831 | | // not at the beginning of a string, doesn't matter |
832 | 0 | return false; |
833 | 0 | } |
834 | 0 | let mut j_has_acute = false; |
835 | 0 | for c in self.after.chars() { |
836 | 0 | if c == ACUTE { |
837 | 0 | j_has_acute = true; |
838 | 0 | continue; |
839 | 0 | } |
840 | | // We are supposed to check that `j` has no other combining marks aside |
841 | | // from potentially an acute accent. Once we hit the first non-combining mark |
842 | | // we are done. |
843 | | // |
844 | | // ICU4C checks for `gc=Mn` to determine if something is a combining mark, |
845 | | // however this requires extra data (and is the *only* point in the casemapping algorithm |
846 | | // where there is a direct dependency on properties data not mediated by the casemapping data trie). |
847 | | // |
848 | | // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does. |
849 | | // |
850 | | // See https://unicode-org.atlassian.net/browse/ICU-22429 |
851 | 0 | match mapping.dot_type(c) { |
852 | | // Not a combining character; ccc = 0 |
853 | 0 | DotType::NoDot | DotType::SoftDotted => break, |
854 | | // found combining character, bail |
855 | 0 | _ => return false, |
856 | | } |
857 | | } |
858 | | |
859 | | // either both should have an acute accent, or none. this is an XNOR operation |
860 | 0 | !(j_has_acute ^ i_has_acute) |
861 | 0 | } |
862 | | } |