Coverage Report

Created: 2026-02-14 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/icu_casemap-1.5.1/src/internals.rs
Line
Count
Source
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5
//! This module contains most of the actual algorithms for case mapping.
6
//!
7
//! Primarily, it implements methods on `CaseMapV1`, which contains the data model.
8
9
use crate::greek_to_me::{
10
    self, GreekCombiningCharacterSequenceDiacritics, GreekDiacritics, GreekPrecomposedLetterData,
11
    GreekVowel,
12
};
13
use crate::provider::data::{DotType, MappingKind};
14
use crate::provider::exception_helpers::ExceptionSlot;
15
use crate::provider::{CaseMapUnfoldV1, CaseMapV1};
16
use crate::set::ClosureSink;
17
use crate::titlecase::TrailingCase;
18
use core::fmt;
19
use icu_locid::LanguageIdentifier;
20
use writeable::Writeable;
21
22
const ACUTE: char = '\u{301}';
23
24
// Used to control the behavior of CaseMapper::fold.
25
// Currently only used to decide whether to use Turkic (T) mappings for dotted/dotless i.
26
#[derive(Copy, Clone, Default)]
27
pub struct FoldOptions {
28
    exclude_special_i: bool,
29
}
30
31
impl FoldOptions {
32
0
    pub fn with_turkic_mappings() -> Self {
33
0
        Self {
34
0
            exclude_special_i: true,
35
0
        }
36
0
    }
37
}
38
39
/// Helper type that wraps a writeable in a prefix string
40
pub(crate) struct StringAndWriteable<'a, W> {
41
    pub string: &'a str,
42
    pub writeable: W,
43
}
44
45
impl<'a, Wr: Writeable> Writeable for StringAndWriteable<'a, Wr> {
46
0
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
47
0
        sink.write_str(self.string)?;
48
0
        self.writeable.write_to(sink)
49
0
    }
Unexecuted instantiation: <icu_casemap::internals::StringAndWriteable<icu_casemap::internals::FullCaseWriteable<true>> as writeable::Writeable>::write_to::<diplomat_runtime::writeable::DiplomatWriteable>
Unexecuted instantiation: <icu_casemap::internals::StringAndWriteable<icu_casemap::internals::FullCaseWriteable<true>> as writeable::Writeable>::write_to::<alloc::string::String>
50
0
    fn writeable_length_hint(&self) -> writeable::LengthHint {
51
0
        writeable::LengthHint::exact(self.string.len()) + self.writeable.writeable_length_hint()
52
0
    }
53
}
54
55
pub(crate) struct FullCaseWriteable<'a, const IS_TITLE_CONTEXT: bool> {
56
    data: &'a CaseMapV1<'a>,
57
    src: &'a str,
58
    locale: CaseMapLocale,
59
    mapping: MappingKind,
60
    titlecase_tail_casing: TrailingCase,
61
}
62
63
impl<'a, const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'a, IS_TITLE_CONTEXT> {
64
    #[allow(clippy::indexing_slicing)] // last_uncopied_index and i are known to be in bounds
65
0
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
66
0
        let src = self.src;
67
0
        let mut mapping = self.mapping;
68
0
        let mut iter = src.char_indices();
69
0
        for (i, c) in &mut iter {
70
0
            let context = ContextIterator::new(&src[..i], &src[i..]);
71
0
            self.data
72
0
                .full_helper::<IS_TITLE_CONTEXT, W>(c, context, self.locale, mapping, sink)?;
73
0
            if IS_TITLE_CONTEXT {
74
0
                if self.titlecase_tail_casing == TrailingCase::Lower {
75
0
                    mapping = MappingKind::Lower;
76
0
                } else {
77
0
                    break;
78
                }
79
0
            }
80
        }
81
        // Write the rest of the string
82
0
        if IS_TITLE_CONTEXT && self.titlecase_tail_casing == TrailingCase::Unchanged {
83
0
            sink.write_str(iter.as_str())?;
84
0
        }
85
0
        Ok(())
86
0
    }
Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<false> as writeable::Writeable>::write_to::<diplomat_runtime::writeable::DiplomatWriteable>
Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<true> as writeable::Writeable>::write_to::<diplomat_runtime::writeable::DiplomatWriteable>
Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<false> as writeable::Writeable>::write_to::<alloc::string::String>
Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<true> as writeable::Writeable>::write_to::<alloc::string::String>
87
0
    fn writeable_length_hint(&self) -> writeable::LengthHint {
88
0
        writeable::LengthHint::at_least(self.src.len())
89
0
    }
Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<false> as writeable::Writeable>::writeable_length_hint
Unexecuted instantiation: <icu_casemap::internals::FullCaseWriteable<true> as writeable::Writeable>::writeable_length_hint
90
}
91
92
impl<'data> CaseMapV1<'data> {
93
0
    fn simple_helper(&self, c: char, kind: MappingKind) -> char {
94
0
        let data = self.lookup_data(c);
95
0
        if !data.has_exception() {
96
0
            if data.is_relevant_to(kind) {
97
0
                let folded = c as i32 + data.delta() as i32;
98
                // GIGO: delta should be valid
99
0
                char::from_u32(folded as u32).unwrap_or(c)
100
            } else {
101
0
                c
102
            }
103
        } else {
104
0
            let idx = data.exception_index();
105
0
            let exception = self.exceptions.get(idx);
106
0
            if data.is_relevant_to(kind) {
107
0
                if let Some(simple) = exception.get_simple_case_slot_for(c) {
108
0
                    return simple;
109
0
                }
110
0
            }
111
0
            exception.slot_char_for_kind(kind).unwrap_or(c)
112
        }
113
0
    }
114
115
    // Returns the lowercase mapping of the given `char`.
116
    #[inline]
117
0
    pub(crate) fn simple_lower(&self, c: char) -> char {
118
0
        self.simple_helper(c, MappingKind::Lower)
119
0
    }
120
121
    // Returns the uppercase mapping of the given `char`.
122
    #[inline]
123
0
    pub(crate) fn simple_upper(&self, c: char) -> char {
124
0
        self.simple_helper(c, MappingKind::Upper)
125
0
    }
126
127
    // Returns the titlecase mapping of the given `char`.
128
    #[inline]
129
0
    pub(crate) fn simple_title(&self, c: char) -> char {
130
0
        self.simple_helper(c, MappingKind::Title)
131
0
    }
132
133
    // Return the simple case folding mapping of the given char.
134
    #[inline]
135
0
    pub(crate) fn simple_fold(&self, c: char, options: FoldOptions) -> char {
136
0
        let data = self.lookup_data(c);
137
0
        if !data.has_exception() {
138
0
            if data.is_upper_or_title() {
139
0
                let folded = c as i32 + data.delta() as i32;
140
                // GIGO: delta should be valid
141
0
                char::from_u32(folded as u32).unwrap_or(c)
142
            } else {
143
0
                c
144
            }
145
        } else {
146
            // TODO: if we move conditional fold and no_simple_case_folding into
147
            // simple_helper, this function can just call simple_helper.
148
0
            let idx = data.exception_index();
149
0
            let exception = self.exceptions.get(idx);
150
0
            if exception.bits.has_conditional_fold() {
151
0
                self.simple_fold_special_case(c, options)
152
0
            } else if exception.bits.no_simple_case_folding() {
153
0
                c
154
0
            } else if data.is_upper_or_title() && exception.has_slot(ExceptionSlot::Delta) {
155
                // unwrap_or case should never happen but best to avoid panics
156
0
                exception.get_simple_case_slot_for(c).unwrap_or('\0')
157
0
            } else if let Some(slot_char) = exception.slot_char_for_kind(MappingKind::Fold) {
158
0
                slot_char
159
            } else {
160
0
                c
161
            }
162
        }
163
0
    }
164
165
0
    fn dot_type(&self, c: char) -> DotType {
166
0
        let data = self.lookup_data(c);
167
0
        if !data.has_exception() {
168
0
            data.dot_type()
169
        } else {
170
0
            let idx = data.exception_index();
171
0
            self.exceptions.get(idx).bits.dot_type()
172
        }
173
0
    }
174
175
    // Returns true if this code point is is case-sensitive.
176
    // This is not currently exposed.
177
    #[allow(dead_code)]
178
0
    fn is_case_sensitive(&self, c: char) -> bool {
179
0
        let data = self.lookup_data(c);
180
0
        if !data.has_exception() {
181
0
            data.is_sensitive()
182
        } else {
183
0
            let idx = data.exception_index();
184
0
            self.exceptions.get(idx).bits.is_sensitive()
185
        }
186
0
    }
187
188
    /// Returns whether the character is cased
189
0
    pub(crate) fn is_cased(&self, c: char) -> bool {
190
0
        self.lookup_data(c).case_type().is_some()
191
0
    }
192
193
    #[inline(always)]
194
    // IS_TITLE_CONTEXT must be true if kind is MappingKind::Title
195
    // The kind may be a different kind with IS_TITLE_CONTEXT still true because
196
    // titlecasing a segment involves switching to lowercase later
197
0
    fn full_helper<const IS_TITLE_CONTEXT: bool, W: fmt::Write + ?Sized>(
198
0
        &self,
199
0
        c: char,
200
0
        context: ContextIterator,
201
0
        locale: CaseMapLocale,
202
0
        kind: MappingKind,
203
0
        sink: &mut W,
204
0
    ) -> fmt::Result {
205
        // If using a title mapping IS_TITLE_CONTEXT must be true
206
0
        debug_assert!(kind != MappingKind::Title || IS_TITLE_CONTEXT);
207
        // In a title context, kind MUST be Title or Lower
208
0
        debug_assert!(
209
0
            !IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower
210
        );
211
212
        // ICU4C's non-standard extension for Dutch IJ titlecasing
213
        // handled here instead of in full_lower_special_case because J does not have conditional
214
        // special casemapping.
215
0
        if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower {
216
            // When titlecasing, a J found immediately after an I at the beginning of the segment
217
            // should also uppercase. They are both allowed to have an acute accent but it must
218
            // be present on both letters or neither. They may not have any other combining marks.
219
0
            if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) {
220
0
                return sink.write_char('J');
221
0
            }
222
0
        }
223
224
        // ICU4C's non-standard extension for Greek uppercasing:
225
        // https://icu.unicode.org/design/case/greek-upper.
226
        // Effectively removes Greek accents from Greek vowels during uppercasing,
227
        // whilst attempting to preserve additional marks like the dialytika (diæresis)
228
        // and ypogegrammeni (combining small iota).
229
0
        if !IS_TITLE_CONTEXT && locale == CaseMapLocale::Greek && kind == MappingKind::Upper {
230
            // Remove all combining diacritics on a Greek letter.
231
            // Ypogegrammeni is not an accent mark and is handled by regular casemapping (it turns into
232
            // a capital iota).
233
            // The dialytika is removed here, but it may be added again when the base letter is being processed.
234
0
            if greek_to_me::is_greek_diacritic_except_ypogegrammeni(c)
235
0
                && context.preceded_by_greek_letter()
236
            {
237
0
                return Ok(());
238
0
            }
239
0
            let data = greek_to_me::get_data(c);
240
            // Check if the character is a Greek vowel
241
0
            match data {
242
0
                Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) => {
243
                    // Get the diacritics on the character itself, and add any further combining diacritics
244
                    // from the context.
245
0
                    let mut diacritics = context.add_greek_diacritics(precomposed_diacritics);
246
                    // If the previous vowel had an accent (which would be removed) but no dialytika,
247
                    // and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate
248
                    // the now-unaccented adjacent vowels from a digraph/diphthong.
249
                    // Use a precomposed dialytika if the accent was precomposed, and a combining dialytika
250
                    // if the accent was combining, so as to map NFD to NFD and NFC to NFC.
251
0
                    if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ)
252
                    {
253
0
                        if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() {
254
0
                            if !preceding_vowel.combining.dialytika
255
0
                                && !preceding_vowel.precomposed.dialytika
256
                            {
257
0
                                if preceding_vowel.combining.accented {
258
0
                                    diacritics.dialytika = true;
259
0
                                } else {
260
0
                                    precomposed_diacritics.dialytika =
261
0
                                        preceding_vowel.precomposed.accented;
262
0
                                }
263
0
                            }
264
0
                        }
265
0
                    }
266
                    // Write the base of the uppercased combining character sequence.
267
                    // In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed.
268
                    // In some branches the base has a precomposed diacritic.
269
                    // In the case of the Greek disjunctive "or", a combining tonos may also be written.
270
0
                    match vowel {
271
                        GreekVowel::Η => {
272
                            // The letter η (eta) is allowed to retain a tonos when it is form a single-letter word to distinguish
273
                            // the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή).
274
                            //
275
                            // A lone η with an accent other than the oxia/tonos is not expected,
276
                            // so there is no need to special-case the oxia/tonos.
277
                            // The ancient ᾖ (exist.PRS.SUBJ.3s) has a iota subscript as well as the circumflex,
278
                            // so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle
279
                            // (e.g. να είναι) since Byzantine times anyway.
280
0
                            if diacritics.accented
281
0
                                && !context.followed_by_cased_letter(self)
282
0
                                && !context.preceded_by_cased_letter(self)
283
0
                                && !diacritics.ypogegrammeni
284
                            {
285
0
                                if precomposed_diacritics.accented {
286
0
                                    sink.write_char('Ή')?;
287
                                } else {
288
0
                                    sink.write_char('Η')?;
289
0
                                    sink.write_char(greek_to_me::TONOS)?;
290
                                }
291
                            } else {
292
0
                                sink.write_char('Η')?;
293
                            }
294
                        }
295
0
                        GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika {
296
0
                            diacritics.dialytika = false;
297
0
                            'Ϊ'
298
                        } else {
299
0
                            vowel.into()
300
0
                        })?,
301
0
                        GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika {
302
0
                            diacritics.dialytika = false;
303
0
                            'Ϋ'
304
                        } else {
305
0
                            vowel.into()
306
0
                        })?,
307
0
                        _ => sink.write_char(vowel.into())?,
308
                    };
309
0
                    if diacritics.dialytika {
310
0
                        sink.write_char(greek_to_me::DIALYTIKA)?;
311
0
                    }
312
0
                    if precomposed_diacritics.ypogegrammeni {
313
0
                        sink.write_char('Ι')?;
314
0
                    }
315
316
0
                    return Ok(());
317
                }
318
                // Rho might have breathing marks, we handle it specially
319
                // to remove them
320
                Some(GreekPrecomposedLetterData::Consonant(true)) => {
321
0
                    sink.write_char(greek_to_me::CAPITAL_RHO)?;
322
0
                    return Ok(());
323
                }
324
0
                _ => (),
325
            }
326
0
        }
327
328
0
        let data = self.lookup_data(c);
329
0
        if !data.has_exception() {
330
0
            if data.is_relevant_to(kind) {
331
0
                let mapped = c as i32 + data.delta() as i32;
332
                // GIGO: delta should be valid
333
0
                let mapped = char::from_u32(mapped as u32).unwrap_or(c);
334
0
                sink.write_char(mapped)
335
            } else {
336
0
                sink.write_char(c)
337
            }
338
        } else {
339
0
            let idx = data.exception_index();
340
0
            let exception = self.exceptions.get(idx);
341
0
            if exception.bits.has_conditional_special() {
342
0
                if let Some(special) = match kind {
343
                    MappingKind::Lower => {
344
0
                        self.full_lower_special_case::<IS_TITLE_CONTEXT>(c, context, locale)
345
                    }
346
0
                    MappingKind::Fold => self.full_fold_special_case(c, context, locale),
347
0
                    MappingKind::Upper | MappingKind::Title => self
348
0
                        .full_upper_or_title_special_case::<IS_TITLE_CONTEXT>(c, context, locale),
349
                } {
350
0
                    return special.write_to(sink);
351
0
                }
352
0
            }
353
0
            if let Some(mapped_string) = exception.get_fullmappings_slot_for_kind(kind) {
354
0
                if !mapped_string.is_empty() {
355
0
                    return sink.write_str(mapped_string);
356
0
                }
357
0
            }
358
359
0
            if kind == MappingKind::Fold && exception.bits.no_simple_case_folding() {
360
0
                return sink.write_char(c);
361
0
            }
362
363
0
            if data.is_relevant_to(kind) {
364
0
                if let Some(simple) = exception.get_simple_case_slot_for(c) {
365
0
                    return sink.write_char(simple);
366
0
                }
367
0
            }
368
369
0
            if let Some(slot_char) = exception.slot_char_for_kind(kind) {
370
0
                sink.write_char(slot_char)
371
            } else {
372
0
                sink.write_char(c)
373
            }
374
        }
375
0
    }
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_helper::<false, diplomat_runtime::writeable::DiplomatWriteable>
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_helper::<true, diplomat_runtime::writeable::DiplomatWriteable>
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_helper::<false, alloc::string::String>
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_helper::<true, alloc::string::String>
376
377
    // These constants are used for hardcoded locale-specific foldings.
378
    const I_DOT: &'static str = "\u{69}\u{307}";
379
    const J_DOT: &'static str = "\u{6a}\u{307}";
380
    const I_OGONEK_DOT: &'static str = "\u{12f}\u{307}";
381
    const I_DOT_GRAVE: &'static str = "\u{69}\u{307}\u{300}";
382
    const I_DOT_ACUTE: &'static str = "\u{69}\u{307}\u{301}";
383
    const I_DOT_TILDE: &'static str = "\u{69}\u{307}\u{303}";
384
385
    // Special case folding mappings, hardcoded.
386
    // This handles the special Turkic mappings for uppercase I and dotted uppercase I
387
    // For non-Turkic languages, this mapping is normally not used.
388
    // For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
389
0
    fn simple_fold_special_case(&self, c: char, options: FoldOptions) -> char {
390
0
        debug_assert!(c == '\u{49}' || c == '\u{130}');
391
0
        let is_turkic = options.exclude_special_i;
392
0
        match (c, is_turkic) {
393
            // Turkic mappings
394
0
            ('\u{49}', true) => '\u{131}', // 0049; T; 0131; # LATIN CAPITAL LETTER I
395
0
            ('\u{130}', true) => '\u{69}', /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
396
397
            // Default mappings
398
0
            ('\u{49}', false) => '\u{69}', // 0049; C; 0069; # LATIN CAPITAL LETTER I
399
400
            // There is no simple case folding for U+130.
401
0
            (c, _) => c,
402
        }
403
0
    }
404
405
0
    fn full_lower_special_case<const IS_TITLE_CONTEXT: bool>(
406
0
        &self,
407
0
        c: char,
408
0
        context: ContextIterator,
409
0
        locale: CaseMapLocale,
410
0
    ) -> Option<FullMappingResult> {
411
0
        if locale == CaseMapLocale::Lithuanian {
412
            // Lithuanian retains the dot in a lowercase i when followed by accents.
413
            // Introduce an explicit dot above when lowercasing capital I's and J's
414
            // whenever there are more accents above (of the accents used in
415
            // Lithuanian: grave, acute, and tilde above).
416
417
            // Check for accents above I, J, and I-with-ogonek.
418
0
            if c == 'I' && context.followed_by_more_above(self) {
419
0
                return Some(FullMappingResult::String(Self::I_DOT));
420
0
            } else if c == 'J' && context.followed_by_more_above(self) {
421
0
                return Some(FullMappingResult::String(Self::J_DOT));
422
0
            } else if c == '\u{12e}' && context.followed_by_more_above(self) {
423
0
                return Some(FullMappingResult::String(Self::I_OGONEK_DOT));
424
0
            }
425
426
            // These characters are precomposed with accents above, so we don't
427
            // have to look at the context.
428
0
            if c == '\u{cc}' {
429
0
                return Some(FullMappingResult::String(Self::I_DOT_GRAVE));
430
0
            } else if c == '\u{cd}' {
431
0
                return Some(FullMappingResult::String(Self::I_DOT_ACUTE));
432
0
            } else if c == '\u{128}' {
433
0
                return Some(FullMappingResult::String(Self::I_DOT_TILDE));
434
0
            }
435
0
        }
436
437
0
        if locale == CaseMapLocale::Turkish {
438
0
            if c == '\u{130}' {
439
                // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
440
0
                return Some(FullMappingResult::CodePoint('i'));
441
0
            } else if c == '\u{307}' && context.preceded_by_capital_i::<IS_TITLE_CONTEXT>(self) {
442
                // When lowercasing, remove dot_above in the sequence I + dot_above,
443
                // which will turn into i. This matches the behaviour of the
444
                // canonically equivalent I-dot_above.
445
                //
446
                // In a titlecase context, we do not want to apply this behavior to cases where the I
447
                // was at the beginning of the string, as that I and its marks should be handled by the
448
                // uppercasing rules (which ignore it, see below)
449
450
0
                return Some(FullMappingResult::Remove);
451
0
            } else if c == 'I' && !context.followed_by_dot_above(self) {
452
                // When lowercasing, unless an I is before a dot_above, it turns
453
                // into a dotless i.
454
0
                return Some(FullMappingResult::CodePoint('\u{131}'));
455
0
            }
456
0
        }
457
458
0
        if c == '\u{130}' {
459
            // Preserve canonical equivalence for I with dot. Turkic is handled above.
460
0
            return Some(FullMappingResult::String(Self::I_DOT));
461
0
        }
462
463
0
        if c == '\u{3a3}'
464
0
            && context.preceded_by_cased_letter(self)
465
0
            && !context.followed_by_cased_letter(self)
466
        {
467
            // Greek capital sigman maps depending on surrounding cased letters.
468
0
            return Some(FullMappingResult::CodePoint('\u{3c2}'));
469
0
        }
470
471
        // No relevant special case mapping. Use a normal mapping.
472
0
        None
473
0
    }
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_lower_special_case::<false>
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_lower_special_case::<true>
474
475
0
    fn full_upper_or_title_special_case<const IS_TITLE_CONTEXT: bool>(
476
0
        &self,
477
0
        c: char,
478
0
        context: ContextIterator,
479
0
        locale: CaseMapLocale,
480
0
    ) -> Option<FullMappingResult> {
481
0
        if locale == CaseMapLocale::Turkish && c == 'i' {
482
            // In Turkic languages, i turns into a dotted capital I.
483
0
            return Some(FullMappingResult::CodePoint('\u{130}'));
484
0
        }
485
0
        if locale == CaseMapLocale::Lithuanian
486
0
            && c == '\u{307}'
487
0
            && context.preceded_by_soft_dotted(self)
488
        {
489
            // Lithuanian retains the dot in a lowercase i when followed by accents.
490
            // Remove dot_above after i with upper or titlecase.
491
0
            return Some(FullMappingResult::Remove);
492
0
        }
493
        // ICU4C's non-standard extension for Armenian ligature ech-yiwn.
494
0
        if c == '\u{587}' {
495
0
            return match (locale, IS_TITLE_CONTEXT) {
496
0
                (CaseMapLocale::Armenian, false) => Some(FullMappingResult::String("ԵՎ")),
497
0
                (CaseMapLocale::Armenian, true) => Some(FullMappingResult::String("Եվ")),
498
0
                (_, false) => Some(FullMappingResult::String("ԵՒ")),
499
0
                (_, true) => Some(FullMappingResult::String("Եւ")),
500
            };
501
0
        }
502
0
        None
503
0
    }
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_upper_or_title_special_case::<false>
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_upper_or_title_special_case::<true>
504
505
0
    fn full_fold_special_case(
506
0
        &self,
507
0
        c: char,
508
0
        _context: ContextIterator,
509
0
        locale: CaseMapLocale,
510
0
    ) -> Option<FullMappingResult> {
511
0
        let is_turkic = locale == CaseMapLocale::Turkish;
512
0
        match (c, is_turkic) {
513
            // Turkic mappings
514
0
            ('\u{49}', true) => Some(FullMappingResult::CodePoint('\u{131}')),
515
0
            ('\u{130}', true) => Some(FullMappingResult::CodePoint('\u{69}')),
516
517
            // Default mappings
518
0
            ('\u{49}', false) => Some(FullMappingResult::CodePoint('\u{69}')),
519
0
            ('\u{130}', false) => Some(FullMappingResult::String(Self::I_DOT)),
520
0
            (_, _) => None,
521
        }
522
0
    }
523
    /// IS_TITLE_CONTEXT is true iff the mapping is MappingKind::Title, primarily exists
524
    /// to avoid perf impacts on other more common modes of operation
525
    ///
526
    /// titlecase_tail_casing is only read in IS_TITLE_CONTEXT
527
0
    pub(crate) fn full_helper_writeable<'a: 'data, const IS_TITLE_CONTEXT: bool>(
528
0
        &'a self,
529
0
        src: &'a str,
530
0
        locale: CaseMapLocale,
531
0
        mapping: MappingKind,
532
0
        titlecase_tail_casing: TrailingCase,
533
0
    ) -> FullCaseWriteable<'a, IS_TITLE_CONTEXT> {
534
        // Ensure that they are either both true or both false, i.e. an XNOR operation
535
0
        debug_assert!(!(IS_TITLE_CONTEXT ^ (mapping == MappingKind::Title)));
536
537
0
        FullCaseWriteable::<IS_TITLE_CONTEXT> {
538
0
            data: self,
539
0
            src,
540
0
            locale,
541
0
            mapping,
542
0
            titlecase_tail_casing,
543
0
        }
544
0
    }
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_helper_writeable::<false>
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::full_helper_writeable::<true>
545
546
    /// Adds all simple case mappings and the full case folding for `c` to `set`.
547
    /// Also adds special case closure mappings.
548
    /// The character itself is not added.
549
    /// For example, the mappings
550
    /// - for s include long s
551
    /// - for sharp s include ss
552
    /// - for k include the Kelvin sign
553
0
    pub(crate) fn add_case_closure_to<S: ClosureSink>(&self, c: char, set: &mut S) {
554
        // Hardcode the case closure of i and its relatives and ignore the
555
        // data file data for these characters.
556
        // The Turkic dotless i and dotted I with their case mapping conditions
557
        // and case folding option make the related characters behave specially.
558
        // This code matches their closure behavior to their case folding behavior.
559
0
        match c {
560
            // Regular i and I are in one equivalence class.
561
            '\u{49}' => {
562
0
                set.add_char('\u{69}');
563
0
                return;
564
            }
565
            '\u{69}' => {
566
0
                set.add_char('\u{49}');
567
0
                return;
568
            }
569
570
            // Dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>)
571
            '\u{130}' => {
572
0
                set.add_string(Self::I_DOT);
573
0
                return;
574
            }
575
576
            // Dotless i is in a class by itself
577
            '\u{131}' => {
578
0
                return;
579
            }
580
581
0
            _ => {}
582
        }
583
584
0
        let data = self.lookup_data(c);
585
0
        if !data.has_exception() {
586
0
            if data.case_type().is_some() {
587
0
                let delta = data.delta() as i32;
588
0
                if delta != 0 {
589
0
                    // Add the one simple case mapping, no matter what type it is.
590
0
                    let codepoint = c as i32 + delta;
591
0
                    // GIGO: delta should be valid
592
0
                    let mapped = char::from_u32(codepoint as u32).unwrap_or(c);
593
0
                    set.add_char(mapped);
594
0
                }
595
0
            }
596
0
            return;
597
0
        }
598
599
        // c has exceptions, so there may be multiple simple and/or full case mappings.
600
0
        let idx = data.exception_index();
601
0
        let exception = self.exceptions.get(idx);
602
603
        // Add all simple case mappings.
604
0
        for slot in [
605
0
            ExceptionSlot::Lower,
606
0
            ExceptionSlot::Fold,
607
0
            ExceptionSlot::Upper,
608
0
            ExceptionSlot::Title,
609
        ] {
610
0
            if let Some(simple) = exception.get_char_slot(slot) {
611
0
                set.add_char(simple);
612
0
            }
613
        }
614
0
        if let Some(simple) = exception.get_simple_case_slot_for(c) {
615
0
            set.add_char(simple);
616
0
        }
617
618
0
        exception.add_full_and_closure_mappings(set);
619
0
    }
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::add_case_closure_to::<icu_collections::codepointinvlist::builder::CodePointInversionListBuilder>
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::add_case_closure_to::<_>
620
621
    /// Maps the string to single code points and adds the associated case closure
622
    /// mappings.
623
    ///
624
    /// (see docs on CaseMapper::add_string_case_closure_to)
625
0
    pub(crate) fn add_string_case_closure_to<S: ClosureSink>(
626
0
        &self,
627
0
        s: &str,
628
0
        set: &mut S,
629
0
        unfold_data: &CaseMapUnfoldV1,
630
0
    ) -> bool {
631
0
        if s.chars().count() <= 1 {
632
            // The string is too short to find any match.
633
0
            return false;
634
0
        }
635
0
        match unfold_data.get(s) {
636
0
            Some(closure_string) => {
637
0
                for c in closure_string.chars() {
638
0
                    set.add_char(c);
639
0
                    self.add_case_closure_to(c, set);
640
0
                }
641
0
                true
642
            }
643
0
            None => false,
644
        }
645
0
    }
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::add_string_case_closure_to::<icu_collections::codepointinvlist::builder::CodePointInversionListBuilder>
Unexecuted instantiation: <icu_casemap::provider::CaseMapV1>::add_string_case_closure_to::<_>
646
}
647
648
// An internal representation of locale. Non-Root values of this
649
// enumeration imply that hard-coded special cases exist for this
650
// language.
651
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
652
pub enum CaseMapLocale {
653
    Root,
654
    Turkish,
655
    Lithuanian,
656
    Greek,
657
    Dutch,
658
    Armenian,
659
}
660
661
impl CaseMapLocale {
662
0
    pub const fn from_langid(langid: &LanguageIdentifier) -> Self {
663
        use icu_locid::subtags::{language, Language};
664
        const TR: Language = language!("tr");
665
        const AZ: Language = language!("az");
666
        const LT: Language = language!("lt");
667
        const EL: Language = language!("el");
668
        const NL: Language = language!("nl");
669
        const HY: Language = language!("hy");
670
0
        match langid.language {
671
0
            TR | AZ => Self::Turkish,
672
0
            LT => Self::Lithuanian,
673
0
            EL => Self::Greek,
674
0
            NL => Self::Dutch,
675
0
            HY => Self::Armenian,
676
0
            _ => Self::Root,
677
        }
678
0
    }
679
}
680
681
pub enum FullMappingResult<'a> {
682
    Remove,
683
    CodePoint(char),
684
    String(&'a str),
685
}
686
687
impl<'a> FullMappingResult<'a> {
688
    #[allow(dead_code)]
689
0
    fn add_to_set<S: ClosureSink>(&self, set: &mut S) {
690
0
        match *self {
691
0
            FullMappingResult::CodePoint(c) => set.add_char(c),
692
0
            FullMappingResult::String(s) => set.add_string(s),
693
0
            FullMappingResult::Remove => {}
694
        }
695
0
    }
696
}
697
698
impl Writeable for FullMappingResult<'_> {
699
0
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
700
0
        match *self {
701
0
            FullMappingResult::CodePoint(c) => sink.write_char(c),
702
0
            FullMappingResult::String(s) => sink.write_str(s),
703
0
            FullMappingResult::Remove => Ok(()),
704
        }
705
0
    }
Unexecuted instantiation: <icu_casemap::internals::FullMappingResult as writeable::Writeable>::write_to::<diplomat_runtime::writeable::DiplomatWriteable>
Unexecuted instantiation: <icu_casemap::internals::FullMappingResult as writeable::Writeable>::write_to::<alloc::string::String>
706
}
707
708
pub(crate) struct ContextIterator<'a> {
709
    before: &'a str,
710
    after: &'a str,
711
}
712
713
impl<'a> ContextIterator<'a> {
714
    // Returns a context iterator with the characters before
715
    // and after the character at a given index, given the preceding
716
    // string and the succeeding string including the character itself
717
0
    pub fn new(before: &'a str, char_and_after: &'a str) -> Self {
718
0
        let mut char_and_after = char_and_after.chars();
719
0
        char_and_after.next(); // skip the character itself
720
0
        let after = char_and_after.as_str();
721
0
        Self { before, after }
722
0
    }
723
724
0
    fn add_greek_diacritics(&self, mut diacritics: GreekDiacritics) -> GreekDiacritics {
725
0
        diacritics.consume_greek_diacritics(self.after);
726
0
        diacritics
727
0
    }
728
729
0
    fn preceded_by_greek_letter(&self) -> bool {
730
0
        greek_to_me::preceded_by_greek_letter(self.before)
731
0
    }
732
733
0
    fn preceding_greek_vowel_diacritics(
734
0
        &self,
735
0
    ) -> Option<GreekCombiningCharacterSequenceDiacritics> {
736
0
        greek_to_me::preceding_greek_vowel_diacritics(self.before)
737
0
    }
738
739
0
    fn preceded_by_soft_dotted(&self, mapping: &CaseMapV1) -> bool {
740
0
        for c in self.before.chars().rev() {
741
0
            match mapping.dot_type(c) {
742
0
                DotType::SoftDotted => return true,
743
0
                DotType::OtherAccent => continue,
744
0
                _ => return false,
745
            }
746
        }
747
0
        false
748
0
    }
749
    /// Checks if the preceding character is a capital I, allowing for non-Above combining characters in between.
750
    ///
751
    /// If I_MUST_NOT_START_STRING is true, additionally will require that the capital I does not start the string
752
0
    fn preceded_by_capital_i<const I_MUST_NOT_START_STRING: bool>(
753
0
        &self,
754
0
        mapping: &CaseMapV1,
755
0
    ) -> bool {
756
0
        let mut iter = self.before.chars().rev();
757
0
        while let Some(c) = iter.next() {
758
0
            if c == 'I' {
759
0
                if I_MUST_NOT_START_STRING {
760
0
                    return iter.next().is_some();
761
                } else {
762
0
                    return true;
763
                }
764
0
            }
765
0
            if mapping.dot_type(c) != DotType::OtherAccent {
766
0
                break;
767
0
            }
768
        }
769
0
        false
770
0
    }
Unexecuted instantiation: <icu_casemap::internals::ContextIterator>::preceded_by_capital_i::<false>
Unexecuted instantiation: <icu_casemap::internals::ContextIterator>::preceded_by_capital_i::<true>
771
0
    fn preceded_by_cased_letter(&self, mapping: &CaseMapV1) -> bool {
772
0
        for c in self.before.chars().rev() {
773
0
            let data = mapping.lookup_data(c);
774
0
            if !data.is_ignorable() {
775
0
                return data.case_type().is_some();
776
0
            }
777
        }
778
0
        false
779
0
    }
780
0
    fn followed_by_cased_letter(&self, mapping: &CaseMapV1) -> bool {
781
0
        for c in self.after.chars() {
782
0
            let data = mapping.lookup_data(c);
783
0
            if !data.is_ignorable() {
784
0
                return data.case_type().is_some();
785
0
            }
786
        }
787
0
        false
788
0
    }
789
0
    fn followed_by_more_above(&self, mapping: &CaseMapV1) -> bool {
790
0
        for c in self.after.chars() {
791
0
            match mapping.dot_type(c) {
792
0
                DotType::Above => return true,
793
0
                DotType::OtherAccent => continue,
794
0
                _ => return false,
795
            }
796
        }
797
0
        false
798
0
    }
799
0
    fn followed_by_dot_above(&self, mapping: &CaseMapV1) -> bool {
800
0
        for c in self.after.chars() {
801
0
            if c == '\u{307}' {
802
0
                return true;
803
0
            }
804
0
            if mapping.dot_type(c) != DotType::OtherAccent {
805
0
                return false;
806
0
            }
807
        }
808
0
        false
809
0
    }
810
811
    /// Checks the preceding and surrounding context of a j or J
812
    /// and returns true if it is preceded by an i or I at the start of the string.
813
    /// If one has an acute accent,
814
    /// both must have the accent for this to return true. No other accents are handled.
815
0
    fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMapV1) -> bool {
816
0
        let mut before = self.before.chars().rev();
817
0
        let mut i_has_acute = false;
818
        loop {
819
0
            match before.next() {
820
0
                Some('i') | Some('I') => break,
821
                Some('í') | Some('Í') => {
822
0
                    i_has_acute = true;
823
0
                    break;
824
                }
825
0
                Some(ACUTE) => i_has_acute = true,
826
0
                _ => return false,
827
            }
828
        }
829
830
0
        if before.next().is_some() {
831
            // not at the beginning of a string, doesn't matter
832
0
            return false;
833
0
        }
834
0
        let mut j_has_acute = false;
835
0
        for c in self.after.chars() {
836
0
            if c == ACUTE {
837
0
                j_has_acute = true;
838
0
                continue;
839
0
            }
840
            // We are supposed to check that `j` has no other combining marks aside
841
            // from potentially an acute accent. Once we hit the first non-combining mark
842
            // we are done.
843
            //
844
            // ICU4C checks for `gc=Mn` to determine if something is a combining mark,
845
            // however this requires extra data (and is the *only* point in the casemapping algorithm
846
            // where there is a direct dependency on properties data not mediated by the casemapping data trie).
847
            //
848
            // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
849
            //
850
            // See https://unicode-org.atlassian.net/browse/ICU-22429
851
0
            match mapping.dot_type(c) {
852
                // Not a combining character; ccc = 0
853
0
                DotType::NoDot | DotType::SoftDotted => break,
854
                // found combining character, bail
855
0
                _ => return false,
856
            }
857
        }
858
859
        // either both should have an acute accent, or none. this is an XNOR operation
860
0
        !(j_has_acute ^ i_has_acute)
861
0
    }
862
}