/rust/registry/src/index.crates.io-1949cf8c6b5b557f/icu_casemap-1.5.1/src/internals.rs

Source
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! This module contains most of the actual algorithms for case mapping.
//!
//! Primarily, it implements methods on `CaseMapV1`, which contains the data model.

use crate::greek_to_me::{
    self, GreekCombiningCharacterSequenceDiacritics, GreekDiacritics, GreekPrecomposedLetterData,
    GreekVowel,
};
use crate::provider::data::{DotType, MappingKind};
use crate::provider::exception_helpers::ExceptionSlot;
use crate::provider::{CaseMapUnfoldV1, CaseMapV1};
use crate::set::ClosureSink;
use crate::titlecase::TrailingCase;
use core::fmt;
use icu_locid::LanguageIdentifier;
use writeable::Writeable;

// U+0301 COMBINING ACUTE ACCENT; compared against in the Dutch IJ titlecasing check.
const ACUTE: char = '\u{301}';

/// Options that control the behavior of `CaseMapper::fold`.
///
/// At present the only setting decides whether the Turkic (T) mappings are used for
/// dotted/dotless i. The `Default` value leaves that setting off.
#[derive(Copy, Clone, Default)]
pub struct FoldOptions {
    // `true` requests the Turkic mappings in `simple_fold_special_case`.
    exclude_special_i: bool,
}

impl FoldOptions {
    /// Builds options that request the Turkic mappings for dotted/dotless i.
    pub fn with_turkic_mappings() -> Self {
        let exclude_special_i = true;
        FoldOptions { exclude_special_i }
    }
}

/// Helper type that emits a fixed string prefix followed by a wrapped writeable
pub(crate) struct StringAndWriteable<'a, W> {
    /// Prefix written first
    pub string: &'a str,
    /// Value written right after the prefix
    pub writeable: W,
}

impl<'a, Wr: Writeable> Writeable for StringAndWriteable<'a, Wr> {
    /// Writes the prefix, then delegates to the wrapped writeable.
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        let Self {
            string: prefix,
            writeable: inner,
        } = self;
        sink.write_str(prefix)?;
        inner.write_to(sink)
    }
    /// Exact prefix length plus whatever the wrapped writeable reports.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        let prefix_hint = writeable::LengthHint::exact(self.string.len());
        let inner_hint = self.writeable.writeable_length_hint();
        prefix_hint + inner_hint
    }
}

/// Lazily case-maps `src` when written: its `Writeable` impl feeds each character
/// through `CaseMapV1::full_helper`.
///
/// `IS_TITLE_CONTEXT` selects the titlecasing flavour of the write loop.
pub(crate) struct FullCaseWriteable<'a, const IS_TITLE_CONTEXT: bool> {
    // Case mapping data used for every per-character lookup
    data: &'a CaseMapV1<'a>,
    // The string being mapped
    src: &'a str,
    // Locale whose hard-coded special cases apply
    locale: CaseMapLocale,
    // Mapping kind applied to the first character (the write loop may switch to Lower afterwards
    // in a title context)
    mapping: MappingKind,
    // How characters after the first are treated in a title context
    titlecase_tail_casing: TrailingCase,
}

impl<'a, const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'a, IS_TITLE_CONTEXT> {
    /// Maps `src` one character at a time, handing each character its surrounding context.
    ///
    /// In a title context only the first character uses the initial mapping: the rest is
    /// either lowercased (`TrailingCase::Lower`) or copied through untouched.
    #[allow(clippy::indexing_slicing)] // offsets come from char_indices() on the same string
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        let text = self.src;
        let mut kind = self.mapping;
        let mut pending = text.char_indices();
        for (offset, ch) in pending.by_ref() {
            let context = ContextIterator::new(&text[..offset], &text[offset..]);
            self.data
                .full_helper::<IS_TITLE_CONTEXT, W>(ch, context, self.locale, kind, sink)?;
            if !IS_TITLE_CONTEXT {
                continue;
            }
            if self.titlecase_tail_casing != TrailingCase::Lower {
                // The tail is not mapped; stop after the first character.
                break;
            }
            kind = MappingKind::Lower;
        }
        // Copy whatever was left unmapped verbatim
        if IS_TITLE_CONTEXT && self.titlecase_tail_casing == TrailingCase::Unchanged {
            sink.write_str(pending.as_str())?;
        }
        Ok(())
    }
    // NOTE(review): mappings such as `FullMappingResult::Remove` can make the output shorter
    // than `src`, so this lower bound looks optimistic — confirm against the `Writeable`
    // contract before relying on it as a strict bound.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        let source_len = self.src.len();
        writeable::LengthHint::at_least(source_len)
    }
}

impl<'data> CaseMapV1<'data> {
    /// Shared implementation of the simple (single code point) mappings.
    ///
    /// Characters without exception data are shifted by their stored delta when the data is
    /// relevant to `kind`; characters with exception data consult the exception slots.
    /// Falls back to `c` whenever no mapping applies.
    fn simple_helper(&self, c: char, kind: MappingKind) -> char {
        let data = self.lookup_data(c);

        if data.has_exception() {
            let exception = self.exceptions.get(data.exception_index());
            if data.is_relevant_to(kind) {
                if let Some(simple) = exception.get_simple_case_slot_for(c) {
                    return simple;
                }
            }
            return exception.slot_char_for_kind(kind).unwrap_or(c);
        }

        if !data.is_relevant_to(kind) {
            return c;
        }
        let shifted = c as i32 + data.delta() as i32;
        // GIGO: delta should be valid
        char::from_u32(shifted as u32).unwrap_or(c)
    }

    /// Returns the simple lowercase mapping of the given `char`
    /// (delegates to `simple_helper` with `MappingKind::Lower`).
    #[inline]
    pub(crate) fn simple_lower(&self, c: char) -> char {
        self.simple_helper(c, MappingKind::Lower)
    }

    /// Returns the simple uppercase mapping of the given `char`
    /// (delegates to `simple_helper` with `MappingKind::Upper`).
    #[inline]
    pub(crate) fn simple_upper(&self, c: char) -> char {
        self.simple_helper(c, MappingKind::Upper)
    }

    /// Returns the simple titlecase mapping of the given `char`
    /// (delegates to `simple_helper` with `MappingKind::Title`).
    #[inline]
    pub(crate) fn simple_title(&self, c: char) -> char {
        self.simple_helper(c, MappingKind::Title)
    }

    /// Returns the simple case folding mapping of the given `char`.
    ///
    /// Characters without exception data fold by their stored delta when they are upper- or
    /// titlecase. Characters with exception data are resolved in this order:
    /// 1. a conditional fold is handled by `simple_fold_special_case` (driven by `options`),
    /// 2. a "no simple case folding" flag returns `c` unchanged,
    /// 3. an upper/titlecase character with a Delta slot uses its simple case slot,
    /// 4. otherwise the Fold slot character, if any,
    /// 5. otherwise `c` itself.
    #[inline]
    pub(crate) fn simple_fold(&self, c: char, options: FoldOptions) -> char {
        let data = self.lookup_data(c);
        if !data.has_exception() {
            if data.is_upper_or_title() {
                let folded = c as i32 + data.delta() as i32;
                // GIGO: delta should be valid
                char::from_u32(folded as u32).unwrap_or(c)
            } else {
                c
            }
        } else {
            // TODO: if we move conditional fold and no_simple_case_folding into
            // simple_helper, this function can just call simple_helper.
            let idx = data.exception_index();
            let exception = self.exceptions.get(idx);
            if exception.bits.has_conditional_fold() {
                self.simple_fold_special_case(c, options)
            } else if exception.bits.no_simple_case_folding() {
                c
            } else if data.is_upper_or_title() && exception.has_slot(ExceptionSlot::Delta) {
                // The Delta slot was just confirmed, so a miss should never happen. If the data
                // is broken anyway, return `c` untouched (like the delta path above) rather than
                // silently replacing the input with NUL.
                exception.get_simple_case_slot_for(c).unwrap_or(c)
            } else if let Some(slot_char) = exception.slot_char_for_kind(MappingKind::Fold) {
                slot_char
            } else {
                c
            }
        }
    }

    /// Returns the dot type of `c`, read from the exception entry when `c` has one and
    /// from the trie data otherwise.
    fn dot_type(&self, c: char) -> DotType {
        let data = self.lookup_data(c);
        if data.has_exception() {
            let exception = self.exceptions.get(data.exception_index());
            return exception.bits.dot_type();
        }
        data.dot_type()
    }

    // Returns true if this code point is case-sensitive, reading the flag from the
    // exception entry when there is one and from the trie data otherwise.
    // This is not currently exposed.
    #[allow(dead_code)]
    fn is_case_sensitive(&self, c: char) -> bool {
        let data = self.lookup_data(c);
        if data.has_exception() {
            let exception = self.exceptions.get(data.exception_index());
            return exception.bits.is_sensitive();
        }
        data.is_sensitive()
    }

    /// Returns whether the character is cased, i.e. whether its data carries a case type
    pub(crate) fn is_cased(&self, c: char) -> bool {
        let case_type = self.lookup_data(c).case_type();
        case_type.is_some()
    }

    #[inline(always)]
    // Writes the full case mapping of `c` for mapping `kind` to `sink`, taking the
    // surrounding `context` and the `locale` into account.
    //
    // The steps are tried in this order:
    // 1. the Dutch IJ titlecasing extension,
    // 2. the Greek uppercasing extension,
    // 3. the data-driven mapping: a plain delta for characters without exception data,
    //    otherwise conditional special cases, the full-mapping slot, the
    //    "no simple case folding" flag, the simple case slot, the per-kind character slot,
    //    and finally `c` unchanged.
    //
    // IS_TITLE_CONTEXT must be true if kind is MappingKind::Title
    // The kind may be a different kind with IS_TITLE_CONTEXT still true because
    // titlecasing a segment involves switching to lowercase later
    fn full_helper<const IS_TITLE_CONTEXT: bool, W: fmt::Write + ?Sized>(
        &self,
        c: char,
        context: ContextIterator,
        locale: CaseMapLocale,
        kind: MappingKind,
        sink: &mut W,
    ) -> fmt::Result {
        // If using a title mapping IS_TITLE_CONTEXT must be true
        debug_assert!(kind != MappingKind::Title || IS_TITLE_CONTEXT);
        // In a title context, kind MUST be Title or Lower
        debug_assert!(
            !IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower
        );

        // ICU4C's non-standard extension for Dutch IJ titlecasing
        // handled here instead of in full_lower_special_case because J does not have conditional
        // special casemapping.
        if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower {
            // When titlecasing, a J found immediately after an I at the beginning of the segment
            // should also uppercase. They are both allowed to have an acute accent but it must
            // be present on both letters or neither. They may not have any other combining marks.
            if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) {
                return sink.write_char('J');
            }
        }

        // ICU4C's non-standard extension for Greek uppercasing:
        // https://icu.unicode.org/design/case/greek-upper.
        // Effectively removes Greek accents from Greek vowels during uppercasing,
        // whilst attempting to preserve additional marks like the dialytika (diæresis)
        // and ypogegrammeni (combining small iota).
        if !IS_TITLE_CONTEXT && locale == CaseMapLocale::Greek && kind == MappingKind::Upper {
            // Remove all combining diacritics on a Greek letter.
            // Ypogegrammeni is not an accent mark and is handled by regular casemapping (it turns into
            // a capital iota).
            // The dialytika is removed here, but it may be added again when the base letter is being processed.
            if greek_to_me::is_greek_diacritic_except_ypogegrammeni(c)
                && context.preceded_by_greek_letter()
            {
                // Writing nothing drops the diacritic from the output.
                return Ok(());
            }
            let data = greek_to_me::get_data(c);
            // Check if the character is a Greek vowel
            match data {
                Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) => {
                    // Get the diacritics on the character itself, and add any further combining diacritics
                    // from the context.
                    let mut diacritics = context.add_greek_diacritics(precomposed_diacritics);
                    // If the previous vowel had an accent (which would be removed) but no dialytika,
                    // and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate
                    // the now-unaccented adjacent vowels from a digraph/diphthong.
                    // Use a precomposed dialytika if the accent was precomposed, and a combining dialytika
                    // if the accent was combining, so as to map NFD to NFD and NFC to NFC.
                    if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ)
                    {
                        if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() {
                            if !preceding_vowel.combining.dialytika
                                && !preceding_vowel.precomposed.dialytika
                            {
                                if preceding_vowel.combining.accented {
                                    diacritics.dialytika = true;
                                } else {
                                    precomposed_diacritics.dialytika =
                                        preceding_vowel.precomposed.accented;
                                }
                            }
                        }
                    }
                    // Write the base of the uppercased combining character sequence.
                    // In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed.
                    // In some branches the base has a precomposed diacritic.
                    // In the case of the Greek disjunctive "or", a combining tonos may also be written.
                    match vowel {
                        GreekVowel::Η => {
                            // The letter η (eta) is allowed to retain a tonos when it forms a single-letter word, to distinguish
                            // the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή).
                            //
                            // A lone η with an accent other than the oxia/tonos is not expected,
                            // so there is no need to special-case the oxia/tonos.
                            // The ancient ᾖ (exist.PRS.SUBJ.3s) has a iota subscript as well as the circumflex,
                            // so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle
                            // (e.g. να είναι) since Byzantine times anyway.
                            if diacritics.accented
                                && !context.followed_by_cased_letter(self)
                                && !context.preceded_by_cased_letter(self)
                                && !diacritics.ypogegrammeni
                            {
                                if precomposed_diacritics.accented {
                                    sink.write_char('Ή')?;
                                } else {
                                    sink.write_char('Η')?;
                                    sink.write_char(greek_to_me::TONOS)?;
                                }
                            } else {
                                sink.write_char('Η')?;
                            }
                        }
                        // For iota and upsilon a precomposed dialytika is written as part of the base
                        // letter, so the combining one below must not be written as well.
                        GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika {
                            diacritics.dialytika = false;
                            'Ϊ'
                        } else {
                            vowel.into()
                        })?,
                        GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika {
                            diacritics.dialytika = false;
                            'Ϋ'
                        } else {
                            vowel.into()
                        })?,
                        _ => sink.write_char(vowel.into())?,
                    };
                    if diacritics.dialytika {
                        sink.write_char(greek_to_me::DIALYTIKA)?;
                    }
                    // A precomposed ypogegrammeni is written out as a capital iota after the base.
                    if precomposed_diacritics.ypogegrammeni {
                        sink.write_char('Ι')?;
                    }

                    return Ok(());
                }
                // Rho might have breathing marks, we handle it specially
                // to remove them
                Some(GreekPrecomposedLetterData::Consonant(true)) => {
                    sink.write_char(greek_to_me::CAPITAL_RHO)?;
                    return Ok(());
                }
                _ => (),
            }
        }

        // Data-driven mapping from here on.
        let data = self.lookup_data(c);
        if !data.has_exception() {
            // No exception data: the mapping, if any, is a plain code point delta.
            if data.is_relevant_to(kind) {
                let mapped = c as i32 + data.delta() as i32;
                // GIGO: delta should be valid
                let mapped = char::from_u32(mapped as u32).unwrap_or(c);
                sink.write_char(mapped)
            } else {
                sink.write_char(c)
            }
        } else {
            let idx = data.exception_index();
            let exception = self.exceptions.get(idx);
            // Context- and locale-dependent special cases take priority over the stored slots.
            if exception.bits.has_conditional_special() {
                if let Some(special) = match kind {
                    MappingKind::Lower => {
                        self.full_lower_special_case::<IS_TITLE_CONTEXT>(c, context, locale)
                    }
                    MappingKind::Fold => self.full_fold_special_case(c, context, locale),
                    MappingKind::Upper | MappingKind::Title => self
                        .full_upper_or_title_special_case::<IS_TITLE_CONTEXT>(c, context, locale),
                } {
                    return special.write_to(sink);
                }
            }
            // Next, a non-empty full (string) mapping for this kind.
            if let Some(mapped_string) = exception.get_fullmappings_slot_for_kind(kind) {
                if !mapped_string.is_empty() {
                    return sink.write_str(mapped_string);
                }
            }

            // A character flagged as having no simple case folding folds to itself.
            if kind == MappingKind::Fold && exception.bits.no_simple_case_folding() {
                return sink.write_char(c);
            }

            if data.is_relevant_to(kind) {
                if let Some(simple) = exception.get_simple_case_slot_for(c) {
                    return sink.write_char(simple);
                }
            }

            // Last resort: the per-kind character slot, or the character unchanged.
            if let Some(slot_char) = exception.slot_char_for_kind(kind) {
                sink.write_char(slot_char)
            } else {
                sink.write_char(c)
            }
        }
    }

    // These constants are used for hardcoded locale-specific mappings
    // (lowercasing, folding and case closure of the i/j family).
    // U+0307 is COMBINING DOT ABOVE.
    // i + dot above
    const I_DOT: &'static str = "\u{69}\u{307}";
    // j + dot above
    const J_DOT: &'static str = "\u{6a}\u{307}";
    // i with ogonek (U+012F) + dot above
    const I_OGONEK_DOT: &'static str = "\u{12f}\u{307}";
    // i + dot above + grave (U+0300)
    const I_DOT_GRAVE: &'static str = "\u{69}\u{307}\u{300}";
    // i + dot above + acute (U+0301)
    const I_DOT_ACUTE: &'static str = "\u{69}\u{307}\u{301}";
    // i + dot above + tilde (U+0303)
    const I_DOT_TILDE: &'static str = "\u{69}\u{307}\u{303}";

    // Hardcoded simple case folding for uppercase I (U+0049) and dotted uppercase I (U+0130).
    // Turkic languages (tr, az) can use the Turkic (T) mappings in place of the default ones;
    // `options` selects which set applies. Other languages normally use the default mappings.
    fn simple_fold_special_case(&self, c: char, options: FoldOptions) -> char {
        debug_assert!(c == '\u{49}' || c == '\u{130}');
        if options.exclude_special_i {
            // Turkic mappings
            match c {
                // 0049; T; 0131; # LATIN CAPITAL LETTER I
                '\u{49}' => '\u{131}',
                // 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
                '\u{130}' => '\u{69}',
                other => other,
            }
        } else if c == '\u{49}' {
            // Default mapping: 0049; C; 0069; # LATIN CAPITAL LETTER I
            '\u{69}'
        } else {
            // There is no simple case folding for U+130.
            c
        }
    }

    /// Context- and locale-dependent lowercase mappings.
    ///
    /// Returns `None` when no special case applies and the regular mapping should be used.
    fn full_lower_special_case<const IS_TITLE_CONTEXT: bool>(
        &self,
        c: char,
        context: ContextIterator,
        locale: CaseMapLocale,
    ) -> Option<FullMappingResult> {
        match locale {
            CaseMapLocale::Lithuanian => {
                // Lithuanian retains the dot in a lowercase i when followed by accents.
                // An explicit dot above is introduced when lowercasing capital I, J and
                // I-with-ogonek whenever more accents above follow (grave, acute and tilde
                // above are the ones used in Lithuanian). The precomposed forms already
                // carry their accent, so they need no look at the context.
                let dotted = match c {
                    'I' if context.followed_by_more_above(self) => Some(Self::I_DOT),
                    'J' if context.followed_by_more_above(self) => Some(Self::J_DOT),
                    '\u{12e}' if context.followed_by_more_above(self) => {
                        Some(Self::I_OGONEK_DOT)
                    }
                    '\u{cc}' => Some(Self::I_DOT_GRAVE),
                    '\u{cd}' => Some(Self::I_DOT_ACUTE),
                    '\u{128}' => Some(Self::I_DOT_TILDE),
                    _ => None,
                };
                if let Some(mapped) = dotted {
                    return Some(FullMappingResult::String(mapped));
                }
            }
            CaseMapLocale::Turkish => match c {
                // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                '\u{130}' => return Some(FullMappingResult::CodePoint('i')),
                // When lowercasing, remove dot_above in the sequence I + dot_above,
                // which will turn into i. This matches the behaviour of the
                // canonically equivalent I-dot_above.
                //
                // In a titlecase context this must not apply when the I was at the beginning
                // of the string, as that I and its marks are handled by the uppercasing rules.
                '\u{307}' if context.preceded_by_capital_i::<IS_TITLE_CONTEXT>(self) => {
                    return Some(FullMappingResult::Remove);
                }
                // When lowercasing, unless an I is before a dot_above, it turns
                // into a dotless i.
                'I' if !context.followed_by_dot_above(self) => {
                    return Some(FullMappingResult::CodePoint('\u{131}'));
                }
                _ => {}
            },
            _ => {}
        }

        match c {
            // Preserve canonical equivalence for I with dot. Turkic is handled above.
            '\u{130}' => Some(FullMappingResult::String(Self::I_DOT)),
            // Greek capital sigma maps depending on surrounding cased letters.
            '\u{3a3}'
                if context.preceded_by_cased_letter(self)
                    && !context.followed_by_cased_letter(self) =>
            {
                Some(FullMappingResult::CodePoint('\u{3c2}'))
            }
            // No relevant special case mapping. Use a normal mapping.
            _ => None,
        }
    }

    /// Context- and locale-dependent uppercase/titlecase mappings.
    ///
    /// Returns `None` when no special case applies and the regular mapping should be used.
    fn full_upper_or_title_special_case<const IS_TITLE_CONTEXT: bool>(
        &self,
        c: char,
        context: ContextIterator,
        locale: CaseMapLocale,
    ) -> Option<FullMappingResult> {
        match c {
            // In Turkic languages, i turns into a dotted capital I.
            'i' if locale == CaseMapLocale::Turkish => {
                Some(FullMappingResult::CodePoint('\u{130}'))
            }
            // Lithuanian retains the dot in a lowercase i when followed by accents.
            // Remove dot_above after i with upper or titlecase.
            '\u{307}'
                if locale == CaseMapLocale::Lithuanian
                    && context.preceded_by_soft_dotted(self) =>
            {
                Some(FullMappingResult::Remove)
            }
            // ICU4C's non-standard extension for Armenian ligature ech-yiwn.
            '\u{587}' => {
                let expansion = match (locale, IS_TITLE_CONTEXT) {
                    (CaseMapLocale::Armenian, false) => "ԵՎ",
                    (CaseMapLocale::Armenian, true) => "Եվ",
                    (_, false) => "ԵՒ",
                    (_, true) => "Եւ",
                };
                Some(FullMappingResult::String(expansion))
            }
            _ => None,
        }
    }

    /// Hardcoded full case folding for uppercase I (U+0049) and dotted uppercase I (U+0130).
    ///
    /// The Turkish locale gets the Turkic mappings, every other locale the default ones.
    /// Any other character yields `None`. The context is not consulted.
    fn full_fold_special_case(
        &self,
        c: char,
        _context: ContextIterator,
        locale: CaseMapLocale,
    ) -> Option<FullMappingResult> {
        let turkic = locale == CaseMapLocale::Turkish;
        let folded = match c {
            '\u{49}' => FullMappingResult::CodePoint(if turkic { '\u{131}' } else { '\u{69}' }),
            '\u{130}' => {
                if turkic {
                    FullMappingResult::CodePoint('\u{69}')
                } else {
                    FullMappingResult::String(Self::I_DOT)
                }
            }
            _ => return None,
        };
        Some(folded)
    }
    /// Wraps `src` in a [`FullCaseWriteable`] that performs the full case mapping when written.
    ///
    /// `IS_TITLE_CONTEXT` must be true exactly when `mapping` is `MappingKind::Title`; it is a
    /// const parameter mainly to keep the other, more common modes free of titlecasing cost.
    ///
    /// `titlecase_tail_casing` is only read when `IS_TITLE_CONTEXT` is true.
    pub(crate) fn full_helper_writeable<'a: 'data, const IS_TITLE_CONTEXT: bool>(
        &'a self,
        src: &'a str,
        locale: CaseMapLocale,
        mapping: MappingKind,
        titlecase_tail_casing: TrailingCase,
    ) -> FullCaseWriteable<'a, IS_TITLE_CONTEXT> {
        // Both must be true or both false, i.e. an XNOR operation
        debug_assert!(!(IS_TITLE_CONTEXT ^ (mapping == MappingKind::Title)));

        let data = self;
        FullCaseWriteable {
            src,
            data,
            mapping,
            locale,
            titlecase_tail_casing,
        }
    }

    /// Adds all simple case mappings and the full case folding for `c` to `set`.
    /// Also adds special case closure mappings.
    /// The character itself is not added.
    /// For example, the mappings
    /// - for s include long s
    /// - for sharp s include ss
    /// - for k include the Kelvin sign
    pub(crate) fn add_case_closure_to<S: ClosureSink>(&self, c: char, set: &mut S) {
        // The closure of i and its relatives is hardcoded and the data file is ignored for them.
        // The Turkic dotless i and dotted I, with their case mapping conditions and case folding
        // option, make these characters behave specially; this matches their closure behavior
        // to their case folding behavior.
        if matches!(c, '\u{49}' | '\u{69}' | '\u{130}' | '\u{131}') {
            match c {
                // Regular i and I are in one equivalence class.
                '\u{49}' => set.add_char('\u{69}'),
                '\u{69}' => set.add_char('\u{49}'),
                // Dotted I is in a class with <0069 0307>
                // (for canonical equivalence with <0049 0307>)
                '\u{130}' => set.add_string(Self::I_DOT),
                // Dotless i is in a class by itself
                _ => {}
            }
            return;
        }

        let data = self.lookup_data(c);
        if data.has_exception() {
            // c has exceptions, so there may be multiple simple and/or full case mappings.
            let exception = self.exceptions.get(data.exception_index());

            // Add all simple case mappings.
            let simple_slots = [
                ExceptionSlot::Lower,
                ExceptionSlot::Fold,
                ExceptionSlot::Upper,
                ExceptionSlot::Title,
            ];
            for slot in simple_slots {
                if let Some(simple) = exception.get_char_slot(slot) {
                    set.add_char(simple);
                }
            }
            if let Some(simple) = exception.get_simple_case_slot_for(c) {
                set.add_char(simple);
            }

            exception.add_full_and_closure_mappings(set);
            return;
        }

        if data.case_type().is_none() {
            return;
        }
        let delta = data.delta() as i32;
        if delta == 0 {
            return;
        }
        // Add the one simple case mapping, no matter what type it is.
        let codepoint = c as i32 + delta;
        // GIGO: delta should be valid
        set.add_char(char::from_u32(codepoint as u32).unwrap_or(c));
    }

    /// Maps the string to single code points and adds the associated case closure
    /// mappings.
    ///
    /// Strings of at most one character never match and return `false` right away.
    /// Otherwise `s` is looked up in `unfold_data`; on a hit, every character of the
    /// resulting closure string is added to `set` together with its own case closure,
    /// and `true` is returned. A miss returns `false`.
    ///
    /// (see docs on CaseMapper::add_string_case_closure_to)
    pub(crate) fn add_string_case_closure_to<S: ClosureSink>(
        &self,
        s: &str,
        set: &mut S,
        unfold_data: &CaseMapUnfoldV1,
    ) -> bool {
        // Only the existence of a second character matters, so stop decoding after it
        // instead of counting every character of the string.
        if s.chars().nth(1).is_none() {
            // The string is too short to find any match.
            return false;
        }
        let Some(closure_string) = unfold_data.get(s) else {
            return false;
        };
        for c in closure_string.chars() {
            set.add_char(c);
            self.add_case_closure_to(c, set);
        }
        true
    }
}

/// An internal representation of locale. Non-Root values of this
/// enumeration imply that hard-coded special cases exist for this
/// language.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum CaseMapLocale {
    /// Any language without hard-coded special cases
    Root,
    /// Turkic behavior; `from_langid` maps both `tr` and `az` here
    Turkish,
    /// `lt`
    Lithuanian,
    /// `el`
    Greek,
    /// `nl`
    Dutch,
    /// `hy`
    Armenian,
}

impl CaseMapLocale {
    /// Picks the special-casing behavior for a language identifier.
    ///
    /// Only the language subtag is inspected; languages without hard-coded special cases
    /// map to `Root`.
    pub const fn from_langid(langid: &LanguageIdentifier) -> Self {
        use icu_locid::subtags::{language, Language};
        const TURKISH: Language = language!("tr");
        const AZERBAIJANI: Language = language!("az");
        const LITHUANIAN: Language = language!("lt");
        const GREEK: Language = language!("el");
        const DUTCH: Language = language!("nl");
        const ARMENIAN: Language = language!("hy");
        match langid.language {
            // Azerbaijani shares the Turkic dotted/dotless i behavior
            TURKISH | AZERBAIJANI => Self::Turkish,
            LITHUANIAN => Self::Lithuanian,
            GREEK => Self::Greek,
            DUTCH => Self::Dutch,
            ARMENIAN => Self::Armenian,
            _ => Self::Root,
        }
    }
}

/// Outcome of a special-case full mapping lookup.
pub enum FullMappingResult<'a> {
    /// The character is dropped: nothing is written
    Remove,
    /// The character maps to a single code point
    CodePoint(char),
    /// The character maps to a string
    String(&'a str),
}

impl<'a> FullMappingResult<'a> {
    /// Adds the mapped code point or string to `set`; a removal adds nothing.
    #[allow(dead_code)]
    fn add_to_set<S: ClosureSink>(&self, set: &mut S) {
        match self {
            Self::Remove => {}
            Self::CodePoint(c) => set.add_char(*c),
            Self::String(s) => set.add_string(s),
        }
    }
}

impl Writeable for FullMappingResult<'_> {
    /// Writes the mapped code point or string to `sink`; a removal writes nothing.
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        match self {
            Self::Remove => Ok(()),
            Self::CodePoint(c) => sink.write_char(*c),
            Self::String(s) => sink.write_str(s),
        }
    }
}

/// The text surrounding the character currently being mapped, used by the
/// context-sensitive special cases.
pub(crate) struct ContextIterator<'a> {
    // Everything before the current character
    before: &'a str,
    // Everything after the current character (the character itself is excluded)
    after: &'a str,
}

impl<'a> ContextIterator<'a> {
    // Builds the context for one character from the string preceding it and the string that
    // starts at it: the character itself is stripped from the front of the latter so that
    // `after` holds only what follows.
    pub fn new(before: &'a str, char_and_after: &'a str) -> Self {
        let after = char_and_after
            .chars()
            .next()
            // drop the character itself
            .and_then(|current| char_and_after.strip_prefix(current))
            .unwrap_or(char_and_after);
        Self { before, after }
    }

    /// Returns `diacritics` after letting it consume Greek diacritics from the text
    /// following the current character.
    fn add_greek_diacritics(&self, mut diacritics: GreekDiacritics) -> GreekDiacritics {
        diacritics.consume_greek_diacritics(self.after);
        diacritics
    }

    /// Delegates to `greek_to_me::preceded_by_greek_letter` on the text before the
    /// current character.
    fn preceded_by_greek_letter(&self) -> bool {
        greek_to_me::preceded_by_greek_letter(self.before)
    }

    /// Delegates to `greek_to_me::preceding_greek_vowel_diacritics` on the text before
    /// the current character.
    fn preceding_greek_vowel_diacritics(
        &self,
    ) -> Option<GreekCombiningCharacterSequenceDiacritics> {
        greek_to_me::preceding_greek_vowel_diacritics(self.before)
    }

    /// Walks backwards over `OtherAccent` characters and reports whether the first character
    /// of any other dot type is soft-dotted.
    fn preceded_by_soft_dotted(&self, mapping: &CaseMapV1) -> bool {
        self.before
            .chars()
            .rev()
            .find_map(|c| match mapping.dot_type(c) {
                DotType::SoftDotted => Some(true),
                DotType::OtherAccent => None,
                _ => Some(false),
            })
            .unwrap_or(false)
    }
    /// Checks if the preceding character is a capital I, allowing for non-Above combining characters in between.
    ///
    /// If I_MUST_NOT_START_STRING is true, the capital I must additionally not be the first
    /// character of the string.
    fn preceded_by_capital_i<const I_MUST_NOT_START_STRING: bool>(
        &self,
        mapping: &CaseMapV1,
    ) -> bool {
        let mut preceding = self.before.chars().rev();
        loop {
            match preceding.next() {
                Some('I') => {
                    return !I_MUST_NOT_START_STRING || preceding.next().is_some();
                }
                // Skip over accents that may sit between the I and the current character
                Some(other) if mapping.dot_type(other) == DotType::OtherAccent => {}
                _ => return false,
            }
        }
    }
    /// Reports whether the nearest non-ignorable character before the current one is cased.
    fn preceded_by_cased_letter(&self, mapping: &CaseMapV1) -> bool {
        self.before
            .chars()
            .rev()
            .find_map(|c| {
                let data = mapping.lookup_data(c);
                (!data.is_ignorable()).then(|| data.case_type().is_some())
            })
            .unwrap_or(false)
    }
    /// Reports whether the nearest non-ignorable character after the current one is cased.
    fn followed_by_cased_letter(&self, mapping: &CaseMapV1) -> bool {
        self.after
            .chars()
            .find_map(|c| {
                let data = mapping.lookup_data(c);
                (!data.is_ignorable()).then(|| data.case_type().is_some())
            })
            .unwrap_or(false)
    }
    /// Walks forwards over `OtherAccent` characters and reports whether the first character
    /// of any other dot type is an accent above.
    fn followed_by_more_above(&self, mapping: &CaseMapV1) -> bool {
        self.after
            .chars()
            .find_map(|c| match mapping.dot_type(c) {
                DotType::Above => Some(true),
                DotType::OtherAccent => None,
                _ => Some(false),
            })
            .unwrap_or(false)
    }
    /// Reports whether a combining dot above (U+0307) follows, with only `OtherAccent`
    /// characters allowed in between.
    fn followed_by_dot_above(&self, mapping: &CaseMapV1) -> bool {
        self.after
            .chars()
            .find_map(|c| {
                if c == '\u{307}' {
                    Some(true)
                } else if mapping.dot_type(c) != DotType::OtherAccent {
                    Some(false)
                } else {
                    None
                }
            })
            .unwrap_or(false)
    }

    /// Checks the context around a j or J and returns true if it is preceded by an i or I
    /// that starts the string.
    /// An acute accent is allowed, but only when both letters carry it; any other combining
    /// mark makes this return false.
    fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMapV1) -> bool {
        let mut preceding = self.before.chars().rev();
        let mut seen_acute = false;
        // Walk back over combining acutes until the i is found
        let i_has_acute = loop {
            match preceding.next() {
                Some('i' | 'I') => break seen_acute,
                Some('í' | 'Í') => break true,
                Some(ACUTE) => seen_acute = true,
                _ => return false,
            }
        };

        if preceding.next().is_some() {
            // not at the beginning of a string, doesn't matter
            return false;
        }

        let mut j_has_acute = false;
        for c in self.after.chars() {
            // The `j` may carry an acute accent but no other combining mark; scanning stops
            // at the first character that is not a combining mark.
            //
            // ICU4C checks for `gc=Mn` to determine if something is a combining mark,
            // however this requires extra data (and is the *only* point in the casemapping algorithm
            // where there is a direct dependency on properties data not mediated by the casemapping data trie).
            //
            // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
            //
            // See https://unicode-org.atlassian.net/browse/ICU-22429
            if c == ACUTE {
                j_has_acute = true;
            } else if matches!(
                mapping.dot_type(c),
                // Not a combining character; ccc = 0
                DotType::NoDot | DotType::SoftDotted
            ) {
                break;
            } else {
                // found another combining character, bail
                return false;
            }
        }

        // either both have an acute accent, or neither does
        j_has_acute == i_has_acute
    }
}

Coverage Report

Created: 2026-02-14 06:39