• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! This module contains most of the actual algorithms for case mapping.
6 //!
7 //! Primarily, it implements methods on `CaseMap`, which contains the data model.
8 
9 use crate::greek_to_me::{
10     self, GreekCombiningCharacterSequenceDiacritics, GreekDiacritics, GreekPrecomposedLetterData,
11     GreekVowel,
12 };
13 use crate::provider::data::{DotType, MappingKind};
14 use crate::provider::exception_helpers::ExceptionSlot;
15 use crate::provider::{CaseMap, CaseMapUnfold};
16 use crate::set::ClosureSink;
17 use crate::titlecase::TrailingCase;
18 use core::fmt;
19 use icu_locale_core::LanguageIdentifier;
20 use writeable::Writeable;
21 
/// U+0301 COMBINING ACUTE ACCENT.
const ACUTE: char = '\u{301}';
23 
/// Options controlling the behavior of `CaseMapper::fold`.
///
/// Currently only used to decide whether to use the Turkic (T) mappings for
/// dotted/dotless i.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub(crate) struct FoldOptions {
    /// When `true`, the Turkic mappings for `I` (U+0049) and `İ` (U+0130)
    /// are used in place of the default ones.
    exclude_special_i: bool,
}

impl FoldOptions {
    /// Returns options that select the Turkic (T) case-folding mappings.
    ///
    /// The `Default` value selects the regular mappings instead.
    #[must_use]
    pub const fn with_turkic_mappings() -> Self {
        Self {
            exclude_special_i: true,
        }
    }
}
38 
39 /// Helper type that wraps a writeable in a prefix string
40 pub(crate) struct StringAndWriteable<'a, W> {
41     pub string: &'a str,
42     pub writeable: W,
43 }
44 
45 impl<Wr: Writeable> Writeable for StringAndWriteable<'_, Wr> {
write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result46     fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
47         sink.write_str(self.string)?;
48         self.writeable.write_to(sink)
49     }
writeable_length_hint(&self) -> writeable::LengthHint50     fn writeable_length_hint(&self) -> writeable::LengthHint {
51         writeable::LengthHint::exact(self.string.len()) + self.writeable.writeable_length_hint()
52     }
53 }
54 
55 pub(crate) struct FullCaseWriteable<'a, const IS_TITLE_CONTEXT: bool> {
56     data: &'a CaseMap<'a>,
57     src: &'a str,
58     locale: CaseMapLocale,
59     mapping: MappingKind,
60     titlecase_tail_casing: TrailingCase,
61 }
62 
63 impl<const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'_, IS_TITLE_CONTEXT> {
64     #[allow(clippy::indexing_slicing)] // last_uncopied_index and i are known to be in bounds
write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result65     fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
66         let src = self.src;
67         let mut mapping = self.mapping;
68         let mut iter = src.char_indices();
69         for (i, c) in &mut iter {
70             let context = ContextIterator::new(&src[..i], &src[i..]);
71             self.data
72                 .full_helper::<IS_TITLE_CONTEXT, W>(c, context, self.locale, mapping, sink)?;
73             if IS_TITLE_CONTEXT {
74                 if self.titlecase_tail_casing == TrailingCase::Lower {
75                     mapping = MappingKind::Lower;
76                 } else {
77                     break;
78                 }
79             }
80         }
81         // Write the rest of the string
82         if IS_TITLE_CONTEXT && self.titlecase_tail_casing == TrailingCase::Unchanged {
83             sink.write_str(iter.as_str())?;
84         }
85         Ok(())
86     }
writeable_length_hint(&self) -> writeable::LengthHint87     fn writeable_length_hint(&self) -> writeable::LengthHint {
88         writeable::LengthHint::at_least(self.src.len())
89     }
90 }
91 
92 impl<'data> CaseMap<'data> {
simple_helper(&self, c: char, kind: MappingKind) -> char93     fn simple_helper(&self, c: char, kind: MappingKind) -> char {
94         let data = self.lookup_data(c);
95         if !data.has_exception() {
96             if data.is_relevant_to(kind) {
97                 let folded = c as i32 + data.delta() as i32;
98                 // GIGO: delta should be valid
99                 char::from_u32(folded as u32).unwrap_or(c)
100             } else {
101                 c
102             }
103         } else {
104             let idx = data.exception_index();
105             let exception = self.exceptions.get(idx);
106             if data.is_relevant_to(kind) {
107                 if let Some(simple) = exception.get_simple_case_slot_for(c) {
108                     return simple;
109                 }
110             }
111             exception.slot_char_for_kind(kind).unwrap_or(c)
112         }
113     }
114 
/// Returns the simple lowercase mapping of the given `char`.
///
/// Delegates to `simple_helper` with `MappingKind::Lower`.
#[inline]
pub(crate) fn simple_lower(&self, c: char) -> char {
    self.simple_helper(c, MappingKind::Lower)
}
120 
/// Returns the simple uppercase mapping of the given `char`.
///
/// Delegates to `simple_helper` with `MappingKind::Upper`.
#[inline]
pub(crate) fn simple_upper(&self, c: char) -> char {
    self.simple_helper(c, MappingKind::Upper)
}
126 
/// Returns the simple titlecase mapping of the given `char`.
///
/// Delegates to `simple_helper` with `MappingKind::Title`.
#[inline]
pub(crate) fn simple_title(&self, c: char) -> char {
    self.simple_helper(c, MappingKind::Title)
}
132 
133     // Return the simple case folding mapping of the given char.
134     #[inline]
simple_fold(&self, c: char, options: FoldOptions) -> char135     pub(crate) fn simple_fold(&self, c: char, options: FoldOptions) -> char {
136         let data = self.lookup_data(c);
137         if !data.has_exception() {
138             if data.is_upper_or_title() {
139                 let folded = c as i32 + data.delta() as i32;
140                 // GIGO: delta should be valid
141                 char::from_u32(folded as u32).unwrap_or(c)
142             } else {
143                 c
144             }
145         } else {
146             // TODO: if we move conditional fold and no_simple_case_folding into
147             // simple_helper, this function can just call simple_helper.
148             let idx = data.exception_index();
149             let exception = self.exceptions.get(idx);
150             if exception.bits.has_conditional_fold() {
151                 self.simple_fold_special_case(c, options)
152             } else if exception.bits.no_simple_case_folding() {
153                 c
154             } else if data.is_upper_or_title() && exception.has_slot(ExceptionSlot::Delta) {
155                 // unwrap_or case should never happen but best to avoid panics
156                 exception.get_simple_case_slot_for(c).unwrap_or('\0')
157             } else if let Some(slot_char) = exception.slot_char_for_kind(MappingKind::Fold) {
158                 slot_char
159             } else {
160                 c
161             }
162         }
163     }
164 
dot_type(&self, c: char) -> DotType165     fn dot_type(&self, c: char) -> DotType {
166         let data = self.lookup_data(c);
167         if !data.has_exception() {
168             data.dot_type()
169         } else {
170             let idx = data.exception_index();
171             self.exceptions.get(idx).bits.dot_type()
172         }
173     }
174 
175     // Returns true if this code point is is case-sensitive.
176     // This is not currently exposed.
177     #[allow(dead_code)]
is_case_sensitive(&self, c: char) -> bool178     fn is_case_sensitive(&self, c: char) -> bool {
179         let data = self.lookup_data(c);
180         if !data.has_exception() {
181             data.is_sensitive()
182         } else {
183             let idx = data.exception_index();
184             self.exceptions.get(idx).bits.is_sensitive()
185         }
186     }
187 
/// Returns whether the character is cased, i.e. whether its data entry
/// reports a case type.
pub(crate) fn is_cased(&self, c: char) -> bool {
    self.lookup_data(c).case_type().is_some()
}
192 
/// Writes the full mapping of `c` for `kind` to `sink`, applying the
/// locale-specific tailorings for Dutch, Greek and the conditional special
/// cases before falling back to the data-driven mapping.
///
/// `context` gives access to the text surrounding `c`.
///
/// `IS_TITLE_CONTEXT` must be true if `kind` is `MappingKind::Title`.
/// The kind may be a different kind with `IS_TITLE_CONTEXT` still true because
/// titlecasing a segment involves switching to lowercase later.
#[inline(always)]
fn full_helper<const IS_TITLE_CONTEXT: bool, W: fmt::Write + ?Sized>(
    &self,
    c: char,
    context: ContextIterator,
    locale: CaseMapLocale,
    kind: MappingKind,
    sink: &mut W,
) -> fmt::Result {
    // If using a title mapping IS_TITLE_CONTEXT must be true
    debug_assert!(kind != MappingKind::Title || IS_TITLE_CONTEXT);
    // In a title context, kind MUST be Title or Lower
    debug_assert!(
        !IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower
    );

    // ICU4C's non-standard extension for Dutch IJ titlecasing
    // handled here instead of in full_lower_special_case because J does not have conditional
    // special casemapping.
    if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower {
        // When titlecasing, a J found immediately after an I at the beginning of the segment
        // should also uppercase. They are both allowed to have an acute accent but it must
        // be present on both letters or neither. They may not have any other combining marks.
        if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) {
            return sink.write_char('J');
        }
    }

    // ICU4C's non-standard extension for Greek uppercasing:
    // https://icu.unicode.org/design/case/greek-upper.
    // Effectively removes Greek accents from Greek vowels during uppercasing,
    // whilst attempting to preserve additional marks like the dialytika (diæresis)
    // and ypogegrammeni (combining small iota).
    if !IS_TITLE_CONTEXT && locale == CaseMapLocale::Greek && kind == MappingKind::Upper {
        // Remove all combining diacritics on a Greek letter.
        // Ypogegrammeni is not an accent mark and is handled by regular casemapping (it turns into
        // a capital iota).
        // The dialytika is removed here, but it may be added again when the base letter is being processed.
        if greek_to_me::is_greek_diacritic_except_ypogegrammeni(c)
            && context.preceded_by_greek_letter()
        {
            // Writing nothing drops the diacritic from the output.
            return Ok(());
        }
        let data = greek_to_me::get_data(c);
        // Check if the character is a Greek vowel
        match data {
            Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) => {
                // Get the diacritics on the character itself, and add any further combining diacritics
                // from the context.
                let mut diacritics = context.add_greek_diacritics(precomposed_diacritics);
                // If the previous vowel had an accent (which would be removed) but no dialytika,
                // and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate
                // the now-unaccented adjacent vowels from a digraph/diphthong.
                // Use a precomposed dialytika if the accent was precomposed, and a combining dialytika
                // if the accent was combining, so as to map NFD to NFD and NFC to NFC.
                if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ)
                {
                    if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() {
                        if !preceding_vowel.combining.dialytika
                            && !preceding_vowel.precomposed.dialytika
                        {
                            if preceding_vowel.combining.accented {
                                diacritics.dialytika = true;
                            } else {
                                precomposed_diacritics.dialytika =
                                    preceding_vowel.precomposed.accented;
                            }
                        }
                    }
                }
                // Write the base of the uppercased combining character sequence.
                // In most branches this is the uppercase letter with all accents removed.
                // In some branches the base has a precomposed diacritic.
                // In the case of the Greek disjunctive "or", a combining tonos may also be written.
                match vowel {
                    GreekVowel::Η => {
                        // The letter η (eta) is allowed to retain a tonos when it forms a single-letter word,
                        // to distinguish the feminine definite article ἡ (monotonic η) from the
                        // disjunctive "or" ἤ (monotonic ή).
                        //
                        // A lone η with an accent other than the oxia/tonos is not expected,
                        // so there is no need to special-case the oxia/tonos.
                        // The ancient ᾖ (exist.PRS.SUBJ.3s) has a iota subscript as well as the circumflex,
                        // so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle
                        // (e.g. να είναι) since Byzantine times anyway.
                        if diacritics.accented
                            && !context.followed_by_cased_letter(self)
                            && !context.preceded_by_cased_letter(self)
                            && !diacritics.ypogegrammeni
                        {
                            // Keep the output in the same form (precomposed vs. combining) as the input accent.
                            if precomposed_diacritics.accented {
                                sink.write_char('Ή')?;
                            } else {
                                sink.write_char('Η')?;
                                sink.write_char(greek_to_me::TONOS)?;
                            }
                        } else {
                            sink.write_char('Η')?;
                        }
                    }
                    // For iota and upsilon a precomposed dialytika is emitted as part of the base
                    // letter, so the combining dialytika below must not be written as well.
                    GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika {
                        diacritics.dialytika = false;
                        'Ϊ'
                    } else {
                        vowel.into()
                    })?,
                    GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika {
                        diacritics.dialytika = false;
                        'Ϋ'
                    } else {
                        vowel.into()
                    })?,
                    _ => sink.write_char(vowel.into())?,
                };
                if diacritics.dialytika {
                    sink.write_char(greek_to_me::DIALYTIKA)?;
                }
                // A precomposed ypogegrammeni becomes a separate capital iota.
                if precomposed_diacritics.ypogegrammeni {
                    sink.write_char('Ι')?;
                }

                return Ok(());
            }
            // Rho might have breathing marks, we handle it specially
            // to remove them
            Some(GreekPrecomposedLetterData::Consonant(true)) => {
                sink.write_char(greek_to_me::CAPITAL_RHO)?;
                return Ok(());
            }
            _ => (),
        }
    }

    // Data-driven mapping.
    let data = self.lookup_data(c);
    if !data.has_exception() {
        if data.is_relevant_to(kind) {
            let mapped = c as i32 + data.delta() as i32;
            // GIGO: delta should be valid
            let mapped = char::from_u32(mapped as u32).unwrap_or(c);
            sink.write_char(mapped)
        } else {
            sink.write_char(c)
        }
    } else {
        let idx = data.exception_index();
        let exception = self.exceptions.get(idx);
        // Conditional (context- or locale-dependent) special cases take precedence.
        if exception.bits.has_conditional_special() {
            if let Some(special) = match kind {
                MappingKind::Lower => {
                    self.full_lower_special_case::<IS_TITLE_CONTEXT>(c, context, locale)
                }
                MappingKind::Fold => self.full_fold_special_case(c, context, locale),
                MappingKind::Upper | MappingKind::Title => self
                    .full_upper_or_title_special_case::<IS_TITLE_CONTEXT>(c, context, locale),
            } {
                return special.write_to(sink);
            }
        }
        // Next, a non-empty full (string) mapping for this kind.
        if let Some(mapped_string) = exception.get_fullmappings_slot_for_kind(kind) {
            if !mapped_string.is_empty() {
                return sink.write_str(mapped_string);
            }
        }

        if kind == MappingKind::Fold && exception.bits.no_simple_case_folding() {
            return sink.write_char(c);
        }

        if data.is_relevant_to(kind) {
            if let Some(simple) = exception.get_simple_case_slot_for(c) {
                return sink.write_char(simple);
            }
        }

        // Finally the per-kind character slot, or the character itself.
        if let Some(slot_char) = exception.slot_char_for_kind(kind) {
            sink.write_char(slot_char)
        } else {
            sink.write_char(c)
        }
    }
}
376 
// These constants are used for hardcoded locale-specific foldings.
/// `i` followed by U+0307 (dot above).
const I_DOT: &'static str = "\u{69}\u{307}";
/// `j` followed by U+0307 (dot above).
const J_DOT: &'static str = "\u{6a}\u{307}";
/// U+012F (i with ogonek) followed by U+0307 (dot above).
const I_OGONEK_DOT: &'static str = "\u{12f}\u{307}";
/// `i`, U+0307 (dot above), U+0300 (grave).
const I_DOT_GRAVE: &'static str = "\u{69}\u{307}\u{300}";
/// `i`, U+0307 (dot above), U+0301 (acute).
const I_DOT_ACUTE: &'static str = "\u{69}\u{307}\u{301}";
/// `i`, U+0307 (dot above), U+0303 (tilde).
const I_DOT_TILDE: &'static str = "\u{69}\u{307}\u{303}";
384 
385     // Special case folding mappings, hardcoded.
386     // This handles the special Turkic mappings for uppercase I and dotted uppercase I
387     // For non-Turkic languages, this mapping is normally not used.
388     // For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
simple_fold_special_case(&self, c: char, options: FoldOptions) -> char389     fn simple_fold_special_case(&self, c: char, options: FoldOptions) -> char {
390         debug_assert!(c == '\u{49}' || c == '\u{130}');
391         let is_turkic = options.exclude_special_i;
392         match (c, is_turkic) {
393             // Turkic mappings
394             ('\u{49}', true) => '\u{131}', // 0049; T; 0131; # LATIN CAPITAL LETTER I
395             ('\u{130}', true) => '\u{69}', /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
396 
397             // Default mappings
398             ('\u{49}', false) => '\u{69}', // 0049; C; 0069; # LATIN CAPITAL LETTER I
399 
400             // There is no simple case folding for U+130.
401             (c, _) => c,
402         }
403     }
404 
full_lower_special_case<const IS_TITLE_CONTEXT: bool>( &self, c: char, context: ContextIterator, locale: CaseMapLocale, ) -> Option<FullMappingResult>405     fn full_lower_special_case<const IS_TITLE_CONTEXT: bool>(
406         &self,
407         c: char,
408         context: ContextIterator,
409         locale: CaseMapLocale,
410     ) -> Option<FullMappingResult> {
411         if locale == CaseMapLocale::Lithuanian {
412             // Lithuanian retains the dot in a lowercase i when followed by accents.
413             // Introduce an explicit dot above when lowercasing capital I's and J's
414             // whenever there are more accents above (of the accents used in
415             // Lithuanian: grave, acute, and tilde above).
416 
417             // Check for accents above I, J, and I-with-ogonek.
418             if c == 'I' && context.followed_by_more_above(self) {
419                 return Some(FullMappingResult::String(Self::I_DOT));
420             } else if c == 'J' && context.followed_by_more_above(self) {
421                 return Some(FullMappingResult::String(Self::J_DOT));
422             } else if c == '\u{12e}' && context.followed_by_more_above(self) {
423                 return Some(FullMappingResult::String(Self::I_OGONEK_DOT));
424             }
425 
426             // These characters are precomposed with accents above, so we don't
427             // have to look at the context.
428             if c == '\u{cc}' {
429                 return Some(FullMappingResult::String(Self::I_DOT_GRAVE));
430             } else if c == '\u{cd}' {
431                 return Some(FullMappingResult::String(Self::I_DOT_ACUTE));
432             } else if c == '\u{128}' {
433                 return Some(FullMappingResult::String(Self::I_DOT_TILDE));
434             }
435         }
436 
437         if locale == CaseMapLocale::Turkish {
438             if c == '\u{130}' {
439                 // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
440                 return Some(FullMappingResult::CodePoint('i'));
441             } else if c == '\u{307}' && context.preceded_by_capital_i::<IS_TITLE_CONTEXT>(self) {
442                 // When lowercasing, remove dot_above in the sequence I + dot_above,
443                 // which will turn into i. This matches the behaviour of the
444                 // canonically equivalent I-dot_above.
445                 //
446                 // In a titlecase context, we do not want to apply this behavior to cases where the I
447                 // was at the beginning of the string, as that I and its marks should be handled by the
448                 // uppercasing rules (which ignore it, see below)
449 
450                 return Some(FullMappingResult::Remove);
451             } else if c == 'I' && !context.followed_by_dot_above(self) {
452                 // When lowercasing, unless an I is before a dot_above, it turns
453                 // into a dotless i.
454                 return Some(FullMappingResult::CodePoint('\u{131}'));
455             }
456         }
457 
458         if c == '\u{130}' {
459             // Preserve canonical equivalence for I with dot. Turkic is handled above.
460             return Some(FullMappingResult::String(Self::I_DOT));
461         }
462 
463         if c == '\u{3a3}'
464             && context.preceded_by_cased_letter(self)
465             && !context.followed_by_cased_letter(self)
466         {
467             // Greek capital sigman maps depending on surrounding cased letters.
468             return Some(FullMappingResult::CodePoint('\u{3c2}'));
469         }
470 
471         // No relevant special case mapping. Use a normal mapping.
472         None
473     }
474 
full_upper_or_title_special_case<const IS_TITLE_CONTEXT: bool>( &self, c: char, context: ContextIterator, locale: CaseMapLocale, ) -> Option<FullMappingResult>475     fn full_upper_or_title_special_case<const IS_TITLE_CONTEXT: bool>(
476         &self,
477         c: char,
478         context: ContextIterator,
479         locale: CaseMapLocale,
480     ) -> Option<FullMappingResult> {
481         if locale == CaseMapLocale::Turkish && c == 'i' {
482             // In Turkic languages, i turns into a dotted capital I.
483             return Some(FullMappingResult::CodePoint('\u{130}'));
484         }
485         if locale == CaseMapLocale::Lithuanian
486             && c == '\u{307}'
487             && context.preceded_by_soft_dotted(self)
488         {
489             // Lithuanian retains the dot in a lowercase i when followed by accents.
490             // Remove dot_above after i with upper or titlecase.
491             return Some(FullMappingResult::Remove);
492         }
493         // ICU4C's non-standard extension for Armenian ligature ech-yiwn.
494         if c == '\u{587}' {
495             return match (locale, IS_TITLE_CONTEXT) {
496                 (CaseMapLocale::Armenian, false) => Some(FullMappingResult::String("ԵՎ")),
497                 (CaseMapLocale::Armenian, true) => Some(FullMappingResult::String("Եվ")),
498                 (_, false) => Some(FullMappingResult::String("ԵՒ")),
499                 (_, true) => Some(FullMappingResult::String("Եւ")),
500             };
501         }
502         None
503     }
504 
full_fold_special_case( &self, c: char, _context: ContextIterator, locale: CaseMapLocale, ) -> Option<FullMappingResult>505     fn full_fold_special_case(
506         &self,
507         c: char,
508         _context: ContextIterator,
509         locale: CaseMapLocale,
510     ) -> Option<FullMappingResult> {
511         let is_turkic = locale == CaseMapLocale::Turkish;
512         match (c, is_turkic) {
513             // Turkic mappings
514             ('\u{49}', true) => Some(FullMappingResult::CodePoint('\u{131}')),
515             ('\u{130}', true) => Some(FullMappingResult::CodePoint('\u{69}')),
516 
517             // Default mappings
518             ('\u{49}', false) => Some(FullMappingResult::CodePoint('\u{69}')),
519             ('\u{130}', false) => Some(FullMappingResult::String(Self::I_DOT)),
520             (_, _) => None,
521         }
522     }
/// Builds a [`FullCaseWriteable`] that applies `mapping` to `src` when written.
///
/// No mapping work happens here; the arguments are only stored.
///
/// IS_TITLE_CONTEXT is true iff the mapping is MappingKind::Title, primarily exists
/// to avoid perf impacts on other more common modes of operation
///
/// titlecase_tail_casing is only read in IS_TITLE_CONTEXT
pub(crate) fn full_helper_writeable<'a: 'data, const IS_TITLE_CONTEXT: bool>(
    &'a self,
    src: &'a str,
    locale: CaseMapLocale,
    mapping: MappingKind,
    titlecase_tail_casing: TrailingCase,
) -> FullCaseWriteable<'a, IS_TITLE_CONTEXT> {
    // Ensure that they are either both true or both false, i.e. an XNOR operation
    debug_assert!(!(IS_TITLE_CONTEXT ^ (mapping == MappingKind::Title)));

    FullCaseWriteable::<IS_TITLE_CONTEXT> {
        data: self,
        src,
        locale,
        mapping,
        titlecase_tail_casing,
    }
}
545 
546     /// Adds all simple case mappings and the full case folding for `c` to `set`.
547     /// Also adds special case closure mappings.
548     /// The character itself is not added.
549     /// For example, the mappings
550     /// - for s include long s
551     /// - for sharp s include ss
552     /// - for k include the Kelvin sign
add_case_closure_to<S: ClosureSink>(&self, c: char, set: &mut S)553     pub(crate) fn add_case_closure_to<S: ClosureSink>(&self, c: char, set: &mut S) {
554         // Hardcode the case closure of i and its relatives and ignore the
555         // data file data for these characters.
556         // The Turkic dotless i and dotted I with their case mapping conditions
557         // and case folding option make the related characters behave specially.
558         // This code matches their closure behavior to their case folding behavior.
559         match c {
560             // Regular i and I are in one equivalence class.
561             '\u{49}' => {
562                 set.add_char('\u{69}');
563                 return;
564             }
565             '\u{69}' => {
566                 set.add_char('\u{49}');
567                 return;
568             }
569 
570             // Dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>)
571             '\u{130}' => {
572                 set.add_string(Self::I_DOT);
573                 return;
574             }
575 
576             // Dotless i is in a class by itself
577             '\u{131}' => {
578                 return;
579             }
580 
581             _ => {}
582         }
583 
584         let data = self.lookup_data(c);
585         if !data.has_exception() {
586             if data.case_type().is_some() {
587                 let delta = data.delta() as i32;
588                 if delta != 0 {
589                     // Add the one simple case mapping, no matter what type it is.
590                     let codepoint = c as i32 + delta;
591                     // GIGO: delta should be valid
592                     let mapped = char::from_u32(codepoint as u32).unwrap_or(c);
593                     set.add_char(mapped);
594                 }
595             }
596             return;
597         }
598 
599         // c has exceptions, so there may be multiple simple and/or full case mappings.
600         let idx = data.exception_index();
601         let exception = self.exceptions.get(idx);
602 
603         // Add all simple case mappings.
604         for slot in [
605             ExceptionSlot::Lower,
606             ExceptionSlot::Fold,
607             ExceptionSlot::Upper,
608             ExceptionSlot::Title,
609         ] {
610             if let Some(simple) = exception.get_char_slot(slot) {
611                 set.add_char(simple);
612             }
613         }
614         if let Some(simple) = exception.get_simple_case_slot_for(c) {
615             set.add_char(simple);
616         }
617 
618         exception.add_full_and_closure_mappings(set);
619     }
620 
621     /// Maps the string to single code points and adds the associated case closure
622     /// mappings.
623     ///
624     /// (see docs on CaseMapper::add_string_case_closure_to)
add_string_case_closure_to<S: ClosureSink>( &self, s: &str, set: &mut S, unfold_data: &CaseMapUnfold, ) -> bool625     pub(crate) fn add_string_case_closure_to<S: ClosureSink>(
626         &self,
627         s: &str,
628         set: &mut S,
629         unfold_data: &CaseMapUnfold,
630     ) -> bool {
631         if s.chars().count() <= 1 {
632             // The string is too short to find any match.
633             return false;
634         }
635         match unfold_data.get(s) {
636             Some(closure_string) => {
637                 for c in closure_string.chars() {
638                     set.add_char(c);
639                     self.add_case_closure_to(c, set);
640                 }
641                 true
642             }
643             None => false,
644         }
645     }
646 }
647 
/// An internal representation of locale. Non-Root values of this
/// enumeration imply that hard-coded special cases exist for this
/// language.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum CaseMapLocale {
    /// Any language without hard-coded special cases.
    Root,
    /// Selected for the `tr` and `az` language subtags.
    Turkish,
    /// Selected for the `lt` language subtag.
    Lithuanian,
    /// Selected for the `el` language subtag.
    Greek,
    /// Selected for the `nl` language subtag.
    Dutch,
    /// Selected for the `hy` language subtag.
    Armenian,
}

impl CaseMapLocale {
    /// Picks the tailoring for `langid`.
    ///
    /// Only the language subtag is inspected; every language not listed
    /// below maps to [`CaseMapLocale::Root`].
    pub const fn from_langid(langid: &LanguageIdentifier) -> Self {
        use icu_locale_core::subtags::{language, Language};
        const TR: Language = language!("tr");
        const AZ: Language = language!("az");
        const LT: Language = language!("lt");
        const EL: Language = language!("el");
        const NL: Language = language!("nl");
        const HY: Language = language!("hy");
        match langid.language {
            TR | AZ => Self::Turkish,
            LT => Self::Lithuanian,
            EL => Self::Greek,
            NL => Self::Dutch,
            HY => Self::Armenian,
            _ => Self::Root,
        }
    }
}
680 
/// A mapping result that is either empty, a single code point, or a borrowed
/// string.
pub enum FullMappingResult<'a> {
    /// Nothing is produced.
    Remove,
    /// Exactly one code point is produced.
    CodePoint(char),
    /// A string is produced.
    String(&'a str),
}
686 
687 impl FullMappingResult<'_> {
688     #[allow(dead_code)]
add_to_set<S: ClosureSink>(&self, set: &mut S)689     fn add_to_set<S: ClosureSink>(&self, set: &mut S) {
690         match *self {
691             FullMappingResult::CodePoint(c) => set.add_char(c),
692             FullMappingResult::String(s) => set.add_string(s),
693             FullMappingResult::Remove => {}
694         }
695     }
696 }
697 
698 impl Writeable for FullMappingResult<'_> {
write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result699     fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
700         match *self {
701             FullMappingResult::CodePoint(c) => sink.write_char(c),
702             FullMappingResult::String(s) => sink.write_str(s),
703             FullMappingResult::Remove => Ok(()),
704         }
705     }
706 }
707 
/// The text surrounding one character, split into what comes before it and
/// what comes after it.
pub(crate) struct ContextIterator<'a> {
    /// Text preceding the character.
    before: &'a str,
    /// Text following the character (the character itself is not included).
    after: &'a str,
}
712 
713 impl<'a> ContextIterator<'a> {
714     // Returns a context iterator with the characters before
715     // and after the character at a given index, given the preceding
716     // string and the succeeding string including the character itself
new(before: &'a str, char_and_after: &'a str) -> Self717     pub fn new(before: &'a str, char_and_after: &'a str) -> Self {
718         let mut char_and_after = char_and_after.chars();
719         char_and_after.next(); // skip the character itself
720         let after = char_and_after.as_str();
721         Self { before, after }
722     }
723 
add_greek_diacritics(&self, mut diacritics: GreekDiacritics) -> GreekDiacritics724     fn add_greek_diacritics(&self, mut diacritics: GreekDiacritics) -> GreekDiacritics {
725         diacritics.consume_greek_diacritics(self.after);
726         diacritics
727     }
728 
preceded_by_greek_letter(&self) -> bool729     fn preceded_by_greek_letter(&self) -> bool {
730         greek_to_me::preceded_by_greek_letter(self.before)
731     }
732 
preceding_greek_vowel_diacritics( &self, ) -> Option<GreekCombiningCharacterSequenceDiacritics>733     fn preceding_greek_vowel_diacritics(
734         &self,
735     ) -> Option<GreekCombiningCharacterSequenceDiacritics> {
736         greek_to_me::preceding_greek_vowel_diacritics(self.before)
737     }
738 
preceded_by_soft_dotted(&self, mapping: &CaseMap) -> bool739     fn preceded_by_soft_dotted(&self, mapping: &CaseMap) -> bool {
740         for c in self.before.chars().rev() {
741             match mapping.dot_type(c) {
742                 DotType::SoftDotted => return true,
743                 DotType::OtherAccent => continue,
744                 _ => return false,
745             }
746         }
747         false
748     }
749     /// Checks if the preceding character is a capital I, allowing for non-Above combining characters in between.
750     ///
751     /// If I_MUST_NOT_START_STRING is true, additionally will require that the capital I does not start the string
preceded_by_capital_i<const I_MUST_NOT_START_STRING: bool>( &self, mapping: &CaseMap, ) -> bool752     fn preceded_by_capital_i<const I_MUST_NOT_START_STRING: bool>(
753         &self,
754         mapping: &CaseMap,
755     ) -> bool {
756         let mut iter = self.before.chars().rev();
757         while let Some(c) = iter.next() {
758             if c == 'I' {
759                 if I_MUST_NOT_START_STRING {
760                     return iter.next().is_some();
761                 } else {
762                     return true;
763                 }
764             }
765             if mapping.dot_type(c) != DotType::OtherAccent {
766                 break;
767             }
768         }
769         false
770     }
preceded_by_cased_letter(&self, mapping: &CaseMap) -> bool771     fn preceded_by_cased_letter(&self, mapping: &CaseMap) -> bool {
772         for c in self.before.chars().rev() {
773             let data = mapping.lookup_data(c);
774             if !data.is_ignorable() {
775                 return data.case_type().is_some();
776             }
777         }
778         false
779     }
followed_by_cased_letter(&self, mapping: &CaseMap) -> bool780     fn followed_by_cased_letter(&self, mapping: &CaseMap) -> bool {
781         for c in self.after.chars() {
782             let data = mapping.lookup_data(c);
783             if !data.is_ignorable() {
784                 return data.case_type().is_some();
785             }
786         }
787         false
788     }
followed_by_more_above(&self, mapping: &CaseMap) -> bool789     fn followed_by_more_above(&self, mapping: &CaseMap) -> bool {
790         for c in self.after.chars() {
791             match mapping.dot_type(c) {
792                 DotType::Above => return true,
793                 DotType::OtherAccent => continue,
794                 _ => return false,
795             }
796         }
797         false
798     }
followed_by_dot_above(&self, mapping: &CaseMap) -> bool799     fn followed_by_dot_above(&self, mapping: &CaseMap) -> bool {
800         for c in self.after.chars() {
801             if c == '\u{307}' {
802                 return true;
803             }
804             if mapping.dot_type(c) != DotType::OtherAccent {
805                 return false;
806             }
807         }
808         false
809     }
810 
811     /// Checks the preceding and surrounding context of a j or J
812     /// and returns true if it is preceded by an i or I at the start of the string.
813     /// If one has an acute accent,
814     /// both must have the accent for this to return true. No other accents are handled.
is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMap) -> bool815     fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMap) -> bool {
816         let mut before = self.before.chars().rev();
817         let mut i_has_acute = false;
818         loop {
819             match before.next() {
820                 Some('i') | Some('I') => break,
821                 Some('í') | Some('Í') => {
822                     i_has_acute = true;
823                     break;
824                 }
825                 Some(ACUTE) => i_has_acute = true,
826                 _ => return false,
827             }
828         }
829 
830         if before.next().is_some() {
831             // not at the beginning of a string, doesn't matter
832             return false;
833         }
834         let mut j_has_acute = false;
835         for c in self.after.chars() {
836             if c == ACUTE {
837                 j_has_acute = true;
838                 continue;
839             }
840             // We are supposed to check that `j` has no other combining marks aside
841             // from potentially an acute accent. Once we hit the first non-combining mark
842             // we are done.
843             //
844             // ICU4C checks for `gc=Mn` to determine if something is a combining mark,
845             // however this requires extra data (and is the *only* point in the casemapping algorithm
846             // where there is a direct dependency on properties data not mediated by the casemapping data trie).
847             //
848             // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
849             //
850             // See https://unicode-org.atlassian.net/browse/ICU-22429
851             match mapping.dot_type(c) {
852                 // Not a combining character; ccc = 0
853                 DotType::NoDot | DotType::SoftDotted => break,
854                 // found combining character, bail
855                 _ => return false,
856             }
857         }
858 
859         // either both should have an acute accent, or none. this is an XNOR operation
860         !(j_has_acute ^ i_has_acute)
861     }
862 }
863