1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! This module contains most of the actual algorithms for case mapping. 6 //! 7 //! Primarily, it implements methods on `CaseMap`, which contains the data model. 8 9 use crate::greek_to_me::{ 10 self, GreekCombiningCharacterSequenceDiacritics, GreekDiacritics, GreekPrecomposedLetterData, 11 GreekVowel, 12 }; 13 use crate::provider::data::{DotType, MappingKind}; 14 use crate::provider::exception_helpers::ExceptionSlot; 15 use crate::provider::{CaseMap, CaseMapUnfold}; 16 use crate::set::ClosureSink; 17 use crate::titlecase::TrailingCase; 18 use core::fmt; 19 use icu_locale_core::LanguageIdentifier; 20 use writeable::Writeable; 21 22 const ACUTE: char = '\u{301}'; 23 24 // Used to control the behavior of CaseMapper::fold. 25 // Currently only used to decide whether to use Turkic (T) mappings for dotted/dotless i. 
26 #[derive(Copy, Clone, Default)] 27 pub(crate) struct FoldOptions { 28 exclude_special_i: bool, 29 } 30 31 impl FoldOptions { with_turkic_mappings() -> Self32 pub fn with_turkic_mappings() -> Self { 33 Self { 34 exclude_special_i: true, 35 } 36 } 37 } 38 39 /// Helper type that wraps a writeable in a prefix string 40 pub(crate) struct StringAndWriteable<'a, W> { 41 pub string: &'a str, 42 pub writeable: W, 43 } 44 45 impl<Wr: Writeable> Writeable for StringAndWriteable<'_, Wr> { write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result46 fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result { 47 sink.write_str(self.string)?; 48 self.writeable.write_to(sink) 49 } writeable_length_hint(&self) -> writeable::LengthHint50 fn writeable_length_hint(&self) -> writeable::LengthHint { 51 writeable::LengthHint::exact(self.string.len()) + self.writeable.writeable_length_hint() 52 } 53 } 54 55 pub(crate) struct FullCaseWriteable<'a, const IS_TITLE_CONTEXT: bool> { 56 data: &'a CaseMap<'a>, 57 src: &'a str, 58 locale: CaseMapLocale, 59 mapping: MappingKind, 60 titlecase_tail_casing: TrailingCase, 61 } 62 63 impl<const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'_, IS_TITLE_CONTEXT> { 64 #[allow(clippy::indexing_slicing)] // last_uncopied_index and i are known to be in bounds write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result65 fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result { 66 let src = self.src; 67 let mut mapping = self.mapping; 68 let mut iter = src.char_indices(); 69 for (i, c) in &mut iter { 70 let context = ContextIterator::new(&src[..i], &src[i..]); 71 self.data 72 .full_helper::<IS_TITLE_CONTEXT, W>(c, context, self.locale, mapping, sink)?; 73 if IS_TITLE_CONTEXT { 74 if self.titlecase_tail_casing == TrailingCase::Lower { 75 mapping = MappingKind::Lower; 76 } else { 77 break; 78 } 79 } 80 } 81 // Write the rest of the string 82 if IS_TITLE_CONTEXT && self.titlecase_tail_casing == 
TrailingCase::Unchanged { 83 sink.write_str(iter.as_str())?; 84 } 85 Ok(()) 86 } writeable_length_hint(&self) -> writeable::LengthHint87 fn writeable_length_hint(&self) -> writeable::LengthHint { 88 writeable::LengthHint::at_least(self.src.len()) 89 } 90 } 91 92 impl<'data> CaseMap<'data> { simple_helper(&self, c: char, kind: MappingKind) -> char93 fn simple_helper(&self, c: char, kind: MappingKind) -> char { 94 let data = self.lookup_data(c); 95 if !data.has_exception() { 96 if data.is_relevant_to(kind) { 97 let folded = c as i32 + data.delta() as i32; 98 // GIGO: delta should be valid 99 char::from_u32(folded as u32).unwrap_or(c) 100 } else { 101 c 102 } 103 } else { 104 let idx = data.exception_index(); 105 let exception = self.exceptions.get(idx); 106 if data.is_relevant_to(kind) { 107 if let Some(simple) = exception.get_simple_case_slot_for(c) { 108 return simple; 109 } 110 } 111 exception.slot_char_for_kind(kind).unwrap_or(c) 112 } 113 } 114 115 // Returns the lowercase mapping of the given `char`. 116 #[inline] simple_lower(&self, c: char) -> char117 pub(crate) fn simple_lower(&self, c: char) -> char { 118 self.simple_helper(c, MappingKind::Lower) 119 } 120 121 // Returns the uppercase mapping of the given `char`. 122 #[inline] simple_upper(&self, c: char) -> char123 pub(crate) fn simple_upper(&self, c: char) -> char { 124 self.simple_helper(c, MappingKind::Upper) 125 } 126 127 // Returns the titlecase mapping of the given `char`. 128 #[inline] simple_title(&self, c: char) -> char129 pub(crate) fn simple_title(&self, c: char) -> char { 130 self.simple_helper(c, MappingKind::Title) 131 } 132 133 // Return the simple case folding mapping of the given char. 
134 #[inline] simple_fold(&self, c: char, options: FoldOptions) -> char135 pub(crate) fn simple_fold(&self, c: char, options: FoldOptions) -> char { 136 let data = self.lookup_data(c); 137 if !data.has_exception() { 138 if data.is_upper_or_title() { 139 let folded = c as i32 + data.delta() as i32; 140 // GIGO: delta should be valid 141 char::from_u32(folded as u32).unwrap_or(c) 142 } else { 143 c 144 } 145 } else { 146 // TODO: if we move conditional fold and no_simple_case_folding into 147 // simple_helper, this function can just call simple_helper. 148 let idx = data.exception_index(); 149 let exception = self.exceptions.get(idx); 150 if exception.bits.has_conditional_fold() { 151 self.simple_fold_special_case(c, options) 152 } else if exception.bits.no_simple_case_folding() { 153 c 154 } else if data.is_upper_or_title() && exception.has_slot(ExceptionSlot::Delta) { 155 // unwrap_or case should never happen but best to avoid panics 156 exception.get_simple_case_slot_for(c).unwrap_or('\0') 157 } else if let Some(slot_char) = exception.slot_char_for_kind(MappingKind::Fold) { 158 slot_char 159 } else { 160 c 161 } 162 } 163 } 164 dot_type(&self, c: char) -> DotType165 fn dot_type(&self, c: char) -> DotType { 166 let data = self.lookup_data(c); 167 if !data.has_exception() { 168 data.dot_type() 169 } else { 170 let idx = data.exception_index(); 171 self.exceptions.get(idx).bits.dot_type() 172 } 173 } 174 175 // Returns true if this code point is is case-sensitive. 176 // This is not currently exposed. 
177 #[allow(dead_code)] is_case_sensitive(&self, c: char) -> bool178 fn is_case_sensitive(&self, c: char) -> bool { 179 let data = self.lookup_data(c); 180 if !data.has_exception() { 181 data.is_sensitive() 182 } else { 183 let idx = data.exception_index(); 184 self.exceptions.get(idx).bits.is_sensitive() 185 } 186 } 187 188 /// Returns whether the character is cased is_cased(&self, c: char) -> bool189 pub(crate) fn is_cased(&self, c: char) -> bool { 190 self.lookup_data(c).case_type().is_some() 191 } 192 193 #[inline(always)] 194 // IS_TITLE_CONTEXT must be true if kind is MappingKind::Title 195 // The kind may be a different kind with IS_TITLE_CONTEXT still true because 196 // titlecasing a segment involves switching to lowercase later full_helper<const IS_TITLE_CONTEXT: bool, W: fmt::Write + ?Sized>( &self, c: char, context: ContextIterator, locale: CaseMapLocale, kind: MappingKind, sink: &mut W, ) -> fmt::Result197 fn full_helper<const IS_TITLE_CONTEXT: bool, W: fmt::Write + ?Sized>( 198 &self, 199 c: char, 200 context: ContextIterator, 201 locale: CaseMapLocale, 202 kind: MappingKind, 203 sink: &mut W, 204 ) -> fmt::Result { 205 // If using a title mapping IS_TITLE_CONTEXT must be true 206 debug_assert!(kind != MappingKind::Title || IS_TITLE_CONTEXT); 207 // In a title context, kind MUST be Title or Lower 208 debug_assert!( 209 !IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower 210 ); 211 212 // ICU4C's non-standard extension for Dutch IJ titlecasing 213 // handled here instead of in full_lower_special_case because J does not have conditional 214 // special casemapping. 215 if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower { 216 // When titlecasing, a J found immediately after an I at the beginning of the segment 217 // should also uppercase. They are both allowed to have an acute accent but it must 218 // be present on both letters or neither. They may not have any other combining marks. 
219 if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) { 220 return sink.write_char('J'); 221 } 222 } 223 224 // ICU4C's non-standard extension for Greek uppercasing: 225 // https://icu.unicode.org/design/case/greek-upper. 226 // Effectively removes Greek accents from Greek vowels during uppercasing, 227 // whilst attempting to preserve additional marks like the dialytika (diæresis) 228 // and ypogegrammeni (combining small iota). 229 if !IS_TITLE_CONTEXT && locale == CaseMapLocale::Greek && kind == MappingKind::Upper { 230 // Remove all combining diacritics on a Greek letter. 231 // Ypogegrammeni is not an accent mark and is handled by regular casemapping (it turns into 232 // a capital iota). 233 // The dialytika is removed here, but it may be added again when the base letter is being processed. 234 if greek_to_me::is_greek_diacritic_except_ypogegrammeni(c) 235 && context.preceded_by_greek_letter() 236 { 237 return Ok(()); 238 } 239 let data = greek_to_me::get_data(c); 240 // Check if the character is a Greek vowel 241 match data { 242 Some(GreekPrecomposedLetterData::Vowel(vowel, mut precomposed_diacritics)) => { 243 // Get the diacritics on the character itself, and add any further combining diacritics 244 // from the context. 245 let mut diacritics = context.add_greek_diacritics(precomposed_diacritics); 246 // If the previous vowel had an accent (which would be removed) but no dialytika, 247 // and this is an iota or upsilon, add a dialytika since it is necessary to disambiguate 248 // the now-unaccented adjacent vowels from a digraph/diphthong. 249 // Use a precomposed dialytika if the accent was precomposed, and a combining dialytika 250 // if the accent was combining, so as to map NFD to NFD and NFC to NFC. 
251 if !diacritics.dialytika && (vowel == GreekVowel::Ι || vowel == GreekVowel::Υ) 252 { 253 if let Some(preceding_vowel) = context.preceding_greek_vowel_diacritics() { 254 if !preceding_vowel.combining.dialytika 255 && !preceding_vowel.precomposed.dialytika 256 { 257 if preceding_vowel.combining.accented { 258 diacritics.dialytika = true; 259 } else { 260 precomposed_diacritics.dialytika = 261 preceding_vowel.precomposed.accented; 262 } 263 } 264 } 265 } 266 // Write the base of the uppercased combining character sequence. 267 // In most branches this is [`upper_base`], i.e., the uppercase letter with all accents removed. 268 // In some branches the base has a precomposed diacritic. 269 // In the case of the Greek disjunctive "or", a combining tonos may also be written. 270 match vowel { 271 GreekVowel::Η => { 272 // The letter η (eta) is allowed to retain a tonos when it is form a single-letter word to distinguish 273 // the feminine definite article ἡ (monotonic η) from the disjunctive "or" ἤ (monotonic ή). 274 // 275 // A lone η with an accent other than the oxia/tonos is not expected, 276 // so there is no need to special-case the oxia/tonos. 277 // The ancient ᾖ (exist.PRS.SUBJ.3s) has a iota subscript as well as the circumflex, 278 // so it would not be given an oxia/tonos under this rule, and the subjunctive is formed with a particle 279 // (e.g. να είναι) since Byzantine times anyway. 
280 if diacritics.accented 281 && !context.followed_by_cased_letter(self) 282 && !context.preceded_by_cased_letter(self) 283 && !diacritics.ypogegrammeni 284 { 285 if precomposed_diacritics.accented { 286 sink.write_char('Ή')?; 287 } else { 288 sink.write_char('Η')?; 289 sink.write_char(greek_to_me::TONOS)?; 290 } 291 } else { 292 sink.write_char('Η')?; 293 } 294 } 295 GreekVowel::Ι => sink.write_char(if precomposed_diacritics.dialytika { 296 diacritics.dialytika = false; 297 'Ϊ' 298 } else { 299 vowel.into() 300 })?, 301 GreekVowel::Υ => sink.write_char(if precomposed_diacritics.dialytika { 302 diacritics.dialytika = false; 303 'Ϋ' 304 } else { 305 vowel.into() 306 })?, 307 _ => sink.write_char(vowel.into())?, 308 }; 309 if diacritics.dialytika { 310 sink.write_char(greek_to_me::DIALYTIKA)?; 311 } 312 if precomposed_diacritics.ypogegrammeni { 313 sink.write_char('Ι')?; 314 } 315 316 return Ok(()); 317 } 318 // Rho might have breathing marks, we handle it specially 319 // to remove them 320 Some(GreekPrecomposedLetterData::Consonant(true)) => { 321 sink.write_char(greek_to_me::CAPITAL_RHO)?; 322 return Ok(()); 323 } 324 _ => (), 325 } 326 } 327 328 let data = self.lookup_data(c); 329 if !data.has_exception() { 330 if data.is_relevant_to(kind) { 331 let mapped = c as i32 + data.delta() as i32; 332 // GIGO: delta should be valid 333 let mapped = char::from_u32(mapped as u32).unwrap_or(c); 334 sink.write_char(mapped) 335 } else { 336 sink.write_char(c) 337 } 338 } else { 339 let idx = data.exception_index(); 340 let exception = self.exceptions.get(idx); 341 if exception.bits.has_conditional_special() { 342 if let Some(special) = match kind { 343 MappingKind::Lower => { 344 self.full_lower_special_case::<IS_TITLE_CONTEXT>(c, context, locale) 345 } 346 MappingKind::Fold => self.full_fold_special_case(c, context, locale), 347 MappingKind::Upper | MappingKind::Title => self 348 .full_upper_or_title_special_case::<IS_TITLE_CONTEXT>(c, context, locale), 349 } { 350 return 
special.write_to(sink); 351 } 352 } 353 if let Some(mapped_string) = exception.get_fullmappings_slot_for_kind(kind) { 354 if !mapped_string.is_empty() { 355 return sink.write_str(mapped_string); 356 } 357 } 358 359 if kind == MappingKind::Fold && exception.bits.no_simple_case_folding() { 360 return sink.write_char(c); 361 } 362 363 if data.is_relevant_to(kind) { 364 if let Some(simple) = exception.get_simple_case_slot_for(c) { 365 return sink.write_char(simple); 366 } 367 } 368 369 if let Some(slot_char) = exception.slot_char_for_kind(kind) { 370 sink.write_char(slot_char) 371 } else { 372 sink.write_char(c) 373 } 374 } 375 } 376 377 // These constants are used for hardcoded locale-specific foldings. 378 const I_DOT: &'static str = "\u{69}\u{307}"; 379 const J_DOT: &'static str = "\u{6a}\u{307}"; 380 const I_OGONEK_DOT: &'static str = "\u{12f}\u{307}"; 381 const I_DOT_GRAVE: &'static str = "\u{69}\u{307}\u{300}"; 382 const I_DOT_ACUTE: &'static str = "\u{69}\u{307}\u{301}"; 383 const I_DOT_TILDE: &'static str = "\u{69}\u{307}\u{303}"; 384 385 // Special case folding mappings, hardcoded. 386 // This handles the special Turkic mappings for uppercase I and dotted uppercase I 387 // For non-Turkic languages, this mapping is normally not used. 388 // For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. 
fn simple_fold_special_case(&self, c: char, options: FoldOptions) -> char {
    debug_assert!(c == '\u{49}' || c == '\u{130}');
    if options.exclude_special_i {
        // Turkic mappings
        match c {
            '\u{49}' => '\u{131}', // 0049; T; 0131; # LATIN CAPITAL LETTER I
            '\u{130}' => '\u{69}', // 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
            other => other,
        }
    } else {
        // Default mappings
        match c {
            '\u{49}' => '\u{69}', // 0049; C; 0069; # LATIN CAPITAL LETTER I
            // There is no simple case folding for U+130.
            other => other,
        }
    }
}

// Locale- and context-dependent lowercase mappings. Returns `None` when no
// special case applies and the regular mapping should be used.
fn full_lower_special_case<const IS_TITLE_CONTEXT: bool>(
    &self,
    c: char,
    context: ContextIterator,
    locale: CaseMapLocale,
) -> Option<FullMappingResult> {
    if locale == CaseMapLocale::Lithuanian {
        // Lithuanian retains the dot in a lowercase i when followed by accents.
        // Introduce an explicit dot above when lowercasing capital I's and J's
        // whenever there are more accents above (of the accents used in
        // Lithuanian: grave, acute, and tilde above).
        let with_explicit_dot = match c {
            // I, J, and I-with-ogonek: only when more accents above follow.
            'I' if context.followed_by_more_above(self) => Some(Self::I_DOT),
            'J' if context.followed_by_more_above(self) => Some(Self::J_DOT),
            '\u{12e}' if context.followed_by_more_above(self) => Some(Self::I_OGONEK_DOT),
            // These characters are precomposed with accents above, so the
            // context does not need to be inspected.
            '\u{cc}' => Some(Self::I_DOT_GRAVE),
            '\u{cd}' => Some(Self::I_DOT_ACUTE),
            '\u{128}' => Some(Self::I_DOT_TILDE),
            _ => None,
        };
        if let Some(mapped) = with_explicit_dot {
            return Some(FullMappingResult::String(mapped));
        }
    }

    if locale == CaseMapLocale::Turkish {
        match c {
            // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
            '\u{130}' => return Some(FullMappingResult::CodePoint('i')),
            // When lowercasing, remove dot_above in the sequence I + dot_above,
            // which will turn into i. This matches the behaviour of the
            // canonically equivalent I-dot_above.
            //
            // In a titlecase context, this must not apply when the I was at the
            // beginning of the string, as that I and its marks should be handled by
            // the uppercasing rules (which ignore it).
            '\u{307}' if context.preceded_by_capital_i::<IS_TITLE_CONTEXT>(self) => {
                return Some(FullMappingResult::Remove);
            }
            // When lowercasing, unless an I is before a dot_above, it turns
            // into a dotless i.
            'I' if !context.followed_by_dot_above(self) => {
                return Some(FullMappingResult::CodePoint('\u{131}'));
            }
            _ => {}
        }
    }

    if c == '\u{130}' {
        // Preserve canonical equivalence for I with dot. Turkic is handled above.
        return Some(FullMappingResult::String(Self::I_DOT));
    }

    // Greek capital sigma maps depending on surrounding cased letters.
    let is_final_sigma = c == '\u{3a3}'
        && context.preceded_by_cased_letter(self)
        && !context.followed_by_cased_letter(self);
    if is_final_sigma {
        return Some(FullMappingResult::CodePoint('\u{3c2}'));
    }

    // No relevant special case mapping. Use a normal mapping.
    None
}

// Locale- and context-dependent uppercase/titlecase mappings. Returns `None`
// when no special case applies.
fn full_upper_or_title_special_case<const IS_TITLE_CONTEXT: bool>(
    &self,
    c: char,
    context: ContextIterator,
    locale: CaseMapLocale,
) -> Option<FullMappingResult> {
    if c == 'i' && locale == CaseMapLocale::Turkish {
        // In Turkic languages, i turns into a dotted capital I.
        return Some(FullMappingResult::CodePoint('\u{130}'));
    }
    if locale == CaseMapLocale::Lithuanian
        && c == '\u{307}'
        && context.preceded_by_soft_dotted(self)
    {
        // Lithuanian retains the dot in a lowercase i when followed by accents.
        // Remove dot_above after i with upper or titlecase.
        return Some(FullMappingResult::Remove);
    }
    if c != '\u{587}' {
        return None;
    }
    // ICU4C's non-standard extension for Armenian ligature ech-yiwn.
    let is_armenian = locale == CaseMapLocale::Armenian;
    let expansion = match (is_armenian, IS_TITLE_CONTEXT) {
        (true, false) => "ԵՎ",
        (true, true) => "Եվ",
        (false, false) => "ԵՒ",
        (false, true) => "Եւ",
    };
    Some(FullMappingResult::String(expansion))
}

// Hardcoded full case foldings for U+0049 and U+0130; `None` for anything else.
fn full_fold_special_case(
    &self,
    c: char,
    _context: ContextIterator,
    locale: CaseMapLocale,
) -> Option<FullMappingResult> {
    let is_turkic = locale == CaseMapLocale::Turkish;
    match c {
        '\u{49}' => {
            let folded = if is_turkic { '\u{131}' } else { '\u{69}' };
            Some(FullMappingResult::CodePoint(folded))
        }
        '\u{130}' => Some(if is_turkic {
            FullMappingResult::CodePoint('\u{69}')
        } else {
            FullMappingResult::String(Self::I_DOT)
        }),
        _ => None,
    }
}

/// IS_TITLE_CONTEXT is true iff the mapping is MappingKind::Title; it primarily exists
/// to avoid perf impacts on other more common modes of operation
///
/// titlecase_tail_casing is only read in IS_TITLE_CONTEXT
pub(crate) fn full_helper_writeable<'a: 'data, const IS_TITLE_CONTEXT: bool>(
    &'a self,
    src: &'a str,
    locale: CaseMapLocale,
    mapping: MappingKind,
    titlecase_tail_casing: TrailingCase,
) -> FullCaseWriteable<'a, IS_TITLE_CONTEXT> {
    // Ensure that they are either both true or both false, i.e. an XNOR operation
    debug_assert!(!(IS_TITLE_CONTEXT ^ (mapping == MappingKind::Title)));

    let data = self;
    FullCaseWriteable {
        data,
        src,
        locale,
        mapping,
        titlecase_tail_casing,
    }
}

/// Adds all simple case mappings and the full case folding for `c` to `set`.
/// Also adds special case closure mappings.
/// The character itself is not added.
/// For example, the mappings
/// - for s include long s
/// - for sharp s include ss
/// - for k include the Kelvin sign
pub(crate) fn add_case_closure_to<S: ClosureSink>(&self, c: char, set: &mut S) {
    // Hardcode the case closure of i and its relatives and ignore the
    // data file data for these characters.
    // The Turkic dotless i and dotted I with their case mapping conditions
    // and case folding option make the related characters behave specially.
    // This code matches their closure behavior to their case folding behavior.
    let handled_as_i_relative = match c {
        // Regular i and I are in one equivalence class.
        '\u{49}' => {
            set.add_char('\u{69}');
            true
        }
        '\u{69}' => {
            set.add_char('\u{49}');
            true
        }
        // Dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>)
        '\u{130}' => {
            set.add_string(Self::I_DOT);
            true
        }
        // Dotless i is in a class by itself
        '\u{131}' => true,
        _ => false,
    };
    if handled_as_i_relative {
        return;
    }

    let data = self.lookup_data(c);
    if !data.has_exception() {
        if data.case_type().is_some() {
            let delta = data.delta() as i32;
            if delta != 0 {
                // Add the one simple case mapping, no matter what type it is.
                let shifted = c as i32 + delta;
                // GIGO: delta should be valid
                set.add_char(char::from_u32(shifted as u32).unwrap_or(c));
            }
        }
        return;
    }

    // c has exceptions, so there may be multiple simple and/or full case mappings.
    let exception = self.exceptions.get(data.exception_index());

    // Add all simple case mappings.
    let simple_slots = [
        ExceptionSlot::Lower,
        ExceptionSlot::Fold,
        ExceptionSlot::Upper,
        ExceptionSlot::Title,
    ];
    for slot in simple_slots {
        if let Some(simple) = exception.get_char_slot(slot) {
            set.add_char(simple);
        }
    }
    if let Some(simple) = exception.get_simple_case_slot_for(c) {
        set.add_char(simple);
    }

    exception.add_full_and_closure_mappings(set);
}

/// Maps the string to single code points and adds the associated case closure
/// mappings.
///
/// (see docs on CaseMapper::add_string_case_closure_to)
pub(crate) fn add_string_case_closure_to<S: ClosureSink>(
    &self,
    s: &str,
    set: &mut S,
    unfold_data: &CaseMapUnfold,
) -> bool {
    // A string with fewer than two characters is too short to find any match.
    if s.chars().nth(1).is_none() {
        return false;
    }
    if let Some(closure_string) = unfold_data.get(s) {
        for c in closure_string.chars() {
            set.add_char(c);
            self.add_case_closure_to(c, set);
        }
        true
    } else {
        false
    }
}
} // closes the `impl CaseMap` block

// An internal representation of locale. Non-Root values of this
// enumeration imply that hard-coded special cases exist for this
// language.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum CaseMapLocale {
    Root,
    Turkish,
    Lithuanian,
    Greek,
    Dutch,
    Armenian,
}

impl CaseMapLocale {
    /// Picks the special-casing bucket from the language subtag alone;
    /// `tr` and `az` share `Turkish`, unlisted languages map to `Root`.
    pub const fn from_langid(langid: &LanguageIdentifier) -> Self {
        use icu_locale_core::subtags::{language, Language};
        const TURKISH: Language = language!("tr");
        const AZERBAIJANI: Language = language!("az");
        const LITHUANIAN: Language = language!("lt");
        const GREEK: Language = language!("el");
        const DUTCH: Language = language!("nl");
        const ARMENIAN: Language = language!("hy");
        match langid.language {
            TURKISH | AZERBAIJANI => Self::Turkish,
            LITHUANIAN => Self::Lithuanian,
            GREEK => Self::Greek,
            DUTCH => Self::Dutch,
            ARMENIAN => Self::Armenian,
            _ => Self::Root,
        }
    }
}

/// Outcome of a special-case mapping: drop the character, or replace it
/// with a single code point or with a string.
pub enum FullMappingResult<'a> {
    Remove,
    CodePoint(char),
    String(&'a str),
}

impl FullMappingResult<'_> {
    #[allow(dead_code)]
    fn add_to_set<S: ClosureSink>(&self, set: &mut S) {
        match *self {
            Self::Remove => {}
            Self::CodePoint(c) => set.add_char(c),
            Self::String(s) => set.add_string(s),
        }
    }
}

impl Writeable for FullMappingResult<'_> {
    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
        match *self {
            Self::Remove => Ok(()),
            Self::CodePoint(c) => sink.write_char(c),
            Self::String(s) => sink.write_str(s),
        }
    }
}

/// The text surrounding the character currently being mapped.
pub(crate) struct ContextIterator<'a> {
    before: &'a str,
    after: &'a str,
}

impl<'a> ContextIterator<'a> {
    // Returns a context iterator with the characters before
    // and after the character at a given index, given the preceding
    // string and the succeeding string including the character itself
    pub fn new(before: &'a str, char_and_after: &'a str) -> Self {
        let mut rest = char_and_after.chars();
        // Skip the character itself; what remains is the "after" context.
        let _current = rest.next();
        Self {
            before,
            after: rest.as_str(),
        }
    }

    // Folds the Greek diacritics found in the following context into `diacritics`.
    fn add_greek_diacritics(&self, mut diacritics: GreekDiacritics) -> GreekDiacritics {
        diacritics.consume_greek_diacritics(self.after);
        diacritics
    }

    fn preceded_by_greek_letter(&self) -> bool {
        greek_to_me::preceded_by_greek_letter(self.before)
    }

    fn preceding_greek_vowel_diacritics(
        &self,
    ) -> Option<GreekCombiningCharacterSequenceDiacritics> {
        greek_to_me::preceding_greek_vowel_diacritics(self.before)
    }

    // Walks backwards over `OtherAccent` characters; true if the first
    // character of any other dot type is soft-dotted.
    fn preceded_by_soft_dotted(&self, mapping: &CaseMap) -> bool {
        let first_non_accent = self
            .before
            .chars()
            .rev()
            .map(|c| mapping.dot_type(c))
            .find(|dot| !matches!(dot, DotType::OtherAccent));
        matches!(first_non_accent, Some(DotType::SoftDotted))
    }
    /// Checks if the preceding character is a capital I, allowing for non-Above combining characters in between.
750 /// 751 /// If I_MUST_NOT_START_STRING is true, additionally will require that the capital I does not start the string preceded_by_capital_i<const I_MUST_NOT_START_STRING: bool>( &self, mapping: &CaseMap, ) -> bool752 fn preceded_by_capital_i<const I_MUST_NOT_START_STRING: bool>( 753 &self, 754 mapping: &CaseMap, 755 ) -> bool { 756 let mut iter = self.before.chars().rev(); 757 while let Some(c) = iter.next() { 758 if c == 'I' { 759 if I_MUST_NOT_START_STRING { 760 return iter.next().is_some(); 761 } else { 762 return true; 763 } 764 } 765 if mapping.dot_type(c) != DotType::OtherAccent { 766 break; 767 } 768 } 769 false 770 } preceded_by_cased_letter(&self, mapping: &CaseMap) -> bool771 fn preceded_by_cased_letter(&self, mapping: &CaseMap) -> bool { 772 for c in self.before.chars().rev() { 773 let data = mapping.lookup_data(c); 774 if !data.is_ignorable() { 775 return data.case_type().is_some(); 776 } 777 } 778 false 779 } followed_by_cased_letter(&self, mapping: &CaseMap) -> bool780 fn followed_by_cased_letter(&self, mapping: &CaseMap) -> bool { 781 for c in self.after.chars() { 782 let data = mapping.lookup_data(c); 783 if !data.is_ignorable() { 784 return data.case_type().is_some(); 785 } 786 } 787 false 788 } followed_by_more_above(&self, mapping: &CaseMap) -> bool789 fn followed_by_more_above(&self, mapping: &CaseMap) -> bool { 790 for c in self.after.chars() { 791 match mapping.dot_type(c) { 792 DotType::Above => return true, 793 DotType::OtherAccent => continue, 794 _ => return false, 795 } 796 } 797 false 798 } followed_by_dot_above(&self, mapping: &CaseMap) -> bool799 fn followed_by_dot_above(&self, mapping: &CaseMap) -> bool { 800 for c in self.after.chars() { 801 if c == '\u{307}' { 802 return true; 803 } 804 if mapping.dot_type(c) != DotType::OtherAccent { 805 return false; 806 } 807 } 808 false 809 } 810 811 /// Checks the preceding and surrounding context of a j or J 812 /// and returns true if it is preceded by an i or I at the start of the 
string. 813 /// If one has an acute accent, 814 /// both must have the accent for this to return true. No other accents are handled. is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMap) -> bool815 fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMap) -> bool { 816 let mut before = self.before.chars().rev(); 817 let mut i_has_acute = false; 818 loop { 819 match before.next() { 820 Some('i') | Some('I') => break, 821 Some('í') | Some('Í') => { 822 i_has_acute = true; 823 break; 824 } 825 Some(ACUTE) => i_has_acute = true, 826 _ => return false, 827 } 828 } 829 830 if before.next().is_some() { 831 // not at the beginning of a string, doesn't matter 832 return false; 833 } 834 let mut j_has_acute = false; 835 for c in self.after.chars() { 836 if c == ACUTE { 837 j_has_acute = true; 838 continue; 839 } 840 // We are supposed to check that `j` has no other combining marks aside 841 // from potentially an acute accent. Once we hit the first non-combining mark 842 // we are done. 843 // 844 // ICU4C checks for `gc=Mn` to determine if something is a combining mark, 845 // however this requires extra data (and is the *only* point in the casemapping algorithm 846 // where there is a direct dependency on properties data not mediated by the casemapping data trie). 847 // 848 // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does. 849 // 850 // See https://unicode-org.atlassian.net/browse/ICU-22429 851 match mapping.dot_type(c) { 852 // Not a combining character; ccc = 0 853 DotType::NoDot | DotType::SoftDotted => break, 854 // found combining character, bail 855 _ => return false, 856 } 857 } 858 859 // either both should have an acute accent, or none. this is an XNOR operation 860 !(j_has_acute ^ i_has_acute) 861 } 862 } 863