1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 use crate::internals::{CaseMapLocale, FoldOptions, FullCaseWriteable, StringAndWriteable}; 6 use crate::provider::data::MappingKind; 7 use crate::provider::CaseMap; 8 use crate::provider::CaseMapV1; 9 use crate::set::ClosureSink; 10 use crate::titlecase::{LeadingAdjustment, TitlecaseOptions, TrailingCase}; 11 use alloc::string::String; 12 use icu_locale_core::LanguageIdentifier; 13 use icu_provider::prelude::*; 14 use writeable::Writeable; 15 16 /// A struct with the ability to convert characters and strings to uppercase or lowercase, 17 /// or fold them to a normalized form for case-insensitive comparison. 18 /// 19 /// Most methods for this type live on [`CaseMapperBorrowed`], which you can obtain via 20 /// [`CaseMapper::new()`] or [`CaseMapper::as_borrowed()`]. 21 /// 22 /// # Examples 23 /// 24 /// ```rust 25 /// use icu::casemap::CaseMapper; 26 /// use icu::locale::langid; 27 /// 28 /// let cm = CaseMapper::new(); 29 /// 30 /// assert_eq!( 31 /// cm.uppercase_to_string("hello world", &langid!("und")), 32 /// "HELLO WORLD" 33 /// ); 34 /// assert_eq!( 35 /// cm.lowercase_to_string("Γειά σου Κόσμε", &langid!("und")), 36 /// "γειά σου κόσμε" 37 /// ); 38 /// ``` 39 #[derive(Clone, Debug)] 40 pub struct CaseMapper { 41 pub(crate) data: DataPayload<CaseMapV1>, 42 } 43 44 impl AsRef<CaseMapper> for CaseMapper { as_ref(&self) -> &CaseMapper45 fn as_ref(&self) -> &CaseMapper { 46 self 47 } 48 } 49 50 /// A struct with the ability to convert characters and strings to uppercase or lowercase, 51 /// or fold them to a normalized form for case-insensitive comparison, borrowed version. 52 /// 53 /// See methods or [`CaseMapper`] for examples. 54 #[derive(Clone, Debug, Copy)] 55 pub struct CaseMapperBorrowed<'a> { 56 pub(crate) data: &'a CaseMap<'a>, 57 } 58 59 impl CaseMapperBorrowed<'static> { 60 /// Cheaply converts a [`CaseMapperBorrowed<'static>`] into a [`CaseMapper`]. 61 /// 62 /// Note: Due to branching and indirection, using [`CaseMapper`] might inhibit some 63 /// compile-time optimizations that are possible with [`CaseMapperBorrowed`]. static_to_owned(self) -> CaseMapper64 pub const fn static_to_owned(self) -> CaseMapper { 65 CaseMapper { 66 data: DataPayload::from_static_ref(self.data), 67 } 68 } 69 /// Creates a [`CaseMapperBorrowed`] using compiled data. 70 /// 71 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 72 /// 73 /// [ Help choosing a constructor](icu_provider::constructors) 74 /// 75 /// # Examples 76 /// 77 /// ```rust 78 /// use icu::casemap::CaseMapper; 79 /// use icu::locale::langid; 80 /// 81 /// let cm = CaseMapper::new(); 82 /// 83 /// assert_eq!( 84 /// cm.uppercase_to_string("hello world", &langid!("und")), 85 /// "HELLO WORLD" 86 /// ); 87 /// ``` 88 #[cfg(feature = "compiled_data")] new() -> Self89 pub const fn new() -> Self { 90 Self { 91 data: crate::provider::Baked::SINGLETON_CASE_MAP_V1, 92 } 93 } 94 } 95 96 #[cfg(feature = "compiled_data")] 97 impl Default for CaseMapperBorrowed<'static> { default() -> Self98 fn default() -> Self { 99 Self::new() 100 } 101 } 102 103 impl<'a> CaseMapperBorrowed<'a> { 104 /// Returns the full lowercase mapping of the given string as a [`Writeable`]. 105 /// This function is context and language sensitive. Callers should pass the text's language 106 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or 107 /// `Default::default()` for the root locale. 108 /// 109 /// See [`Self::lowercase_to_string()`] for the equivalent convenience function that returns a String, 110 /// as well as for an example. lowercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a111 pub fn lowercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a { 112 self.data.full_helper_writeable::<false>( 113 src, 114 CaseMapLocale::from_langid(langid), 115 MappingKind::Lower, 116 TrailingCase::default(), 117 ) 118 } 119 120 /// Returns the full uppercase mapping of the given string as a [`Writeable`]. 121 /// This function is context and language sensitive. Callers should pass the text's language 122 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or 123 /// `Default::default()` for the root locale. 124 /// 125 /// See [`Self::uppercase_to_string()`] for the equivalent convenience function that returns a String, 126 /// as well as for an example. uppercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a127 pub fn uppercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a { 128 self.data.full_helper_writeable::<false>( 129 src, 130 CaseMapLocale::from_langid(langid), 131 MappingKind::Upper, 132 TrailingCase::default(), 133 ) 134 } 135 136 /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating 137 /// the string as a single segment (and thus only titlecasing the beginning of it). Performs 138 /// the specified leading adjustment behavior from the options without loading additional data. 139 /// 140 /// This should typically be used as a lower-level helper to construct the titlecasing operation desired 141 /// by the application, for example one can titlecase on a per-word basis by mixing this with 142 /// a `WordSegmenter`. 143 /// 144 /// This function is context and language sensitive. Callers should pass the text's language 145 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or 146 /// `Default::default()` for the root locale. 147 /// 148 /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`] 149 /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load 150 /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See 151 /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between 152 /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the head adjustment mode 153 /// is [`LeadingAdjustment::None`]. 154 /// 155 /// See [`Self::titlecase_segment_with_only_case_data_to_string()`] for the equivalent convenience function that returns a String, 156 /// as well as for an example. 157 /// 158 /// [`TitlecaseMapper`]: crate::TitlecaseMapper titlecase_segment_with_only_case_data( self, src: &'a str, langid: &LanguageIdentifier, options: TitlecaseOptions, ) -> impl Writeable + 'a159 pub fn titlecase_segment_with_only_case_data( 160 self, 161 src: &'a str, 162 langid: &LanguageIdentifier, 163 options: TitlecaseOptions, 164 ) -> impl Writeable + 'a { 165 self.titlecase_segment_with_adjustment(src, langid, options, |data, ch| data.is_cased(ch)) 166 } 167 168 /// Helper to support different leading adjustment behaviors, 169 /// `char_is_lead` is a function that returns true for a character that is allowed to be the 170 /// first relevant character in a titlecasing string, when `leading_adjustment != None` 171 /// 172 /// We return a concrete type instead of `impl Trait` so the return value can be mixed with that of other calls 173 /// to this function with different closures titlecase_segment_with_adjustment( self, src: &'a str, langid: &LanguageIdentifier, options: TitlecaseOptions, char_is_lead: impl Fn(&CaseMap, char) -> bool, ) -> StringAndWriteable<'a, FullCaseWriteable<'a, true>>174 pub(crate) fn titlecase_segment_with_adjustment( 175 self, 176 src: &'a str, 177 langid: &LanguageIdentifier, 178 options: TitlecaseOptions, 179 char_is_lead: impl Fn(&CaseMap, char) -> bool, 180 ) -> StringAndWriteable<'a, FullCaseWriteable<'a, true>> { 181 let (head, rest) = match options.leading_adjustment.unwrap_or_default() { 182 LeadingAdjustment::Auto | LeadingAdjustment::ToCased => { 183 let first_cased = src 184 .char_indices() 185 .find(|(_i, ch)| char_is_lead(self.data, *ch)); 186 if let Some((first_cased, _ch)) = first_cased { 187 ( 188 src.get(..first_cased).unwrap_or(""), 189 src.get(first_cased..).unwrap_or(""), 190 ) 191 } else { 192 (src, "") 193 } 194 } 195 LeadingAdjustment::None => ("", src), 196 }; 197 let writeable = self.data.full_helper_writeable::<true>( 198 rest, 199 CaseMapLocale::from_langid(langid), 200 MappingKind::Title, 201 options.trailing_case.unwrap_or_default(), 202 ); 203 StringAndWriteable { 204 string: head, 205 writeable, 206 } 207 } 208 /// Case-folds the characters in the given string as a [`Writeable`]. 209 /// This function is locale-independent and context-insensitive. 210 /// 211 /// Can be used to test if two strings are case-insensitively equivalent. 212 /// 213 /// See [`Self::fold_string()`] for the equivalent convenience function that returns a String, 214 /// as well as for an example. fold(self, src: &'a str) -> impl Writeable + 'a215 pub fn fold(self, src: &'a str) -> impl Writeable + 'a { 216 self.data.full_helper_writeable::<false>( 217 src, 218 CaseMapLocale::Root, 219 MappingKind::Fold, 220 TrailingCase::default(), 221 ) 222 } 223 224 /// Case-folds the characters in the given string as a [`Writeable`], 225 /// using Turkic (T) mappings for dotted/dotless I. 226 /// This function is locale-independent and context-insensitive. 227 /// 228 /// Can be used to test if two strings are case-insensitively equivalent. 229 /// 230 /// See [`Self::fold_turkic_string()`] for the equivalent convenience function that returns a String, 231 /// as well as for an example. fold_turkic(self, src: &'a str) -> impl Writeable + 'a232 pub fn fold_turkic(self, src: &'a str) -> impl Writeable + 'a { 233 self.data.full_helper_writeable::<false>( 234 src, 235 CaseMapLocale::Turkish, 236 MappingKind::Fold, 237 TrailingCase::default(), 238 ) 239 } 240 241 /// Returns the full lowercase mapping of the given string as a String. 242 /// 243 /// This function is context and language sensitive. Callers should pass the text's language 244 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or 245 /// `Default::default()` for the root locale. 246 /// 247 /// See [`Self::lowercase()`] for the equivalent lower-level function that returns a [`Writeable`] 248 /// 249 /// # Examples 250 /// 251 /// ```rust 252 /// use icu::casemap::CaseMapper; 253 /// use icu::locale::langid; 254 /// 255 /// let cm = CaseMapper::new(); 256 /// let root = langid!("und"); 257 /// 258 /// assert_eq!(cm.lowercase_to_string("hEllO WorLd", &root), "hello world"); 259 /// assert_eq!(cm.lowercase_to_string("Γειά σου Κόσμε", &root), "γειά σου κόσμε"); 260 /// assert_eq!(cm.lowercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया"); 261 /// assert_eq!(cm.lowercase_to_string("Привет мир", &root), "привет мир"); 262 /// 263 /// // Some behavior is language-sensitive 264 /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &root), "constantinople"); 265 /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &langid!("tr")), "constantınople"); 266 /// ``` lowercase_to_string(self, src: &str, langid: &LanguageIdentifier) -> String267 pub fn lowercase_to_string(self, src: &str, langid: &LanguageIdentifier) -> String { 268 self.lowercase(src, langid).write_to_string().into_owned() 269 } 270 271 /// Returns the full uppercase mapping of the given string as a String. 272 /// 273 /// This function is context and language sensitive. Callers should pass the text's language 274 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or 275 /// `Default::default()` for the root locale. 276 /// 277 /// See [`Self::uppercase()`] for the equivalent lower-level function that returns a [`Writeable`] 278 /// 279 /// # Examples 280 /// 281 /// ```rust 282 /// use icu::casemap::CaseMapper; 283 /// use icu::locale::langid; 284 /// 285 /// let cm = CaseMapper::new(); 286 /// let root = langid!("und"); 287 /// 288 /// assert_eq!(cm.uppercase_to_string("hEllO WorLd", &root), "HELLO WORLD"); 289 /// assert_eq!(cm.uppercase_to_string("Γειά σου Κόσμε", &root), "ΓΕΙΆ ΣΟΥ ΚΌΣΜΕ"); 290 /// assert_eq!(cm.uppercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया"); 291 /// assert_eq!(cm.uppercase_to_string("Привет мир", &root), "ПРИВЕТ МИР"); 292 /// 293 /// // Some behavior is language-sensitive 294 /// assert_eq!(cm.uppercase_to_string("istanbul", &root), "ISTANBUL"); 295 /// assert_eq!(cm.uppercase_to_string("istanbul", &langid!("tr")), "İSTANBUL"); // Turkish dotted i 296 /// 297 /// assert_eq!(cm.uppercase_to_string("և Երևանի", &root), "ԵՒ ԵՐԵՒԱՆԻ"); 298 /// assert_eq!(cm.uppercase_to_string("և Երևանի", &langid!("hy")), "ԵՎ ԵՐԵՎԱՆԻ"); // Eastern Armenian ech-yiwn ligature 299 /// ``` uppercase_to_string(self, src: &str, langid: &LanguageIdentifier) -> String300 pub fn uppercase_to_string(self, src: &str, langid: &LanguageIdentifier) -> String { 301 self.uppercase(src, langid).write_to_string().into_owned() 302 } 303 304 /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating 305 /// the string as a single segment (and thus only titlecasing the beginning of it). Performs 306 /// the specified leading adjustment behavior from the options without loading additional data. 307 /// 308 /// Note that [`TitlecaseMapper`] has better behavior, most users should consider using 309 /// it instead. This method primarily exists for people who care about the amount of data being loaded. 310 /// 311 /// This should typically be used as a lower-level helper to construct the titlecasing operation desired 312 /// by the application, for example one can titlecase on a per-word basis by mixing this with 313 /// a `WordSegmenter`. 314 /// 315 /// This function is context and language sensitive. Callers should pass the text's language 316 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or 317 /// `Default::default()` for the root locale. 318 /// 319 /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`] 320 /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load 321 /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See 322 /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between 323 /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the head adjustment mode 324 /// is [`LeadingAdjustment::None`]. 325 /// 326 /// See [`Self::titlecase_segment_with_only_case_data()`] for the equivalent lower-level function that returns a [`Writeable`] 327 /// 328 /// # Examples 329 /// 330 /// ```rust 331 /// use icu::casemap::CaseMapper; 332 /// use icu::locale::langid; 333 /// 334 /// let cm = CaseMapper::new(); 335 /// let root = langid!("und"); 336 /// 337 /// let default_options = Default::default(); 338 /// 339 /// // note that the subsequent words are not titlecased, this function assumes 340 /// // that the entire string is a single segment and only titlecases at the beginning. 341 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("hEllO WorLd", &root, default_options), "Hello world"); 342 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε"); 343 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया"); 344 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Привет мир", &root, default_options), "Привет мир"); 345 /// 346 /// // Some behavior is language-sensitive 347 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &root, default_options), "Istanbul"); 348 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i 349 /// 350 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &root, default_options), "Եւ երևանի"); 351 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature 352 /// 353 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &root, default_options), "Ijkdijk"); 354 /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph 355 /// ``` 356 /// 357 /// [`TitlecaseMapper`]: crate::TitlecaseMapper titlecase_segment_with_only_case_data_to_string( self, src: &str, langid: &LanguageIdentifier, options: TitlecaseOptions, ) -> String358 pub fn titlecase_segment_with_only_case_data_to_string( 359 self, 360 src: &str, 361 langid: &LanguageIdentifier, 362 options: TitlecaseOptions, 363 ) -> String { 364 self.titlecase_segment_with_only_case_data(src, langid, options) 365 .write_to_string() 366 .into_owned() 367 } 368 369 /// Case-folds the characters in the given string as a String. 370 /// This function is locale-independent and context-insensitive. 371 /// 372 /// Can be used to test if two strings are case-insensitively equivalent. 373 /// 374 /// See [`Self::fold()`] for the equivalent lower-level function that returns a [`Writeable`] 375 ///s s 376 /// # Examples 377 /// 378 /// ```rust 379 /// use icu::casemap::CaseMapper; 380 /// 381 /// let cm = CaseMapper::new(); 382 /// 383 /// // Check if two strings are equivalent case insensitively 384 /// assert_eq!(cm.fold_string("hEllO WorLd"), cm.fold_string("HELLO worlD")); 385 /// 386 /// assert_eq!(cm.fold_string("hEllO WorLd"), "hello world"); 387 /// assert_eq!(cm.fold_string("Γειά σου Κόσμε"), "γειά σου κόσμε"); 388 /// assert_eq!(cm.fold_string("नमस्ते दुनिया"), "नमस्ते दुनिया"); 389 /// assert_eq!(cm.fold_string("Привет мир"), "привет мир"); 390 /// ``` fold_string(self, src: &str) -> String391 pub fn fold_string(self, src: &str) -> String { 392 self.fold(src).write_to_string().into_owned() 393 } 394 395 /// Case-folds the characters in the given string as a String, 396 /// using Turkic (T) mappings for dotted/dotless I. 397 /// This function is locale-independent and context-insensitive. 398 /// 399 /// Can be used to test if two strings are case-insensitively equivalent. 400 /// 401 /// See [`Self::fold_turkic()`] for the equivalent lower-level function that returns a [`Writeable`] 402 /// 403 /// # Examples 404 /// 405 /// ```rust 406 /// use icu::casemap::CaseMapper; 407 /// 408 /// let cm = CaseMapper::new(); 409 /// 410 /// // Check if two strings are equivalent case insensitively 411 /// assert_eq!(cm.fold_turkic_string("İstanbul"), cm.fold_turkic_string("iSTANBUL")); 412 /// 413 /// assert_eq!(cm.fold_turkic_string("İstanbul not Constantinople"), "istanbul not constantinople"); 414 /// assert_eq!(cm.fold_turkic_string("Istanbul not Constantınople"), "ıstanbul not constantınople"); 415 /// 416 /// assert_eq!(cm.fold_turkic_string("hEllO WorLd"), "hello world"); 417 /// assert_eq!(cm.fold_turkic_string("Γειά σου Κόσμε"), "γειά σου κόσμε"); 418 /// assert_eq!(cm.fold_turkic_string("नमस्ते दुनिया"), "नमस्ते दुनिया"); 419 /// assert_eq!(cm.fold_turkic_string("Привет мир"), "привет мир"); 420 /// ``` fold_turkic_string(self, src: &str) -> String421 pub fn fold_turkic_string(self, src: &str) -> String { 422 self.fold_turkic(src).write_to_string().into_owned() 423 } 424 425 /// Adds all simple case mappings and the full case folding for `c` to `set`. 426 /// Also adds special case closure mappings. 427 /// 428 /// Identical to [`CaseMapCloserBorrowed::add_case_closure_to()`], see docs there for more information. 429 /// This method is duplicated so that one does not need to load extra unfold data 430 /// if they only need this and not also [`CaseMapCloserBorrowed::add_string_case_closure_to()`]. 431 /// 432 /// 433 /// # Examples 434 /// 435 /// ```rust 436 /// use icu::casemap::CaseMapper; 437 /// use icu::collections::codepointinvlist::CodePointInversionListBuilder; 438 /// 439 /// let cm = CaseMapper::new(); 440 /// let mut builder = CodePointInversionListBuilder::new(); 441 /// cm.add_case_closure_to('s', &mut builder); 442 /// 443 /// let set = builder.build(); 444 /// 445 /// assert!(set.contains('S')); 446 /// assert!(set.contains('ſ')); 447 /// assert!(!set.contains('s')); // does not contain itself 448 /// ``` 449 /// 450 /// [`CaseMapCloserBorrowed::add_case_closure_to()`]: crate::CaseMapCloserBorrowed::add_case_closure_to 451 /// [`CaseMapCloserBorrowed::add_string_case_closure_to()`]: crate::CaseMapCloserBorrowed::add_string_case_closure_to add_case_closure_to<S: ClosureSink>(self, c: char, set: &mut S)452 pub fn add_case_closure_to<S: ClosureSink>(self, c: char, set: &mut S) { 453 self.data.add_case_closure_to(c, set); 454 } 455 456 /// Returns the lowercase mapping of the given `char`. 457 /// This function only implements simple and common mappings. Full mappings, 458 /// which can map one `char` to a string, are not included. 459 /// For full mappings, use [`CaseMapperBorrowed::lowercase`]. 460 /// 461 /// # Examples 462 /// 463 /// ```rust 464 /// use icu::casemap::CaseMapper; 465 /// 466 /// let cm = CaseMapper::new(); 467 /// 468 /// assert_eq!(cm.simple_lowercase('C'), 'c'); 469 /// assert_eq!(cm.simple_lowercase('c'), 'c'); 470 /// assert_eq!(cm.simple_lowercase('Ć'), 'ć'); 471 /// assert_eq!(cm.simple_lowercase('Γ'), 'γ'); 472 /// ``` simple_lowercase(self, c: char) -> char473 pub fn simple_lowercase(self, c: char) -> char { 474 self.data.simple_lower(c) 475 } 476 477 /// Returns the uppercase mapping of the given `char`. 478 /// This function only implements simple and common mappings. Full mappings, 479 /// which can map one `char` to a string, are not included. 480 /// For full mappings, use [`CaseMapperBorrowed::uppercase`]. 481 /// 482 /// # Examples 483 /// 484 /// ```rust 485 /// use icu::casemap::CaseMapper; 486 /// 487 /// let cm = CaseMapper::new(); 488 /// 489 /// assert_eq!(cm.simple_uppercase('c'), 'C'); 490 /// assert_eq!(cm.simple_uppercase('C'), 'C'); 491 /// assert_eq!(cm.simple_uppercase('ć'), 'Ć'); 492 /// assert_eq!(cm.simple_uppercase('γ'), 'Γ'); 493 /// 494 /// assert_eq!(cm.simple_uppercase('dz'), 'DZ'); 495 /// ``` simple_uppercase(self, c: char) -> char496 pub fn simple_uppercase(self, c: char) -> char { 497 self.data.simple_upper(c) 498 } 499 500 /// Returns the titlecase mapping of the given `char`. 501 /// This function only implements simple and common mappings. Full mappings, 502 /// which can map one `char` to a string, are not included. 503 /// 504 /// # Examples 505 /// 506 /// ```rust 507 /// use icu::casemap::CaseMapper; 508 /// 509 /// let cm = CaseMapper::new(); 510 /// 511 /// assert_eq!(cm.simple_titlecase('dz'), 'Dz'); 512 /// 513 /// assert_eq!(cm.simple_titlecase('c'), 'C'); 514 /// assert_eq!(cm.simple_titlecase('C'), 'C'); 515 /// assert_eq!(cm.simple_titlecase('ć'), 'Ć'); 516 /// assert_eq!(cm.simple_titlecase('γ'), 'Γ'); 517 /// ``` simple_titlecase(self, c: char) -> char518 pub fn simple_titlecase(self, c: char) -> char { 519 self.data.simple_title(c) 520 } 521 522 /// Returns the simple case folding of the given char. 523 /// For full mappings, use [`CaseMapperBorrowed::fold`]. 524 /// 525 /// This function can be used to perform caseless matches on 526 /// individual characters. 527 /// > *Note:* With Unicode 15.0 data, there are three 528 /// > pairs of characters for which equivalence under this 529 /// > function is inconsistent with equivalence of the 530 /// > one-character strings under [`CaseMapperBorrowed::fold`]. 531 /// > This is resolved in Unicode 15.1 and later. 532 /// 533 /// For compatibility applications where simple case folding 534 /// of strings is required, this function can be applied to 535 /// each character of a string. Note that the resulting 536 /// equivalence relation is different from that obtained 537 /// by [`CaseMapperBorrowed::fold`]: 538 /// The strings "Straße" and "STRASSE" are distinct 539 /// under simple case folding, but are equivalent under 540 /// default (full) case folding. 541 /// 542 /// # Examples 543 /// 544 /// ```rust 545 /// use icu::casemap::CaseMapper; 546 /// 547 /// let cm = CaseMapper::new(); 548 /// 549 /// // perform case insensitive checks 550 /// assert_eq!(cm.simple_fold('σ'), cm.simple_fold('ς')); 551 /// assert_eq!(cm.simple_fold('Σ'), cm.simple_fold('ς')); 552 /// 553 /// assert_eq!(cm.simple_fold('c'), 'c'); 554 /// assert_eq!(cm.simple_fold('Ć'), 'ć'); 555 /// assert_eq!(cm.simple_fold('Γ'), 'γ'); 556 /// assert_eq!(cm.simple_fold('ς'), 'σ'); 557 /// 558 /// assert_eq!(cm.simple_fold('ß'), 'ß'); 559 /// assert_eq!(cm.simple_fold('I'), 'i'); 560 /// assert_eq!(cm.simple_fold('İ'), 'İ'); 561 /// assert_eq!(cm.simple_fold('ı'), 'ı'); 562 /// ``` simple_fold(self, c: char) -> char563 pub fn simple_fold(self, c: char) -> char { 564 self.data.simple_fold(c, FoldOptions::default()) 565 } 566 567 /// Returns the simple case folding of the given char, using Turkic (T) mappings for 568 /// dotted/dotless i. This function does not fold `i` and `I` to the same character. Instead, 569 /// `I` will fold to `ı`, and `İ` will fold to `i`. Otherwise, this is the same as 570 /// [`CaseMapperBorrowed::fold()`]. 571 /// 572 /// You can use the case folding to perform Turkic caseless matches on characters 573 /// provided they don't full-casefold to strings. To avoid that situation, 574 /// convert to a string and use [`CaseMapperBorrowed::fold_turkic`]. 575 /// 576 /// 577 /// # Examples 578 /// 579 /// ```rust 580 /// use icu::casemap::CaseMapper; 581 /// 582 /// let cm = CaseMapper::new(); 583 /// 584 /// assert_eq!(cm.simple_fold_turkic('I'), 'ı'); 585 /// assert_eq!(cm.simple_fold_turkic('İ'), 'i'); 586 /// ``` simple_fold_turkic(self, c: char) -> char587 pub fn simple_fold_turkic(self, c: char) -> char { 588 self.data 589 .simple_fold(c, FoldOptions::with_turkic_mappings()) 590 } 591 } 592 593 impl CaseMapper { 594 /// Creates a [`CaseMapperBorrowed`] using compiled data. 595 /// 596 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 597 /// 598 /// [ Help choosing a constructor](icu_provider::constructors) 599 /// 600 /// # Examples 601 /// 602 /// ```rust 603 /// use icu::casemap::CaseMapper; 604 /// use icu::locale::langid; 605 /// 606 /// let cm = CaseMapper::new(); 607 /// 608 /// assert_eq!( 609 /// cm.uppercase_to_string("hello world", &langid!("und")), 610 /// "HELLO WORLD" 611 /// ); 612 /// ``` 613 #[cfg(feature = "compiled_data")] 614 #[allow(clippy::new_ret_no_self)] // Intentional new() -> CaseMapperBorrowed<'static>615 pub const fn new() -> CaseMapperBorrowed<'static> { 616 CaseMapperBorrowed::new() 617 } 618 619 /// Constructs a borrowed version of this type for more efficient querying. as_borrowed(&self) -> CaseMapperBorrowed<'_>620 pub fn as_borrowed(&self) -> CaseMapperBorrowed<'_> { 621 CaseMapperBorrowed { 622 data: self.data.get(), 623 } 624 } 625 626 icu_provider::gen_buffer_data_constructors!(() -> error: DataError, 627 functions: [ 628 new: skip, 629 try_new_with_buffer_provider, 630 try_new_unstable, 631 Self, 632 ]); 633 634 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] try_new_unstable<P>(provider: &P) -> Result<CaseMapper, DataError> where P: DataProvider<CaseMapV1> + ?Sized,635 pub fn try_new_unstable<P>(provider: &P) -> Result<CaseMapper, DataError> 636 where 637 P: DataProvider<CaseMapV1> + ?Sized, 638 { 639 let data = provider.load(Default::default())?.payload; 640 Ok(Self { data }) 641 } 642 } 643 644 #[cfg(test)] 645 mod tests { 646 use super::*; 647 use icu_locale_core::langid; 648 649 #[test] 650 /// Tests for SpecialCasing.txt. Some of the special cases are data-driven, some are code-driven test_special_cases()651 fn test_special_cases() { 652 let cm = CaseMapper::new(); 653 let root = langid!("und"); 654 let default_options = Default::default(); 655 656 // Ligatures 657 658 // U+FB00 LATIN SMALL LIGATURE FF 659 assert_eq!(cm.uppercase_to_string("ff", &root), "FF"); 660 // U+FB05 LATIN SMALL LIGATURE LONG S T 661 assert_eq!(cm.uppercase_to_string("ſt", &root), "ST"); 662 663 // No corresponding uppercased character 664 665 // U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE 666 assert_eq!(cm.uppercase_to_string("ʼn", &root), "ʼN"); 667 668 // U+1F50 GREEK SMALL LETTER UPSILON WITH PSILI 669 assert_eq!(cm.uppercase_to_string("ὐ", &root), "Υ̓"); 670 // U+1FF6 GREEK SMALL LETTER OMEGA WITH PERISPOMENI 671 assert_eq!(cm.uppercase_to_string("ῶ", &root), "Ω͂"); 672 673 // YPOGEGRAMMENI / PROSGEGRAMMENI special cases 674 675 // E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA> 676 assert_eq!( 677 cm.uppercase_to_string("α\u{0313}\u{0345}", &root), 678 "Α\u{0313}Ι" 679 ); 680 // but the YPOGEGRAMMENI should not titlecase 681 assert_eq!( 682 cm.titlecase_segment_with_only_case_data_to_string( 683 "α\u{0313}\u{0345}", 684 &root, 685 default_options 686 ), 687 "Α\u{0313}\u{0345}" 688 ); 689 690 // U+1F80 GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI 691 assert_eq!( 692 cm.titlecase_segment_with_only_case_data_to_string("ᾀ", &root, default_options), 693 "ᾈ" 694 ); 695 assert_eq!(cm.uppercase_to_string("ᾀ", &root), "ἈΙ"); 696 697 // U+1FFC GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 698 assert_eq!(cm.lowercase_to_string("ῼ", &root), "ῳ"); 699 assert_eq!( 700 cm.titlecase_segment_with_only_case_data_to_string("ῼ", &root, default_options), 701 "ῼ" 702 ); 703 assert_eq!(cm.uppercase_to_string("ῼ", &root), "ΩΙ"); 704 705 // U+1F98 GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI 706 assert_eq!(cm.lowercase_to_string("ᾘ", &root), "ᾐ"); 707 assert_eq!( 708 cm.titlecase_segment_with_only_case_data_to_string("ᾘ", &root, default_options), 709 "ᾘ" 710 ); 711 assert_eq!(cm.uppercase_to_string("ᾘ", &root), "ἨΙ"); 712 713 // U+1FB2 GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI 714 assert_eq!(cm.lowercase_to_string("ᾲ", &root), "ᾲ"); 715 assert_eq!( 716 cm.titlecase_segment_with_only_case_data_to_string("ᾲ", &root, default_options), 717 "Ὰ\u{345}" 718 ); 719 assert_eq!(cm.uppercase_to_string("ᾲ", &root), "ᾺΙ"); 720 721 // Final sigma test 722 // U+03A3 GREEK CAPITAL LETTER SIGMA in Final_Sigma context 723 assert_eq!(cm.lowercase_to_string("ΙΙΙΣ", &root), "ιιις"); 724 725 // Turkish / Azeri 726 let tr = langid!("tr"); 727 let az = langid!("az"); 728 // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE 729 assert_eq!(cm.lowercase_to_string("İ", &tr), "i"); 730 assert_eq!(cm.lowercase_to_string("İ", &az), "i"); 731 assert_eq!( 732 cm.titlecase_segment_with_only_case_data_to_string("İ", &tr, default_options), 733 "İ" 734 ); 735 assert_eq!( 736 cm.titlecase_segment_with_only_case_data_to_string("İ", &az, default_options), 737 "İ" 738 ); 739 assert_eq!(cm.uppercase_to_string("İ", &tr), "İ"); 740 assert_eq!(cm.uppercase_to_string("İ", &az), "İ"); 741 742 // U+0049 LATIN CAPITAL LETTER I and U+0307 COMBINING DOT ABOVE 743 assert_eq!(cm.lowercase_to_string("I\u{0307}", &tr), "i"); 744 assert_eq!(cm.lowercase_to_string("I\u{0307}", &az), "i"); 745 assert_eq!( 746 cm.titlecase_segment_with_only_case_data_to_string("I\u{0307}", &tr, default_options), 747 "I\u{0307}" 748 ); 749 assert_eq!( 750 cm.titlecase_segment_with_only_case_data_to_string("I\u{0307}", &az, default_options), 751 "I\u{0307}" 752 ); 753 assert_eq!(cm.uppercase_to_string("I\u{0307}", &tr), "I\u{0307}"); 754 assert_eq!(cm.uppercase_to_string("I\u{0307}", &az), "I\u{0307}"); 755 756 // U+0049 LATIN CAPITAL LETTER I 757 assert_eq!(cm.lowercase_to_string("I", &tr), "ı"); 758 assert_eq!(cm.lowercase_to_string("I", &az), "ı"); 759 assert_eq!( 760 cm.titlecase_segment_with_only_case_data_to_string("I", &tr, default_options), 761 "I" 762 ); 763 assert_eq!( 764 cm.titlecase_segment_with_only_case_data_to_string("I", &az, default_options), 765 "I" 766 ); 767 assert_eq!(cm.uppercase_to_string("I", &tr), "I"); 768 assert_eq!(cm.uppercase_to_string("I", &az), "I"); 769 770 // U+0069 LATIN SMALL LETTER I 771 assert_eq!(cm.lowercase_to_string("i", &tr), "i"); 772 assert_eq!(cm.lowercase_to_string("i", &az), "i"); 773 assert_eq!( 774 cm.titlecase_segment_with_only_case_data_to_string("i", &tr, default_options), 775 "İ" 776 ); 777 assert_eq!( 778 cm.titlecase_segment_with_only_case_data_to_string("i", &az, default_options), 779 "İ" 780 ); 781 assert_eq!(cm.uppercase_to_string("i", &tr), "İ"); 782 assert_eq!(cm.uppercase_to_string("i", &az), "İ"); 783 } 784 785 #[test] test_cherokee_case_folding()786 fn test_cherokee_case_folding() { 787 let case_mapping = CaseMapper::new(); 788 assert_eq!(case_mapping.simple_fold('Ꭰ'), 'Ꭰ'); 789 assert_eq!(case_mapping.simple_fold('ꭰ'), 'Ꭰ'); 790 assert_eq!(case_mapping.simple_fold_turkic('Ꭰ'), 'Ꭰ'); 791 assert_eq!(case_mapping.simple_fold_turkic('ꭰ'), 'Ꭰ'); 792 assert_eq!(case_mapping.fold_string("Ꭰ"), "Ꭰ"); 793 assert_eq!(case_mapping.fold_string("ꭰ"), "Ꭰ"); 794 assert_eq!(case_mapping.fold_turkic_string("Ꭰ"), "Ꭰ"); 795 assert_eq!(case_mapping.fold_turkic_string("ꭰ"), "Ꭰ"); 796 } 797 } 798