1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 //! Titlecasing-specific 6 use crate::provider::CaseMapV1; 7 use crate::{CaseMapper, CaseMapperBorrowed}; 8 use alloc::string::String; 9 use icu_locale_core::LanguageIdentifier; 10 use icu_properties::props::{GeneralCategory, GeneralCategoryGroup}; 11 use icu_properties::provider::GeneralCategoryV1; 12 use icu_properties::{CodePointMapData, CodePointMapDataBorrowed}; 13 use icu_provider::prelude::*; 14 use writeable::Writeable; 15 16 /// How to handle the rest of the string once the beginning of the 17 /// string has been titlecased. 18 /// 19 /// # Examples 20 /// 21 /// ```rust 22 /// use icu::casemap::options::{TitlecaseOptions, TrailingCase}; 23 /// use icu::casemap::TitlecaseMapper; 24 /// use icu::locale::langid; 25 /// 26 /// let cm = TitlecaseMapper::new(); 27 /// let root = langid!("und"); 28 /// 29 /// let default_options = Default::default(); 30 /// let mut preserve_case: TitlecaseOptions = Default::default(); 31 /// preserve_case.trailing_case = Some(TrailingCase::Unchanged); 32 /// 33 /// // Exhibits trailing case when set: 34 /// assert_eq!( 35 /// cm.titlecase_segment_to_string("spOngeBoB", &root, default_options), 36 /// "Spongebob" 37 /// ); 38 /// assert_eq!( 39 /// cm.titlecase_segment_to_string("spOngeBoB", &root, preserve_case), 40 /// "SpOngeBoB" 41 /// ); 42 /// ``` 43 #[non_exhaustive] 44 #[derive(Copy, Clone, Default, PartialEq, Eq, Hash, Debug)] 45 pub enum TrailingCase { 46 /// Preserve the casing of the rest of the string ("spoNgEBoB" -> "SpoNgEBoB") 47 Unchanged, 48 /// Lowercase the rest of the string ("spoNgEBoB" -> "Spongebob") 49 #[default] 50 Lower, 51 } 52 53 /// Where to start casing the string. 54 /// 55 /// [`TitlecaseMapper`] by default performs "leading adjustment", where it searches for the first "relevant" character 56 /// in the string before initializing the actual titlecasing. For example, it will skip punctuation at the beginning 57 /// of a string, allowing for strings like `'twas` or `«hello»` to be appropriately titlecased. 58 /// 59 /// Opinions on exactly what is a "relevant" character may differ. In "adjust to cased" mode the first cased character is considered "relevant", 60 /// whereas in the "auto" mode, it is the first character that is a letter, number, symbol, or private use character. This means 61 /// that the strings `49ers` and `«丰(abc)»` will titlecase in "adjust to cased" mode to `49Ers` and `«丰(Abc)»`, whereas in the "auto" mode they stay unchanged. 62 /// This difference largely matters for things that mix numbers and letters, or mix writing systems, within a single segment. 63 /// 64 /// # Examples 65 /// 66 /// ```rust 67 /// use icu::casemap::options::{LeadingAdjustment, TitlecaseOptions}; 68 /// use icu::casemap::TitlecaseMapper; 69 /// use icu::locale::langid; 70 /// 71 /// let cm = TitlecaseMapper::new(); 72 /// let root = langid!("und"); 73 /// 74 /// let default_options = Default::default(); // head adjustment set to Auto 75 /// let mut no_adjust: TitlecaseOptions = Default::default(); 76 /// let mut adjust_to_cased: TitlecaseOptions = Default::default(); 77 /// no_adjust.leading_adjustment = Some(LeadingAdjustment::None); 78 /// adjust_to_cased.leading_adjustment = Some(LeadingAdjustment::ToCased); 79 /// 80 /// // Exhibits leading adjustment when set: 81 /// assert_eq!( 82 /// cm.titlecase_segment_to_string("«hello»", &root, default_options), 83 /// "«Hello»" 84 /// ); 85 /// assert_eq!( 86 /// cm.titlecase_segment_to_string("«hello»", &root, adjust_to_cased), 87 /// "«Hello»" 88 /// ); 89 /// assert_eq!( 90 /// cm.titlecase_segment_to_string("«hello»", &root, no_adjust), 91 /// "«hello»" 92 /// ); 93 /// 94 /// // Only changed in adjust-to-cased mode: 95 /// assert_eq!( 96 /// cm.titlecase_segment_to_string("丰(abc)", &root, default_options), 97 /// "丰(abc)" 98 /// ); 99 /// assert_eq!( 100 /// cm.titlecase_segment_to_string("丰(abc)", &root, adjust_to_cased), 101 /// "丰(Abc)" 102 /// ); 103 /// assert_eq!( 104 /// cm.titlecase_segment_to_string("丰(abc)", &root, no_adjust), 105 /// "丰(abc)" 106 /// ); 107 /// 108 /// // Only changed in adjust-to-cased mode: 109 /// assert_eq!( 110 /// cm.titlecase_segment_to_string("49ers", &root, default_options), 111 /// "49ers" 112 /// ); 113 /// assert_eq!( 114 /// cm.titlecase_segment_to_string("49ers", &root, adjust_to_cased), 115 /// "49Ers" 116 /// ); 117 /// assert_eq!( 118 /// cm.titlecase_segment_to_string("49ers", &root, no_adjust), 119 /// "49ers" 120 /// ); 121 /// ``` 122 #[non_exhaustive] 123 #[derive(Copy, Clone, Default, PartialEq, Eq, Hash, Debug)] 124 pub enum LeadingAdjustment { 125 /// Start titlecasing immediately, even if the character is not one that is relevant for casing 126 /// ("'twixt" -> "'twixt", "twixt" -> "Twixt") 127 None, 128 /// Adjust the string to the first relevant character before beginning to apply casing 129 /// ("'twixt" -> "'Twixt"). "Relevant" character is picked by best available algorithm, 130 /// by default will adjust to first letter, number, symbol, or private use character, 131 /// but if no data is available (e.g. this API is being called via [`CaseMapperBorrowed::titlecase_segment_with_only_case_data()`]), 132 /// then may be equivalent to "adjust to cased". 133 /// 134 /// This is the default 135 #[default] 136 Auto, 137 /// Adjust the string to the first cased character before beginning to apply casing 138 /// ("'twixt" -> "'Twixt") 139 ToCased, 140 } 141 142 /// Various options for controlling titlecasing 143 /// 144 /// See docs of [`TitlecaseMapper`] for examples. 145 #[non_exhaustive] 146 #[derive(Copy, Clone, Default, PartialEq, Eq, Hash, Debug)] 147 pub struct TitlecaseOptions { 148 /// How to handle the rest of the string once the head of the 149 /// string has been titlecased 150 /// 151 /// Default is [`TrailingCase::Lower`] 152 pub trailing_case: Option<TrailingCase>, 153 /// Whether to start casing at the beginning of the string or at the first 154 /// relevant character. 155 /// 156 /// Default is [`LeadingAdjustment::Auto`] 157 pub leading_adjustment: Option<LeadingAdjustment>, 158 } 159 160 /// A wrapper around [`CaseMapper`] that can compute titlecasing stuff, and is able to load additional data 161 /// to support the non-legacy "head adjustment" behavior. 162 /// 163 /// 164 /// Most methods for this type live on [`TitlecaseMapperBorrowed`], which you can obtain via 165 /// [`TitlecaseMapper::new()`] or [`TitlecaseMapper::as_borrowed()`]. 166 /// 167 /// By default, [`TitlecaseMapperBorrowed::titlecase_segment()`] and [`TitlecaseMapperBorrowed::titlecase_segment_to_string()`] perform "leading adjustment", 168 /// where they wait till the first relevant character to begin titlecasing. For example, in the string `'twixt`, the apostrophe 169 /// is ignored because the word starts at the first "t", which will get titlecased (producing `'Twixt`). Other punctuation will 170 /// also be ignored, like in the string `«hello»`, which will get titlecased to `«Hello»`. 171 /// 172 /// This is a separate type from [`CaseMapper`] because it loads the additional data 173 /// required by [`LeadingAdjustment::Auto`] to perform the best possible leading adjustment. 174 /// 175 /// If you are planning on only using [`LeadingAdjustment::None`] or [`LeadingAdjustment::ToCased`], consider using [`CaseMapper`] directly; this 176 /// type will have no additional behavior. 177 /// 178 /// # Examples 179 /// 180 /// Basic casemapping behavior: 181 /// 182 /// ```rust 183 /// use icu::casemap::TitlecaseMapper; 184 /// use icu::locale::langid; 185 /// 186 /// let cm = TitlecaseMapper::new(); 187 /// let root = langid!("und"); 188 /// 189 /// let default_options = Default::default(); 190 /// 191 /// // note that the subsequent words are not titlecased, this function assumes 192 /// // that the entire string is a single segment and only titlecases at the beginning. 193 /// assert_eq!(cm.titlecase_segment_to_string("hEllO WorLd", &root, default_options), "Hello world"); 194 /// assert_eq!(cm.titlecase_segment_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε"); 195 /// assert_eq!(cm.titlecase_segment_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया"); 196 /// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root, default_options), "Привет мир"); 197 /// 198 /// // Some behavior is language-sensitive 199 /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root, default_options), "Istanbul"); 200 /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i 201 /// 202 /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root, default_options), "Եւ երևանի"); 203 /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature 204 /// 205 /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root, default_options), "Ijkdijk"); 206 /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph 207 /// ``` 208 #[derive(Clone, Debug)] 209 pub struct TitlecaseMapper<CM> { 210 cm: CM, 211 gc: CodePointMapData<GeneralCategory>, 212 } 213 214 impl TitlecaseMapper<CaseMapper> { 215 icu_provider::gen_buffer_data_constructors!(() -> error: DataError, 216 functions: [ 217 new: skip, 218 try_new_with_buffer_provider, 219 try_new_unstable, 220 Self, 221 ]); 222 223 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] try_new_unstable<P>(provider: &P) -> Result<Self, DataError> where P: DataProvider<CaseMapV1> + DataProvider<GeneralCategoryV1> + ?Sized,224 pub fn try_new_unstable<P>(provider: &P) -> Result<Self, DataError> 225 where 226 P: DataProvider<CaseMapV1> + DataProvider<GeneralCategoryV1> + ?Sized, 227 { 228 let cm = CaseMapper::try_new_unstable(provider)?; 229 let gc = icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::try_new_unstable(provider)?; 230 Ok(Self { cm, gc }) 231 } 232 } 233 234 impl TitlecaseMapper<CaseMapper> { 235 /// A constructor which creates a [`TitlecaseMapperBorrowed`] using compiled data 236 /// 237 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 238 /// 239 /// [ Help choosing a constructor](icu_provider::constructors) 240 #[cfg(feature = "compiled_data")] 241 #[allow(clippy::new_ret_no_self)] // Intentional new() -> TitlecaseMapperBorrowed<'static>242 pub const fn new() -> TitlecaseMapperBorrowed<'static> { 243 TitlecaseMapperBorrowed::new() 244 } 245 } 246 // We use Borrow, not AsRef, since we want the blanket impl on T 247 impl<CM: AsRef<CaseMapper>> TitlecaseMapper<CM> { 248 icu_provider::gen_buffer_data_constructors!((casemapper: CM) -> error: DataError, 249 functions: [ 250 new_with_mapper: skip, 251 try_new_with_mapper_with_buffer_provider, 252 try_new_with_mapper_unstable, 253 Self, 254 ]); 255 256 /// A constructor which creates a [`TitlecaseMapper`] from an existing [`CaseMapper`] 257 /// (either owned or as a reference) and compiled data 258 /// 259 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 260 /// 261 /// [ Help choosing a constructor](icu_provider::constructors) 262 #[cfg(feature = "compiled_data")] new_with_mapper(casemapper: CM) -> Self263 pub const fn new_with_mapper(casemapper: CM) -> Self { 264 Self { 265 cm: casemapper, 266 gc: icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::new() 267 .static_to_owned(), 268 } 269 } 270 271 /// Construct this object to wrap an existing CaseMapper (or a reference to one), loading additional data as needed. 272 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_with_mapper)] try_new_with_mapper_unstable<P>(provider: &P, casemapper: CM) -> Result<Self, DataError> where P: DataProvider<CaseMapV1> + DataProvider<GeneralCategoryV1> + ?Sized,273 pub fn try_new_with_mapper_unstable<P>(provider: &P, casemapper: CM) -> Result<Self, DataError> 274 where 275 P: DataProvider<CaseMapV1> + DataProvider<GeneralCategoryV1> + ?Sized, 276 { 277 let gc = icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::try_new_unstable(provider)?; 278 Ok(Self { cm: casemapper, gc }) 279 } 280 281 /// Constructs a borrowed version of this type for more efficient querying. as_borrowed(&self) -> TitlecaseMapperBorrowed<'_>282 pub fn as_borrowed(&self) -> TitlecaseMapperBorrowed<'_> { 283 TitlecaseMapperBorrowed { 284 cm: self.cm.as_ref().as_borrowed(), 285 gc: self.gc.as_borrowed(), 286 } 287 } 288 } 289 290 /// A borrowed [`TitlecaseMapper`]. 291 /// 292 /// See methods or [`TitlecaseMapper`] for examples. 293 #[derive(Clone, Debug, Copy)] 294 pub struct TitlecaseMapperBorrowed<'a> { 295 cm: CaseMapperBorrowed<'a>, 296 gc: CodePointMapDataBorrowed<'a, GeneralCategory>, 297 } 298 299 impl TitlecaseMapperBorrowed<'static> { 300 /// A constructor which creates a [`TitlecaseMapperBorrowed`] using compiled data 301 /// 302 /// ✨ *Enabled with the `compiled_data` Cargo feature.* 303 /// 304 /// [ Help choosing a constructor](icu_provider::constructors) 305 #[cfg(feature = "compiled_data")] new() -> Self306 pub const fn new() -> Self { 307 Self { 308 cm: CaseMapper::new(), 309 gc: icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::new(), 310 } 311 } 312 /// Cheaply converts a [`TitlecaseMapperBorrowed<'static>`] into a [`TitlecaseMapper`]. 313 /// 314 /// Note: Due to branching and indirection, using [`TitlecaseMapper`] might inhibit some 315 /// compile-time optimizations that are possible with [`TitlecaseMapper`]. static_to_owned(self) -> TitlecaseMapper<CaseMapper>316 pub const fn static_to_owned(self) -> TitlecaseMapper<CaseMapper> { 317 TitlecaseMapper { 318 cm: self.cm.static_to_owned(), 319 gc: self.gc.static_to_owned(), 320 } 321 } 322 } 323 324 #[cfg(feature = "compiled_data")] 325 impl Default for TitlecaseMapperBorrowed<'static> { default() -> Self326 fn default() -> Self { 327 Self::new() 328 } 329 } 330 331 impl<'a> TitlecaseMapperBorrowed<'a> { 332 /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating 333 /// the string as a single segment (and thus only titlecasing the beginning of it). 334 /// 335 /// This should typically be used as a lower-level helper to construct the titlecasing operation desired 336 /// by the application, for example one can titlecase on a per-word basis by mixing this with 337 /// a `WordSegmenter`. 338 /// 339 /// This function is context and language sensitive. Callers should pass the text's language 340 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or 341 /// `Default::default()` for the root locale. 342 /// 343 /// See [`TitlecaseMapperBorrowed::titlecase_segment_to_string()`] for the equivalent convenience function that returns a String, 344 /// as well as for an example. titlecase_segment( self, src: &'a str, langid: &LanguageIdentifier, options: TitlecaseOptions, ) -> impl Writeable + 'a345 pub fn titlecase_segment( 346 self, 347 src: &'a str, 348 langid: &LanguageIdentifier, 349 options: TitlecaseOptions, 350 ) -> impl Writeable + 'a { 351 if options.leading_adjustment.unwrap_or_default() == LeadingAdjustment::Auto { 352 // letter, number, symbol, or private use code point 353 const HEAD_GROUPS: GeneralCategoryGroup = GeneralCategoryGroup::Letter 354 .union(GeneralCategoryGroup::Number) 355 .union(GeneralCategoryGroup::Symbol) 356 .union(GeneralCategoryGroup::PrivateUse); 357 self.cm 358 .titlecase_segment_with_adjustment(src, langid, options, |_data, ch| { 359 HEAD_GROUPS.contains(self.gc.get(ch)) 360 }) 361 } else { 362 self.cm 363 .titlecase_segment_with_adjustment(src, langid, options, |data, ch| { 364 data.is_cased(ch) 365 }) 366 } 367 } 368 369 /// Returns the full titlecase mapping of the given string as a String, treating 370 /// the string as a single segment (and thus only titlecasing the beginning of it). 371 /// 372 /// This should typically be used as a lower-level helper to construct the titlecasing operation desired 373 /// by the application, for example one can titlecase on a per-word basis by mixing this with 374 /// a `WordSegmenter`. 375 /// 376 /// This function is context and language sensitive. Callers should pass the text's language 377 /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or 378 /// `Default::default()` for the root locale. 379 /// 380 /// See [`TitlecaseMapperBorrowed::titlecase_segment()`] for the equivalent lower-level function that returns a [`Writeable`] 381 /// 382 /// # Examples 383 /// 384 /// ```rust 385 /// use icu::casemap::TitlecaseMapper; 386 /// use icu::locale::langid; 387 /// 388 /// let cm = TitlecaseMapper::new(); 389 /// let root = langid!("und"); 390 /// 391 /// let default_options = Default::default(); 392 /// 393 /// // note that the subsequent words are not titlecased, this function assumes 394 /// // that the entire string is a single segment and only titlecases at the beginning. 395 /// assert_eq!(cm.titlecase_segment_to_string("hEllO WorLd", &root, default_options), "Hello world"); 396 /// assert_eq!(cm.titlecase_segment_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε"); 397 /// assert_eq!(cm.titlecase_segment_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया"); 398 /// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root, default_options), "Привет мир"); 399 /// 400 /// // Some behavior is language-sensitive 401 /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root, default_options), "Istanbul"); 402 /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i 403 /// 404 /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root, default_options), "Եւ երևանի"); 405 /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature 406 /// 407 /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root, default_options), "Ijkdijk"); 408 /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph 409 /// ``` 410 /// 411 /// Leading adjustment behaviors: 412 /// 413 /// ```rust 414 /// use icu::casemap::options::{LeadingAdjustment, TitlecaseOptions}; 415 /// use icu::casemap::TitlecaseMapper; 416 /// use icu::locale::langid; 417 /// 418 /// let cm = TitlecaseMapper::new(); 419 /// let root = langid!("und"); 420 /// 421 /// let default_options = Default::default(); 422 /// let mut no_adjust: TitlecaseOptions = Default::default(); 423 /// no_adjust.leading_adjustment = Some(LeadingAdjustment::None); 424 /// 425 /// // Exhibits leading adjustment when set: 426 /// assert_eq!( 427 /// cm.titlecase_segment_to_string("«hello»", &root, default_options), 428 /// "«Hello»" 429 /// ); 430 /// assert_eq!( 431 /// cm.titlecase_segment_to_string("«hello»", &root, no_adjust), 432 /// "«hello»" 433 /// ); 434 /// 435 /// assert_eq!( 436 /// cm.titlecase_segment_to_string("'Twas", &root, default_options), 437 /// "'Twas" 438 /// ); 439 /// assert_eq!( 440 /// cm.titlecase_segment_to_string("'Twas", &root, no_adjust), 441 /// "'twas" 442 /// ); 443 /// 444 /// assert_eq!( 445 /// cm.titlecase_segment_to_string("", &root, default_options), 446 /// "" 447 /// ); 448 /// assert_eq!(cm.titlecase_segment_to_string("", &root, no_adjust), ""); 449 /// ``` 450 /// 451 /// Tail casing behaviors: 452 /// 453 /// ```rust 454 /// use icu::casemap::options::{TitlecaseOptions, TrailingCase}; 455 /// use icu::casemap::TitlecaseMapper; 456 /// use icu::locale::langid; 457 /// 458 /// let cm = TitlecaseMapper::new(); 459 /// let root = langid!("und"); 460 /// 461 /// let default_options = Default::default(); 462 /// let mut preserve_case: TitlecaseOptions = Default::default(); 463 /// preserve_case.trailing_case = Some(TrailingCase::Unchanged); 464 /// 465 /// // Exhibits trailing case when set: 466 /// assert_eq!( 467 /// cm.titlecase_segment_to_string("spOngeBoB", &root, default_options), 468 /// "Spongebob" 469 /// ); 470 /// assert_eq!( 471 /// cm.titlecase_segment_to_string("spOngeBoB", &root, preserve_case), 472 /// "SpOngeBoB" 473 /// ); 474 /// ``` titlecase_segment_to_string( self, src: &str, langid: &LanguageIdentifier, options: TitlecaseOptions, ) -> String475 pub fn titlecase_segment_to_string( 476 self, 477 src: &str, 478 langid: &LanguageIdentifier, 479 options: TitlecaseOptions, 480 ) -> String { 481 self.titlecase_segment(src, langid, options) 482 .write_to_string() 483 .into_owned() 484 } 485 } 486