// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
//! Titlecasing-specific options and mapper types.
6 use crate::provider::CaseMapV1;
7 use crate::{CaseMapper, CaseMapperBorrowed};
8 use alloc::string::String;
9 use icu_locale_core::LanguageIdentifier;
10 use icu_properties::props::{GeneralCategory, GeneralCategoryGroup};
11 use icu_properties::provider::GeneralCategoryV1;
12 use icu_properties::{CodePointMapData, CodePointMapDataBorrowed};
13 use icu_provider::prelude::*;
14 use writeable::Writeable;
15 
/// How to handle the rest of the string once the beginning of the
/// string has been titlecased.
///
/// # Examples
///
/// ```rust
/// use icu::casemap::options::{TitlecaseOptions, TrailingCase};
/// use icu::casemap::TitlecaseMapper;
/// use icu::locale::langid;
///
/// let cm = TitlecaseMapper::new();
/// let root = langid!("und");
///
/// let default_options = Default::default();
/// let mut preserve_case: TitlecaseOptions = Default::default();
/// preserve_case.trailing_case = Some(TrailingCase::Unchanged);
///
/// // Exhibits trailing case when set:
/// assert_eq!(
///     cm.titlecase_segment_to_string("spOngeBoB", &root, default_options),
///     "Spongebob"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("spOngeBoB", &root, preserve_case),
///     "SpOngeBoB"
/// );
/// ```
#[non_exhaustive]
#[derive(Copy, Clone, Default, PartialEq, Eq, Hash, Debug)]
pub enum TrailingCase {
    /// Preserve the casing of the rest of the string ("spoNgEBoB" -> "SpoNgEBoB")
    Unchanged,
    /// Lowercase the rest of the string ("spoNgEBoB" -> "Spongebob")
    ///
    /// This is the default
    #[default]
    Lower,
}
52 
/// Where to start casing the string.
///
/// [`TitlecaseMapper`] by default performs "leading adjustment", where it searches for the first "relevant" character
/// in the string before beginning the actual titlecasing. For example, it will skip punctuation at the beginning
/// of a string, allowing for strings like `'twas` or `«hello»` to be appropriately titlecased.
///
/// Opinions on exactly what is a "relevant" character may differ. In "adjust to cased" mode the first cased character is considered "relevant",
/// whereas in the "auto" mode, it is the first character that is a letter, number, symbol, or private use character. This means
/// that the strings `49ers` and `«丰(abc)»` will titlecase in "adjust to cased" mode to `49Ers` and `«丰(Abc)»`, whereas in the "auto" mode they stay unchanged.
/// This difference largely matters for things that mix numbers and letters, or mix writing systems, within a single segment.
///
/// # Examples
///
/// ```rust
/// use icu::casemap::options::{LeadingAdjustment, TitlecaseOptions};
/// use icu::casemap::TitlecaseMapper;
/// use icu::locale::langid;
///
/// let cm = TitlecaseMapper::new();
/// let root = langid!("und");
///
/// let default_options = Default::default(); // leading adjustment set to Auto
/// let mut no_adjust: TitlecaseOptions = Default::default();
/// let mut adjust_to_cased: TitlecaseOptions = Default::default();
/// no_adjust.leading_adjustment = Some(LeadingAdjustment::None);
/// adjust_to_cased.leading_adjustment = Some(LeadingAdjustment::ToCased);
///
/// // Exhibits leading adjustment when set:
/// assert_eq!(
///     cm.titlecase_segment_to_string("«hello»", &root, default_options),
///     "«Hello»"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("«hello»", &root, adjust_to_cased),
///     "«Hello»"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("«hello»", &root, no_adjust),
///     "«hello»"
/// );
///
/// // Only changed in adjust-to-cased mode:
/// assert_eq!(
///     cm.titlecase_segment_to_string("丰(abc)", &root, default_options),
///     "丰(abc)"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("丰(abc)", &root, adjust_to_cased),
///     "丰(Abc)"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("丰(abc)", &root, no_adjust),
///     "丰(abc)"
/// );
///
/// // Only changed in adjust-to-cased mode:
/// assert_eq!(
///     cm.titlecase_segment_to_string("49ers", &root, default_options),
///     "49ers"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("49ers", &root, adjust_to_cased),
///     "49Ers"
/// );
/// assert_eq!(
///     cm.titlecase_segment_to_string("49ers", &root, no_adjust),
///     "49ers"
/// );
/// ```
#[non_exhaustive]
#[derive(Copy, Clone, Default, PartialEq, Eq, Hash, Debug)]
pub enum LeadingAdjustment {
    /// Start titlecasing immediately, even if the character is not one that is relevant for casing
    /// ("'twixt" -> "'twixt", "twixt" -> "Twixt")
    None,
    /// Adjust the string to the first relevant character before beginning to apply casing
    /// ("'twixt" -> "'Twixt"). "Relevant" character is picked by best available algorithm,
    /// by default will adjust to first letter, number, symbol, or private use character,
    /// but if no data is available (e.g. this API is being called via [`CaseMapperBorrowed::titlecase_segment_with_only_case_data()`]),
    /// then may be equivalent to "adjust to cased".
    ///
    /// This is the default
    #[default]
    Auto,
    /// Adjust the string to the first cased character before beginning to apply casing
    /// ("'twixt" -> "'Twixt")
    ToCased,
}
141 
142 /// Various options for controlling titlecasing
143 ///
144 /// See docs of [`TitlecaseMapper`] for examples.
145 #[non_exhaustive]
146 #[derive(Copy, Clone, Default, PartialEq, Eq, Hash, Debug)]
147 pub struct TitlecaseOptions {
148     /// How to handle the rest of the string once the head of the
149     /// string has been titlecased
150     ///
151     /// Default is [`TrailingCase::Lower`]
152     pub trailing_case: Option<TrailingCase>,
153     /// Whether to start casing at the beginning of the string or at the first
154     /// relevant character.
155     ///
156     /// Default is [`LeadingAdjustment::Auto`]
157     pub leading_adjustment: Option<LeadingAdjustment>,
158 }
159 
160 /// A wrapper around [`CaseMapper`] that can compute titlecasing stuff, and is able to load additional data
161 /// to support the non-legacy "head adjustment" behavior.
162 ///
163 ///
164 /// Most methods for this type live on [`TitlecaseMapperBorrowed`], which you can obtain via
165 /// [`TitlecaseMapper::new()`] or [`TitlecaseMapper::as_borrowed()`].
166 ///
167 /// By default, [`TitlecaseMapperBorrowed::titlecase_segment()`] and [`TitlecaseMapperBorrowed::titlecase_segment_to_string()`] perform "leading adjustment",
168 /// where they wait till the first relevant character to begin titlecasing. For example, in the string `'twixt`, the apostrophe
169 /// is ignored because the word starts at the first "t", which will get titlecased (producing `'Twixt`). Other punctuation will
170 /// also be ignored, like in the string `«hello»`, which will get titlecased to `«Hello»`.
171 ///
172 /// This is a separate type from [`CaseMapper`] because it loads the additional data
173 /// required by [`LeadingAdjustment::Auto`] to perform the best possible leading adjustment.
174 ///
175 /// If you are planning on only using [`LeadingAdjustment::None`] or [`LeadingAdjustment::ToCased`], consider using [`CaseMapper`] directly; this
176 /// type will have no additional behavior.
177 ///
178 /// # Examples
179 ///
180 /// Basic casemapping behavior:
181 ///
182 /// ```rust
183 /// use icu::casemap::TitlecaseMapper;
184 /// use icu::locale::langid;
185 ///
186 /// let cm = TitlecaseMapper::new();
187 /// let root = langid!("und");
188 ///
189 /// let default_options = Default::default();
190 ///
191 /// // note that the subsequent words are not titlecased, this function assumes
192 /// // that the entire string is a single segment and only titlecases at the beginning.
193 /// assert_eq!(cm.titlecase_segment_to_string("hEllO WorLd", &root, default_options), "Hello world");
194 /// assert_eq!(cm.titlecase_segment_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε");
195 /// assert_eq!(cm.titlecase_segment_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया");
196 /// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root, default_options), "Привет мир");
197 ///
198 /// // Some behavior is language-sensitive
199 /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root, default_options), "Istanbul");
200 /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i
201 ///
202 /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root, default_options), "Եւ երևանի");
203 /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
204 ///
205 /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root, default_options), "Ijkdijk");
206 /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph
207 /// ```
208 #[derive(Clone, Debug)]
209 pub struct TitlecaseMapper<CM> {
210     cm: CM,
211     gc: CodePointMapData<GeneralCategory>,
212 }
213 
214 impl TitlecaseMapper<CaseMapper> {
215     icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
216     functions: [
217         new: skip,
218                 try_new_with_buffer_provider,
219         try_new_unstable,
220         Self,
221     ]);
222 
223     #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
try_new_unstable<P>(provider: &P) -> Result<Self, DataError> where P: DataProvider<CaseMapV1> + DataProvider<GeneralCategoryV1> + ?Sized,224     pub fn try_new_unstable<P>(provider: &P) -> Result<Self, DataError>
225     where
226         P: DataProvider<CaseMapV1> + DataProvider<GeneralCategoryV1> + ?Sized,
227     {
228         let cm = CaseMapper::try_new_unstable(provider)?;
229         let gc = icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::try_new_unstable(provider)?;
230         Ok(Self { cm, gc })
231     }
232 }
233 
234 impl TitlecaseMapper<CaseMapper> {
235     /// A constructor which creates a [`TitlecaseMapperBorrowed`] using compiled data
236     ///
237     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
238     ///
239     /// [�� Help choosing a constructor](icu_provider::constructors)
240     #[cfg(feature = "compiled_data")]
241     #[allow(clippy::new_ret_no_self)] // Intentional
new() -> TitlecaseMapperBorrowed<'static>242     pub const fn new() -> TitlecaseMapperBorrowed<'static> {
243         TitlecaseMapperBorrowed::new()
244     }
245 }
246 // We use Borrow, not AsRef, since we want the blanket impl on T
247 impl<CM: AsRef<CaseMapper>> TitlecaseMapper<CM> {
248     icu_provider::gen_buffer_data_constructors!((casemapper: CM) -> error: DataError,
249     functions: [
250         new_with_mapper: skip,
251         try_new_with_mapper_with_buffer_provider,
252         try_new_with_mapper_unstable,
253         Self,
254     ]);
255 
256     /// A constructor which creates a [`TitlecaseMapper`] from an existing [`CaseMapper`]
257     /// (either owned or as a reference) and compiled data
258     ///
259     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
260     ///
261     /// [�� Help choosing a constructor](icu_provider::constructors)
262     #[cfg(feature = "compiled_data")]
new_with_mapper(casemapper: CM) -> Self263     pub const fn new_with_mapper(casemapper: CM) -> Self {
264         Self {
265             cm: casemapper,
266             gc: icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::new()
267                 .static_to_owned(),
268         }
269     }
270 
271     /// Construct this object to wrap an existing CaseMapper (or a reference to one), loading additional data as needed.
272     #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_with_mapper)]
try_new_with_mapper_unstable<P>(provider: &P, casemapper: CM) -> Result<Self, DataError> where P: DataProvider<CaseMapV1> + DataProvider<GeneralCategoryV1> + ?Sized,273     pub fn try_new_with_mapper_unstable<P>(provider: &P, casemapper: CM) -> Result<Self, DataError>
274     where
275         P: DataProvider<CaseMapV1> + DataProvider<GeneralCategoryV1> + ?Sized,
276     {
277         let gc = icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::try_new_unstable(provider)?;
278         Ok(Self { cm: casemapper, gc })
279     }
280 
281     /// Constructs a borrowed version of this type for more efficient querying.
as_borrowed(&self) -> TitlecaseMapperBorrowed<'_>282     pub fn as_borrowed(&self) -> TitlecaseMapperBorrowed<'_> {
283         TitlecaseMapperBorrowed {
284             cm: self.cm.as_ref().as_borrowed(),
285             gc: self.gc.as_borrowed(),
286         }
287     }
288 }
289 
290 /// A borrowed [`TitlecaseMapper`].
291 ///
292 /// See methods or [`TitlecaseMapper`] for examples.
293 #[derive(Clone, Debug, Copy)]
294 pub struct TitlecaseMapperBorrowed<'a> {
295     cm: CaseMapperBorrowed<'a>,
296     gc: CodePointMapDataBorrowed<'a, GeneralCategory>,
297 }
298 
299 impl TitlecaseMapperBorrowed<'static> {
300     /// A constructor which creates a [`TitlecaseMapperBorrowed`] using compiled data
301     ///
302     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
303     ///
304     /// [�� Help choosing a constructor](icu_provider::constructors)
305     #[cfg(feature = "compiled_data")]
new() -> Self306     pub const fn new() -> Self {
307         Self {
308             cm: CaseMapper::new(),
309             gc: icu_properties::CodePointMapData::<icu_properties::props::GeneralCategory>::new(),
310         }
311     }
312     /// Cheaply converts a [`TitlecaseMapperBorrowed<'static>`] into a [`TitlecaseMapper`].
313     ///
314     /// Note: Due to branching and indirection, using [`TitlecaseMapper`] might inhibit some
315     /// compile-time optimizations that are possible with [`TitlecaseMapper`].
static_to_owned(self) -> TitlecaseMapper<CaseMapper>316     pub const fn static_to_owned(self) -> TitlecaseMapper<CaseMapper> {
317         TitlecaseMapper {
318             cm: self.cm.static_to_owned(),
319             gc: self.gc.static_to_owned(),
320         }
321     }
322 }
323 
#[cfg(feature = "compiled_data")]
impl Default for TitlecaseMapperBorrowed<'static> {
    /// Equivalent to [`TitlecaseMapperBorrowed::new()`].
    fn default() -> Self {
        Self::new()
    }
}
330 
331 impl<'a> TitlecaseMapperBorrowed<'a> {
332     /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating
333     /// the string as a single segment (and thus only titlecasing the beginning of it).
334     ///
335     /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
336     /// by the application, for example one can titlecase on a per-word basis by mixing this with
337     /// a `WordSegmenter`.
338     ///
339     /// This function is context and language sensitive. Callers should pass the text's language
340     /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
341     /// `Default::default()` for the root locale.
342     ///
343     /// See [`TitlecaseMapperBorrowed::titlecase_segment_to_string()`] for the equivalent convenience function that returns a String,
344     /// as well as for an example.
titlecase_segment( self, src: &'a str, langid: &LanguageIdentifier, options: TitlecaseOptions, ) -> impl Writeable + 'a345     pub fn titlecase_segment(
346         self,
347         src: &'a str,
348         langid: &LanguageIdentifier,
349         options: TitlecaseOptions,
350     ) -> impl Writeable + 'a {
351         if options.leading_adjustment.unwrap_or_default() == LeadingAdjustment::Auto {
352             // letter, number, symbol, or private use code point
353             const HEAD_GROUPS: GeneralCategoryGroup = GeneralCategoryGroup::Letter
354                 .union(GeneralCategoryGroup::Number)
355                 .union(GeneralCategoryGroup::Symbol)
356                 .union(GeneralCategoryGroup::PrivateUse);
357             self.cm
358                 .titlecase_segment_with_adjustment(src, langid, options, |_data, ch| {
359                     HEAD_GROUPS.contains(self.gc.get(ch))
360                 })
361         } else {
362             self.cm
363                 .titlecase_segment_with_adjustment(src, langid, options, |data, ch| {
364                     data.is_cased(ch)
365                 })
366         }
367     }
368 
369     /// Returns the full titlecase mapping of the given string as a String, treating
370     /// the string as a single segment (and thus only titlecasing the beginning of it).
371     ///
372     /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
373     /// by the application, for example one can titlecase on a per-word basis by mixing this with
374     /// a `WordSegmenter`.
375     ///
376     /// This function is context and language sensitive. Callers should pass the text's language
377     /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
378     /// `Default::default()` for the root locale.
379     ///
380     /// See [`TitlecaseMapperBorrowed::titlecase_segment()`] for the equivalent lower-level function that returns a [`Writeable`]
381     ///
382     /// # Examples
383     ///
384     /// ```rust
385     /// use icu::casemap::TitlecaseMapper;
386     /// use icu::locale::langid;
387     ///
388     /// let cm = TitlecaseMapper::new();
389     /// let root = langid!("und");
390     ///
391     /// let default_options = Default::default();
392     ///
393     /// // note that the subsequent words are not titlecased, this function assumes
394     /// // that the entire string is a single segment and only titlecases at the beginning.
395     /// assert_eq!(cm.titlecase_segment_to_string("hEllO WorLd", &root, default_options), "Hello world");
396     /// assert_eq!(cm.titlecase_segment_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε");
397     /// assert_eq!(cm.titlecase_segment_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया");
398     /// assert_eq!(cm.titlecase_segment_to_string("Привет мир", &root, default_options), "Привет мир");
399     ///
400     /// // Some behavior is language-sensitive
401     /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &root, default_options), "Istanbul");
402     /// assert_eq!(cm.titlecase_segment_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i
403     ///
404     /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &root, default_options), "Եւ երևանի");
405     /// assert_eq!(cm.titlecase_segment_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
406     ///
407     /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &root, default_options), "Ijkdijk");
408     /// assert_eq!(cm.titlecase_segment_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph
409     /// ```
410     ///
411     /// Leading adjustment behaviors:
412     ///
413     /// ```rust
414     /// use icu::casemap::options::{LeadingAdjustment, TitlecaseOptions};
415     /// use icu::casemap::TitlecaseMapper;
416     /// use icu::locale::langid;
417     ///
418     /// let cm = TitlecaseMapper::new();
419     /// let root = langid!("und");
420     ///
421     /// let default_options = Default::default();
422     /// let mut no_adjust: TitlecaseOptions = Default::default();
423     /// no_adjust.leading_adjustment = Some(LeadingAdjustment::None);
424     ///
425     /// // Exhibits leading adjustment when set:
426     /// assert_eq!(
427     ///     cm.titlecase_segment_to_string("«hello»", &root, default_options),
428     ///     "«Hello»"
429     /// );
430     /// assert_eq!(
431     ///     cm.titlecase_segment_to_string("«hello»", &root, no_adjust),
432     ///     "«hello»"
433     /// );
434     ///
435     /// assert_eq!(
436     ///     cm.titlecase_segment_to_string("'Twas", &root, default_options),
437     ///     "'Twas"
438     /// );
439     /// assert_eq!(
440     ///     cm.titlecase_segment_to_string("'Twas", &root, no_adjust),
441     ///     "'twas"
442     /// );
443     ///
444     /// assert_eq!(
445     ///     cm.titlecase_segment_to_string("", &root, default_options),
446     ///     ""
447     /// );
448     /// assert_eq!(cm.titlecase_segment_to_string("", &root, no_adjust), "");
449     /// ```
450     ///
451     /// Tail casing behaviors:
452     ///
453     /// ```rust
454     /// use icu::casemap::options::{TitlecaseOptions, TrailingCase};
455     /// use icu::casemap::TitlecaseMapper;
456     /// use icu::locale::langid;
457     ///
458     /// let cm = TitlecaseMapper::new();
459     /// let root = langid!("und");
460     ///
461     /// let default_options = Default::default();
462     /// let mut preserve_case: TitlecaseOptions = Default::default();
463     /// preserve_case.trailing_case = Some(TrailingCase::Unchanged);
464     ///
465     /// // Exhibits trailing case when set:
466     /// assert_eq!(
467     ///     cm.titlecase_segment_to_string("spOngeBoB", &root, default_options),
468     ///     "Spongebob"
469     /// );
470     /// assert_eq!(
471     ///     cm.titlecase_segment_to_string("spOngeBoB", &root, preserve_case),
472     ///     "SpOngeBoB"
473     /// );
474     /// ```
titlecase_segment_to_string( self, src: &str, langid: &LanguageIdentifier, options: TitlecaseOptions, ) -> String475     pub fn titlecase_segment_to_string(
476         self,
477         src: &str,
478         langid: &LanguageIdentifier,
479         options: TitlecaseOptions,
480     ) -> String {
481         self.titlecase_segment(src, langid, options)
482             .write_to_string()
483             .into_owned()
484     }
485 }
486