• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 use crate::internals::{CaseMapLocale, FoldOptions, FullCaseWriteable, StringAndWriteable};
6 use crate::provider::data::MappingKind;
7 use crate::provider::CaseMap;
8 use crate::provider::CaseMapV1;
9 use crate::set::ClosureSink;
10 use crate::titlecase::{LeadingAdjustment, TitlecaseOptions, TrailingCase};
11 use alloc::string::String;
12 use icu_locale_core::LanguageIdentifier;
13 use icu_provider::prelude::*;
14 use writeable::Writeable;
15 
16 /// A struct with the ability to convert characters and strings to uppercase or lowercase,
17 /// or fold them to a normalized form for case-insensitive comparison.
18 ///
19 /// Most methods for this type live on [`CaseMapperBorrowed`], which you can obtain via
20 /// [`CaseMapper::new()`] or [`CaseMapper::as_borrowed()`].
21 ///
22 /// # Examples
23 ///
24 /// ```rust
25 /// use icu::casemap::CaseMapper;
26 /// use icu::locale::langid;
27 ///
28 /// let cm = CaseMapper::new();
29 ///
30 /// assert_eq!(
31 ///     cm.uppercase_to_string("hello world", &langid!("und")),
32 ///     "HELLO WORLD"
33 /// );
34 /// assert_eq!(
35 ///     cm.lowercase_to_string("Γειά σου Κόσμε", &langid!("und")),
36 ///     "γειά σου κόσμε"
37 /// );
38 /// ```
39 #[derive(Clone, Debug)]
40 pub struct CaseMapper {
41     pub(crate) data: DataPayload<CaseMapV1>,
42 }
43 
44 impl AsRef<CaseMapper> for CaseMapper {
as_ref(&self) -> &CaseMapper45     fn as_ref(&self) -> &CaseMapper {
46         self
47     }
48 }
49 
50 /// A struct with the ability to convert characters and strings to uppercase or lowercase,
51 /// or fold them to a normalized form for case-insensitive comparison, borrowed version.
52 ///
53 /// See methods or [`CaseMapper`] for examples.
54 #[derive(Clone, Debug, Copy)]
55 pub struct CaseMapperBorrowed<'a> {
56     pub(crate) data: &'a CaseMap<'a>,
57 }
58 
59 impl CaseMapperBorrowed<'static> {
60     /// Cheaply converts a [`CaseMapperBorrowed<'static>`] into a [`CaseMapper`].
61     ///
62     /// Note: Due to branching and indirection, using [`CaseMapper`] might inhibit some
63     /// compile-time optimizations that are possible with [`CaseMapperBorrowed`].
static_to_owned(self) -> CaseMapper64     pub const fn static_to_owned(self) -> CaseMapper {
65         CaseMapper {
66             data: DataPayload::from_static_ref(self.data),
67         }
68     }
69     /// Creates a [`CaseMapperBorrowed`] using compiled data.
70     ///
71     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
72     ///
73     /// [📚 Help choosing a constructor](icu_provider::constructors)
74     ///
75     /// # Examples
76     ///
77     /// ```rust
78     /// use icu::casemap::CaseMapper;
79     /// use icu::locale::langid;
80     ///
81     /// let cm = CaseMapper::new();
82     ///
83     /// assert_eq!(
84     ///     cm.uppercase_to_string("hello world", &langid!("und")),
85     ///     "HELLO WORLD"
86     /// );
87     /// ```
88     #[cfg(feature = "compiled_data")]
new() -> Self89     pub const fn new() -> Self {
90         Self {
91             data: crate::provider::Baked::SINGLETON_CASE_MAP_V1,
92         }
93     }
94 }
95 
#[cfg(feature = "compiled_data")]
impl Default for CaseMapperBorrowed<'static> {
    // The default value is the compiled-data instance returned by `new()`.
    fn default() -> Self {
        CaseMapperBorrowed::new()
    }
}
102 
103 impl<'a> CaseMapperBorrowed<'a> {
104     /// Returns the full lowercase mapping of the given string as a [`Writeable`].
105     /// This function is context and language sensitive. Callers should pass the text's language
106     /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
107     /// `Default::default()` for the root locale.
108     ///
109     /// See [`Self::lowercase_to_string()`] for the equivalent convenience function that returns a String,
110     /// as well as for an example.
lowercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a111     pub fn lowercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a {
112         self.data.full_helper_writeable::<false>(
113             src,
114             CaseMapLocale::from_langid(langid),
115             MappingKind::Lower,
116             TrailingCase::default(),
117         )
118     }
119 
120     /// Returns the full uppercase mapping of the given string as a [`Writeable`].
121     /// This function is context and language sensitive. Callers should pass the text's language
122     /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
123     /// `Default::default()` for the root locale.
124     ///
125     /// See [`Self::uppercase_to_string()`] for the equivalent convenience function that returns a String,
126     /// as well as for an example.
uppercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a127     pub fn uppercase(self, src: &'a str, langid: &LanguageIdentifier) -> impl Writeable + 'a {
128         self.data.full_helper_writeable::<false>(
129             src,
130             CaseMapLocale::from_langid(langid),
131             MappingKind::Upper,
132             TrailingCase::default(),
133         )
134     }
135 
136     /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating
137     /// the string as a single segment (and thus only titlecasing the beginning of it). Performs
138     /// the specified leading adjustment behavior from the options without loading additional data.
139     ///
140     /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
141     /// by the application, for example one can titlecase on a per-word basis by mixing this with
142     /// a `WordSegmenter`.
143     ///
144     /// This function is context and language sensitive. Callers should pass the text's language
145     /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
146     /// `Default::default()` for the root locale.
147     ///
148     /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`]
149     /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load
150     /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See
151     /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between
152     /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the leading adjustment mode
153     /// is [`LeadingAdjustment::None`].
154     ///
155     /// See [`Self::titlecase_segment_with_only_case_data_to_string()`] for the equivalent convenience function that returns a String,
156     /// as well as for an example.
157     ///
158     /// [`TitlecaseMapper`]: crate::TitlecaseMapper
titlecase_segment_with_only_case_data( self, src: &'a str, langid: &LanguageIdentifier, options: TitlecaseOptions, ) -> impl Writeable + 'a159     pub fn titlecase_segment_with_only_case_data(
160         self,
161         src: &'a str,
162         langid: &LanguageIdentifier,
163         options: TitlecaseOptions,
164     ) -> impl Writeable + 'a {
165         self.titlecase_segment_with_adjustment(src, langid, options, |data, ch| data.is_cased(ch))
166     }
167 
168     /// Helper to support different leading adjustment behaviors,
169     /// `char_is_lead` is a function that returns true for a character that is allowed to be the
170     /// first relevant character in a titlecasing string, when `leading_adjustment != None`
171     ///
172     /// We return a concrete type instead of `impl Trait` so the return value can be mixed with that of other calls
173     /// to this function with different closures
titlecase_segment_with_adjustment( self, src: &'a str, langid: &LanguageIdentifier, options: TitlecaseOptions, char_is_lead: impl Fn(&CaseMap, char) -> bool, ) -> StringAndWriteable<'a, FullCaseWriteable<'a, true>>174     pub(crate) fn titlecase_segment_with_adjustment(
175         self,
176         src: &'a str,
177         langid: &LanguageIdentifier,
178         options: TitlecaseOptions,
179         char_is_lead: impl Fn(&CaseMap, char) -> bool,
180     ) -> StringAndWriteable<'a, FullCaseWriteable<'a, true>> {
181         let (head, rest) = match options.leading_adjustment.unwrap_or_default() {
182             LeadingAdjustment::Auto | LeadingAdjustment::ToCased => {
183                 let first_cased = src
184                     .char_indices()
185                     .find(|(_i, ch)| char_is_lead(self.data, *ch));
186                 if let Some((first_cased, _ch)) = first_cased {
187                     (
188                         src.get(..first_cased).unwrap_or(""),
189                         src.get(first_cased..).unwrap_or(""),
190                     )
191                 } else {
192                     (src, "")
193                 }
194             }
195             LeadingAdjustment::None => ("", src),
196         };
197         let writeable = self.data.full_helper_writeable::<true>(
198             rest,
199             CaseMapLocale::from_langid(langid),
200             MappingKind::Title,
201             options.trailing_case.unwrap_or_default(),
202         );
203         StringAndWriteable {
204             string: head,
205             writeable,
206         }
207     }
208     /// Case-folds the characters in the given string as a [`Writeable`].
209     /// This function is locale-independent and context-insensitive.
210     ///
211     /// Can be used to test if two strings are case-insensitively equivalent.
212     ///
213     /// See [`Self::fold_string()`] for the equivalent convenience function that returns a String,
214     /// as well as for an example.
fold(self, src: &'a str) -> impl Writeable + 'a215     pub fn fold(self, src: &'a str) -> impl Writeable + 'a {
216         self.data.full_helper_writeable::<false>(
217             src,
218             CaseMapLocale::Root,
219             MappingKind::Fold,
220             TrailingCase::default(),
221         )
222     }
223 
224     /// Case-folds the characters in the given string as a [`Writeable`],
225     /// using Turkic (T) mappings for dotted/dotless I.
226     /// This function is locale-independent and context-insensitive.
227     ///
228     /// Can be used to test if two strings are case-insensitively equivalent.
229     ///
230     /// See [`Self::fold_turkic_string()`] for the equivalent convenience function that returns a String,
231     /// as well as for an example.
fold_turkic(self, src: &'a str) -> impl Writeable + 'a232     pub fn fold_turkic(self, src: &'a str) -> impl Writeable + 'a {
233         self.data.full_helper_writeable::<false>(
234             src,
235             CaseMapLocale::Turkish,
236             MappingKind::Fold,
237             TrailingCase::default(),
238         )
239     }
240 
241     /// Returns the full lowercase mapping of the given string as a String.
242     ///
243     /// This function is context and language sensitive. Callers should pass the text's language
244     /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
245     /// `Default::default()` for the root locale.
246     ///
247     /// See [`Self::lowercase()`] for the equivalent lower-level function that returns a [`Writeable`]
248     ///
249     /// # Examples
250     ///
251     /// ```rust
252     /// use icu::casemap::CaseMapper;
253     /// use icu::locale::langid;
254     ///
255     /// let cm = CaseMapper::new();
256     /// let root = langid!("und");
257     ///
258     /// assert_eq!(cm.lowercase_to_string("hEllO WorLd", &root), "hello world");
259     /// assert_eq!(cm.lowercase_to_string("Γειά σου Κόσμε", &root), "γειά σου κόσμε");
260     /// assert_eq!(cm.lowercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया");
261     /// assert_eq!(cm.lowercase_to_string("Привет мир", &root), "привет мир");
262     ///
263     /// // Some behavior is language-sensitive
264     /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &root), "constantinople");
265     /// assert_eq!(cm.lowercase_to_string("CONSTANTINOPLE", &langid!("tr")), "constantınople");
266     /// ```
lowercase_to_string(self, src: &str, langid: &LanguageIdentifier) -> String267     pub fn lowercase_to_string(self, src: &str, langid: &LanguageIdentifier) -> String {
268         self.lowercase(src, langid).write_to_string().into_owned()
269     }
270 
271     /// Returns the full uppercase mapping of the given string as a String.
272     ///
273     /// This function is context and language sensitive. Callers should pass the text's language
274     /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
275     /// `Default::default()` for the root locale.
276     ///
277     /// See [`Self::uppercase()`] for the equivalent lower-level function that returns a [`Writeable`]
278     ///
279     /// # Examples
280     ///
281     /// ```rust
282     /// use icu::casemap::CaseMapper;
283     /// use icu::locale::langid;
284     ///
285     /// let cm = CaseMapper::new();
286     /// let root = langid!("und");
287     ///
288     /// assert_eq!(cm.uppercase_to_string("hEllO WorLd", &root), "HELLO WORLD");
289     /// assert_eq!(cm.uppercase_to_string("Γειά σου Κόσμε", &root), "ΓΕΙΆ ΣΟΥ ΚΌΣΜΕ");
290     /// assert_eq!(cm.uppercase_to_string("नमस्ते दुनिया", &root), "नमस्ते दुनिया");
291     /// assert_eq!(cm.uppercase_to_string("Привет мир", &root), "ПРИВЕТ МИР");
292     ///
293     /// // Some behavior is language-sensitive
294     /// assert_eq!(cm.uppercase_to_string("istanbul", &root), "ISTANBUL");
295     /// assert_eq!(cm.uppercase_to_string("istanbul", &langid!("tr")), "İSTANBUL"); // Turkish dotted i
296     ///
297     /// assert_eq!(cm.uppercase_to_string("և Երևանի", &root), "ԵՒ ԵՐԵՒԱՆԻ");
298     /// assert_eq!(cm.uppercase_to_string("և Երևանի", &langid!("hy")), "ԵՎ ԵՐԵՎԱՆԻ"); // Eastern Armenian ech-yiwn ligature
299     /// ```
uppercase_to_string(self, src: &str, langid: &LanguageIdentifier) -> String300     pub fn uppercase_to_string(self, src: &str, langid: &LanguageIdentifier) -> String {
301         self.uppercase(src, langid).write_to_string().into_owned()
302     }
303 
304     /// Returns the full titlecase mapping of the given string as a [`Writeable`], treating
305     /// the string as a single segment (and thus only titlecasing the beginning of it). Performs
306     /// the specified leading adjustment behavior from the options without loading additional data.
307     ///
308     /// Note that [`TitlecaseMapper`] has better behavior, most users should consider using
309     /// it instead. This method primarily exists for people who care about the amount of data being loaded.
310     ///
311     /// This should typically be used as a lower-level helper to construct the titlecasing operation desired
312     /// by the application, for example one can titlecase on a per-word basis by mixing this with
313     /// a `WordSegmenter`.
314     ///
315     /// This function is context and language sensitive. Callers should pass the text's language
316     /// as a `LanguageIdentifier` (usually the `id` field of the `Locale`) if available, or
317     /// `Default::default()` for the root locale.
318     ///
319     /// This function performs "adjust to cased" leading adjustment behavior when [`LeadingAdjustment::Auto`] or [`LeadingAdjustment::ToCased`]
320     /// is set. Auto mode is not able to pick the "adjust to letter/number/symbol" behavior as this type does not load
321     /// the data to do so, use [`TitlecaseMapper`] if such behavior is desired. See
322     /// the docs of [`TitlecaseMapper`] for more information on what this means. There is no difference between
323     /// the behavior of this function and the equivalent ones on [`TitlecaseMapper`] when the leading adjustment mode
324     /// is [`LeadingAdjustment::None`].
325     ///
326     /// See [`Self::titlecase_segment_with_only_case_data()`] for the equivalent lower-level function that returns a [`Writeable`]
327     ///
328     /// # Examples
329     ///
330     /// ```rust
331     /// use icu::casemap::CaseMapper;
332     /// use icu::locale::langid;
333     ///
334     /// let cm = CaseMapper::new();
335     /// let root = langid!("und");
336     ///
337     /// let default_options = Default::default();
338     ///
339     /// // note that the subsequent words are not titlecased, this function assumes
340     /// // that the entire string is a single segment and only titlecases at the beginning.
341     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("hEllO WorLd", &root, default_options), "Hello world");
342     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Γειά σου Κόσμε", &root, default_options), "Γειά σου κόσμε");
343     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("नमस्ते दुनिया", &root, default_options), "नमस्ते दुनिया");
344     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("Привет мир", &root, default_options), "Привет мир");
345     ///
346     /// // Some behavior is language-sensitive
347     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &root, default_options), "Istanbul");
348     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("istanbul", &langid!("tr"), default_options), "İstanbul"); // Turkish dotted i
349     ///
350     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &root, default_options), "Եւ երևանի");
351     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("և Երևանի", &langid!("hy"), default_options), "Եվ երևանի"); // Eastern Armenian ech-yiwn ligature
352     ///
353     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &root, default_options), "Ijkdijk");
354     /// assert_eq!(cm.titlecase_segment_with_only_case_data_to_string("ijkdijk", &langid!("nl"), default_options), "IJkdijk"); // Dutch IJ digraph
355     /// ```
356     ///
357     /// [`TitlecaseMapper`]: crate::TitlecaseMapper
titlecase_segment_with_only_case_data_to_string( self, src: &str, langid: &LanguageIdentifier, options: TitlecaseOptions, ) -> String358     pub fn titlecase_segment_with_only_case_data_to_string(
359         self,
360         src: &str,
361         langid: &LanguageIdentifier,
362         options: TitlecaseOptions,
363     ) -> String {
364         self.titlecase_segment_with_only_case_data(src, langid, options)
365             .write_to_string()
366             .into_owned()
367     }
368 
369     /// Case-folds the characters in the given string as a String.
370     /// This function is locale-independent and context-insensitive.
371     ///
372     /// Can be used to test if two strings are case-insensitively equivalent.
373     ///
374     /// See [`Self::fold()`] for the equivalent lower-level function that returns a [`Writeable`]
375     ///
376     /// # Examples
377     ///
378     /// ```rust
379     /// use icu::casemap::CaseMapper;
380     ///
381     /// let cm = CaseMapper::new();
382     ///
383     /// // Check if two strings are equivalent case insensitively
384     /// assert_eq!(cm.fold_string("hEllO WorLd"), cm.fold_string("HELLO worlD"));
385     ///
386     /// assert_eq!(cm.fold_string("hEllO WorLd"), "hello world");
387     /// assert_eq!(cm.fold_string("Γειά σου Κόσμε"), "γειά σου κόσμε");
388     /// assert_eq!(cm.fold_string("नमस्ते दुनिया"), "नमस्ते दुनिया");
389     /// assert_eq!(cm.fold_string("Привет мир"), "привет мир");
390     /// ```
fold_string(self, src: &str) -> String391     pub fn fold_string(self, src: &str) -> String {
392         self.fold(src).write_to_string().into_owned()
393     }
394 
395     /// Case-folds the characters in the given string as a String,
396     /// using Turkic (T) mappings for dotted/dotless I.
397     /// This function is locale-independent and context-insensitive.
398     ///
399     /// Can be used to test if two strings are case-insensitively equivalent.
400     ///
401     /// See [`Self::fold_turkic()`] for the equivalent lower-level function that returns a [`Writeable`]
402     ///
403     /// # Examples
404     ///
405     /// ```rust
406     /// use icu::casemap::CaseMapper;
407     ///
408     /// let cm = CaseMapper::new();
409     ///
410     /// // Check if two strings are equivalent case insensitively
411     /// assert_eq!(cm.fold_turkic_string("İstanbul"), cm.fold_turkic_string("iSTANBUL"));
412     ///
413     /// assert_eq!(cm.fold_turkic_string("İstanbul not Constantinople"), "istanbul not constantinople");
414     /// assert_eq!(cm.fold_turkic_string("Istanbul not Constantınople"), "ıstanbul not constantınople");
415     ///
416     /// assert_eq!(cm.fold_turkic_string("hEllO WorLd"), "hello world");
417     /// assert_eq!(cm.fold_turkic_string("Γειά σου Κόσμε"), "γειά σου κόσμε");
418     /// assert_eq!(cm.fold_turkic_string("नमस्ते दुनिया"), "नमस्ते दुनिया");
419     /// assert_eq!(cm.fold_turkic_string("Привет мир"), "привет мир");
420     /// ```
fold_turkic_string(self, src: &str) -> String421     pub fn fold_turkic_string(self, src: &str) -> String {
422         self.fold_turkic(src).write_to_string().into_owned()
423     }
424 
425     /// Adds all simple case mappings and the full case folding for `c` to `set`.
426     /// Also adds special case closure mappings.
427     ///
428     /// Identical to [`CaseMapCloserBorrowed::add_case_closure_to()`], see docs there for more information.
429     /// This method is duplicated so that one does not need to load extra unfold data
430     /// if they only need this and not also [`CaseMapCloserBorrowed::add_string_case_closure_to()`].
431     ///
432     ///
433     /// # Examples
434     ///
435     /// ```rust
436     /// use icu::casemap::CaseMapper;
437     /// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
438     ///
439     /// let cm = CaseMapper::new();
440     /// let mut builder = CodePointInversionListBuilder::new();
441     /// cm.add_case_closure_to('s', &mut builder);
442     ///
443     /// let set = builder.build();
444     ///
445     /// assert!(set.contains('S'));
446     /// assert!(set.contains('ſ'));
447     /// assert!(!set.contains('s')); // does not contain itself
448     /// ```
449     ///
450     /// [`CaseMapCloserBorrowed::add_case_closure_to()`]: crate::CaseMapCloserBorrowed::add_case_closure_to
451     /// [`CaseMapCloserBorrowed::add_string_case_closure_to()`]: crate::CaseMapCloserBorrowed::add_string_case_closure_to
add_case_closure_to<S: ClosureSink>(self, c: char, set: &mut S)452     pub fn add_case_closure_to<S: ClosureSink>(self, c: char, set: &mut S) {
453         self.data.add_case_closure_to(c, set);
454     }
455 
456     /// Returns the lowercase mapping of the given `char`.
457     /// This function only implements simple and common mappings. Full mappings,
458     /// which can map one `char` to a string, are not included.
459     /// For full mappings, use [`CaseMapperBorrowed::lowercase`].
460     ///
461     /// # Examples
462     ///
463     /// ```rust
464     /// use icu::casemap::CaseMapper;
465     ///
466     /// let cm = CaseMapper::new();
467     ///
468     /// assert_eq!(cm.simple_lowercase('C'), 'c');
469     /// assert_eq!(cm.simple_lowercase('c'), 'c');
470     /// assert_eq!(cm.simple_lowercase('Ć'), 'ć');
471     /// assert_eq!(cm.simple_lowercase('Γ'), 'γ');
472     /// ```
simple_lowercase(self, c: char) -> char473     pub fn simple_lowercase(self, c: char) -> char {
474         self.data.simple_lower(c)
475     }
476 
477     /// Returns the uppercase mapping of the given `char`.
478     /// This function only implements simple and common mappings. Full mappings,
479     /// which can map one `char` to a string, are not included.
480     /// For full mappings, use [`CaseMapperBorrowed::uppercase`].
481     ///
482     /// # Examples
483     ///
484     /// ```rust
485     /// use icu::casemap::CaseMapper;
486     ///
487     /// let cm = CaseMapper::new();
488     ///
489     /// assert_eq!(cm.simple_uppercase('c'), 'C');
490     /// assert_eq!(cm.simple_uppercase('C'), 'C');
491     /// assert_eq!(cm.simple_uppercase('ć'), 'Ć');
492     /// assert_eq!(cm.simple_uppercase('γ'), 'Γ');
493     ///
494     /// assert_eq!(cm.simple_uppercase('dz'), 'DZ');
495     /// ```
simple_uppercase(self, c: char) -> char496     pub fn simple_uppercase(self, c: char) -> char {
497         self.data.simple_upper(c)
498     }
499 
500     /// Returns the titlecase mapping of the given `char`.
501     /// This function only implements simple and common mappings. Full mappings,
502     /// which can map one `char` to a string, are not included.
503     ///
504     /// # Examples
505     ///
506     /// ```rust
507     /// use icu::casemap::CaseMapper;
508     ///
509     /// let cm = CaseMapper::new();
510     ///
511     /// assert_eq!(cm.simple_titlecase('dz'), 'Dz');
512     ///
513     /// assert_eq!(cm.simple_titlecase('c'), 'C');
514     /// assert_eq!(cm.simple_titlecase('C'), 'C');
515     /// assert_eq!(cm.simple_titlecase('ć'), 'Ć');
516     /// assert_eq!(cm.simple_titlecase('γ'), 'Γ');
517     /// ```
simple_titlecase(self, c: char) -> char518     pub fn simple_titlecase(self, c: char) -> char {
519         self.data.simple_title(c)
520     }
521 
522     /// Returns the simple case folding of the given char.
523     /// For full mappings, use [`CaseMapperBorrowed::fold`].
524     ///
525     /// This function can be used to perform caseless matches on
526     /// individual characters.
527     /// > *Note:* With Unicode 15.0 data, there are three
528     /// > pairs of characters for which equivalence under this
529     /// > function is inconsistent with equivalence of the
530     /// > one-character strings under [`CaseMapperBorrowed::fold`].
531     /// > This is resolved in Unicode 15.1 and later.
532     ///
533     /// For compatibility applications where simple case folding
534     /// of strings is required, this function can be applied to
535     /// each character of a string.  Note that the resulting
536     /// equivalence relation is different from that obtained
537     /// by [`CaseMapperBorrowed::fold`]:
538     /// The strings "Straße" and "STRASSE" are distinct
539     /// under simple case folding, but are equivalent under
540     /// default (full) case folding.
541     ///
542     /// # Examples
543     ///
544     /// ```rust
545     /// use icu::casemap::CaseMapper;
546     ///
547     /// let cm = CaseMapper::new();
548     ///
549     /// // perform case insensitive checks
550     /// assert_eq!(cm.simple_fold('σ'), cm.simple_fold('ς'));
551     /// assert_eq!(cm.simple_fold('Σ'), cm.simple_fold('ς'));
552     ///
553     /// assert_eq!(cm.simple_fold('c'), 'c');
554     /// assert_eq!(cm.simple_fold('Ć'), 'ć');
555     /// assert_eq!(cm.simple_fold('Γ'), 'γ');
556     /// assert_eq!(cm.simple_fold('ς'), 'σ');
557     ///
558     /// assert_eq!(cm.simple_fold('ß'), 'ß');
559     /// assert_eq!(cm.simple_fold('I'), 'i');
560     /// assert_eq!(cm.simple_fold('İ'), 'İ');
561     /// assert_eq!(cm.simple_fold('ı'), 'ı');
562     /// ```
simple_fold(self, c: char) -> char563     pub fn simple_fold(self, c: char) -> char {
564         self.data.simple_fold(c, FoldOptions::default())
565     }
566 
567     /// Returns the simple case folding of the given char, using Turkic (T) mappings for
568     /// dotted/dotless i. This function does not fold `i` and `I` to the same character. Instead,
569     /// `I` will fold to `ı`, and `İ` will fold to `i`. Otherwise, this is the same as
570     /// [`CaseMapperBorrowed::simple_fold()`].
571     ///
572     /// You can use the case folding to perform Turkic caseless matches on characters
573     /// provided they don't full-casefold to strings. To avoid that situation,
574     /// convert to a string and use [`CaseMapperBorrowed::fold_turkic`].
575     ///
576     ///
577     /// # Examples
578     ///
579     /// ```rust
580     /// use icu::casemap::CaseMapper;
581     ///
582     /// let cm = CaseMapper::new();
583     ///
584     /// assert_eq!(cm.simple_fold_turkic('I'), 'ı');
585     /// assert_eq!(cm.simple_fold_turkic('İ'), 'i');
586     /// ```
simple_fold_turkic(self, c: char) -> char587     pub fn simple_fold_turkic(self, c: char) -> char {
588         self.data
589             .simple_fold(c, FoldOptions::with_turkic_mappings())
590     }
591 }
592 
593 impl CaseMapper {
594     /// Creates a [`CaseMapperBorrowed`] using compiled data.
595     ///
596     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
597     ///
598     /// [📚 Help choosing a constructor](icu_provider::constructors)
599     ///
600     /// # Examples
601     ///
602     /// ```rust
603     /// use icu::casemap::CaseMapper;
604     /// use icu::locale::langid;
605     ///
606     /// let cm = CaseMapper::new();
607     ///
608     /// assert_eq!(
609     ///     cm.uppercase_to_string("hello world", &langid!("und")),
610     ///     "HELLO WORLD"
611     /// );
612     /// ```
613     #[cfg(feature = "compiled_data")]
614     #[allow(clippy::new_ret_no_self)] // Intentional
new() -> CaseMapperBorrowed<'static>615     pub const fn new() -> CaseMapperBorrowed<'static> {
616         CaseMapperBorrowed::new()
617     }
618 
619     /// Constructs a borrowed version of this type for more efficient querying.
as_borrowed(&self) -> CaseMapperBorrowed<'_>620     pub fn as_borrowed(&self) -> CaseMapperBorrowed<'_> {
621         CaseMapperBorrowed {
622             data: self.data.get(),
623         }
624     }
625 
    // NOTE(review): presumably expands to the buffer-provider constructor
    // `try_new_with_buffer_provider`, implemented in terms of `try_new_unstable` below and
    // reporting `DataError`; `new: skip` suggests no compiled-data constructor is generated
    // here (it is hand-written above). Confirm against the `icu_provider` macro docs.
626     icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
627     functions: [
628         new: skip,
629         try_new_with_buffer_provider,
630         try_new_unstable,
631         Self,
632     ]);
633 
634     #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
try_new_unstable<P>(provider: &P) -> Result<CaseMapper, DataError> where P: DataProvider<CaseMapV1> + ?Sized,635     pub fn try_new_unstable<P>(provider: &P) -> Result<CaseMapper, DataError>
636     where
637         P: DataProvider<CaseMapV1> + ?Sized,
638     {
639         let data = provider.load(Default::default())?.payload;
640         Ok(Self { data })
641     }
642 }
643 
#[cfg(test)]
mod tests {
    use super::*;
    use icu_locale_core::langid;

    /// Tests for SpecialCasing.txt. Some of the special cases are data-driven, some are
    /// code-driven.
    ///
    /// Inputs and expected outputs are written with `\u{...}` escapes rather than literal
    /// glyphs, so each assertion exercises exactly the code point named in its comment even if
    /// this file passes through a tool that normalizes or decomposes text. A decomposed
    /// ligature (plain "ff" instead of U+FB00) would still uppercase to "FF" and the test would
    /// silently stop covering the special-casing entry.
    #[test]
    fn test_special_cases() {
        let cm = CaseMapper::new();
        let root = langid!("und");
        let default_options = Default::default();

        // Ligatures

        // U+FB00 LATIN SMALL LIGATURE FF
        assert_eq!(cm.uppercase_to_string("\u{FB00}", &root), "FF");
        // U+FB05 LATIN SMALL LIGATURE LONG S T
        assert_eq!(cm.uppercase_to_string("\u{FB05}", &root), "ST");

        // No corresponding uppercased character

        // U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
        // -> U+02BC MODIFIER LETTER APOSTROPHE, LATIN CAPITAL LETTER N
        assert_eq!(cm.uppercase_to_string("\u{0149}", &root), "\u{02BC}N");

        // U+1F50 GREEK SMALL LETTER UPSILON WITH PSILI
        // -> U+03A5 GREEK CAPITAL LETTER UPSILON, U+0313 COMBINING COMMA ABOVE
        assert_eq!(
            cm.uppercase_to_string("\u{1F50}", &root),
            "\u{03A5}\u{0313}"
        );
        // U+1FF6 GREEK SMALL LETTER OMEGA WITH PERISPOMENI
        // -> U+03A9 GREEK CAPITAL LETTER OMEGA, U+0342 COMBINING GREEK PERISPOMENI
        assert_eq!(
            cm.uppercase_to_string("\u{1FF6}", &root),
            "\u{03A9}\u{0342}"
        );

        // YPOGEGRAMMENI / PROSGEGRAMMENI special cases

        // E.g. <alpha><comma above><ypogegrammeni> is uppercased to
        // <ALPHA><comma above><IOTA>: the combining iota subscript (U+0345) becomes a
        // capital iota (U+0399) placed after the other combining mark.
        assert_eq!(
            cm.uppercase_to_string("\u{03B1}\u{0313}\u{0345}", &root),
            "\u{0391}\u{0313}\u{0399}"
        );
        // but the YPOGEGRAMMENI should not titlecase
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string(
                "\u{03B1}\u{0313}\u{0345}",
                &root,
                default_options
            ),
            "\u{0391}\u{0313}\u{0345}"
        );

        // U+1F80 GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
        // titlecase: U+1F88 GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("\u{1F80}", &root, default_options),
            "\u{1F88}"
        );
        // uppercase: U+1F08 GREEK CAPITAL LETTER ALPHA WITH PSILI, U+0399 CAPITAL IOTA
        assert_eq!(
            cm.uppercase_to_string("\u{1F80}", &root),
            "\u{1F08}\u{0399}"
        );

        // U+1FFC GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
        // lowercase: U+1FF3 GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
        assert_eq!(cm.lowercase_to_string("\u{1FFC}", &root), "\u{1FF3}");
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("\u{1FFC}", &root, default_options),
            "\u{1FFC}"
        );
        // uppercase: U+03A9 GREEK CAPITAL LETTER OMEGA, U+0399 CAPITAL IOTA
        assert_eq!(
            cm.uppercase_to_string("\u{1FFC}", &root),
            "\u{03A9}\u{0399}"
        );

        // U+1F98 GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
        // lowercase: U+1F90 GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
        assert_eq!(cm.lowercase_to_string("\u{1F98}", &root), "\u{1F90}");
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("\u{1F98}", &root, default_options),
            "\u{1F98}"
        );
        // uppercase: U+1F28 GREEK CAPITAL LETTER ETA WITH PSILI, U+0399 CAPITAL IOTA
        assert_eq!(
            cm.uppercase_to_string("\u{1F98}", &root),
            "\u{1F28}\u{0399}"
        );

        // U+1FB2 GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
        assert_eq!(cm.lowercase_to_string("\u{1FB2}", &root), "\u{1FB2}");
        // titlecase: U+1FBA GREEK CAPITAL LETTER ALPHA WITH VARIA, U+0345 YPOGEGRAMMENI
        assert_eq!(
            cm.titlecase_segment_with_only_case_data_to_string("\u{1FB2}", &root, default_options),
            "\u{1FBA}\u{0345}"
        );
        // uppercase: U+1FBA GREEK CAPITAL LETTER ALPHA WITH VARIA, U+0399 CAPITAL IOTA
        assert_eq!(
            cm.uppercase_to_string("\u{1FB2}", &root),
            "\u{1FBA}\u{0399}"
        );

        // Final sigma test
        // U+03A3 GREEK CAPITAL LETTER SIGMA in Final_Sigma context: three capital iotas
        // (U+0399) followed by the sigma lowercase to three small iotas (U+03B9) followed by
        // U+03C2 GREEK SMALL LETTER FINAL SIGMA.
        assert_eq!(
            cm.lowercase_to_string("\u{0399}\u{0399}\u{0399}\u{03A3}", &root),
            "\u{03B9}\u{03B9}\u{03B9}\u{03C2}"
        );

        // Turkish / Azeri: both locales are expected to behave identically, so the same
        // assertions run once per locale; the failure message names the locale.
        for locale in &[langid!("tr"), langid!("az")] {
            // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
            assert_eq!(
                cm.lowercase_to_string("\u{0130}", locale),
                "i",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                cm.titlecase_segment_with_only_case_data_to_string(
                    "\u{0130}",
                    locale,
                    default_options
                ),
                "\u{0130}",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                cm.uppercase_to_string("\u{0130}", locale),
                "\u{0130}",
                "locale: {:?}",
                locale
            );

            // U+0049 LATIN CAPITAL LETTER I and U+0307 COMBINING DOT ABOVE
            assert_eq!(
                cm.lowercase_to_string("I\u{0307}", locale),
                "i",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                cm.titlecase_segment_with_only_case_data_to_string(
                    "I\u{0307}",
                    locale,
                    default_options
                ),
                "I\u{0307}",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                cm.uppercase_to_string("I\u{0307}", locale),
                "I\u{0307}",
                "locale: {:?}",
                locale
            );

            // U+0049 LATIN CAPITAL LETTER I
            // lowercase: U+0131 LATIN SMALL LETTER DOTLESS I
            assert_eq!(
                cm.lowercase_to_string("I", locale),
                "\u{0131}",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                cm.titlecase_segment_with_only_case_data_to_string("I", locale, default_options),
                "I",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                cm.uppercase_to_string("I", locale),
                "I",
                "locale: {:?}",
                locale
            );

            // U+0069 LATIN SMALL LETTER I
            // titlecase / uppercase: U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
            assert_eq!(
                cm.lowercase_to_string("i", locale),
                "i",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                cm.titlecase_segment_with_only_case_data_to_string("i", locale, default_options),
                "\u{0130}",
                "locale: {:?}",
                locale
            );
            assert_eq!(
                cm.uppercase_to_string("i", locale),
                "\u{0130}",
                "locale: {:?}",
                locale
            );
        }
    }

    /// Cherokee case folding: U+13A0 CHEROKEE LETTER A and U+AB70 CHEROKEE SMALL LETTER A are
    /// both expected to fold to U+13A0, through the simple and string folds and their Turkic
    /// variants alike.
    #[test]
    fn test_cherokee_case_folding() {
        // The two glyphs differ only in size, so escapes keep the code points unambiguous.
        const FOLDED_CHAR: char = '\u{13A0}';
        const FOLDED_STR: &str = "\u{13A0}";

        let case_mapping = CaseMapper::new();
        // Each input is given both as a `char` and as the equivalent one-character string.
        let inputs = [('\u{13A0}', "\u{13A0}"), ('\u{AB70}', "\u{AB70}")];
        for &(ch, text) in &inputs {
            assert_eq!(
                case_mapping.simple_fold(ch),
                FOLDED_CHAR,
                "input U+{:04X}",
                ch as u32
            );
            assert_eq!(
                case_mapping.simple_fold_turkic(ch),
                FOLDED_CHAR,
                "input U+{:04X}",
                ch as u32
            );
            assert_eq!(
                case_mapping.fold_string(text),
                FOLDED_STR,
                "input U+{:04X}",
                ch as u32
            );
            assert_eq!(
                case_mapping.fold_turkic_string(text),
                FOLDED_STR,
                "input U+{:04X}",
                ch as u32
            );
        }
    }
}
798