• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! The collection of code for locale canonicalization.
6 
7 use crate::provider::*;
8 use alloc::vec::Vec;
9 use core::cmp::Ordering;
10 
11 use crate::LocaleExpander;
12 use crate::TransformResult;
13 use icu_locale_core::extensions::Extensions;
14 use icu_locale_core::subtags::{Language, Region, Script};
15 use icu_locale_core::{
16     extensions::unicode::key,
17     subtags::{language, Variant, Variants},
18     LanguageIdentifier, Locale,
19 };
20 use icu_provider::prelude::*;
21 use tinystr::TinyAsciiStr;
22 
23 /// Implements the algorithm defined in *[UTS #35: Annex C, LocaleId Canonicalization]*.
24 ///
25 /// # Examples
26 ///
27 /// ```
28 /// use icu::locale::Locale;
29 /// use icu::locale::{LocaleCanonicalizer, TransformResult};
30 ///
31 /// let lc = LocaleCanonicalizer::new_extended();
32 ///
33 /// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
34 /// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
35 /// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
36 /// ```
37 ///
38 /// [UTS #35: Annex C, LocaleId Canonicalization]: http://unicode.org/reports/tr35/#LocaleId_Canonicalization
39 #[derive(Debug)]
40 pub struct LocaleCanonicalizer<Expander = LocaleExpander> {
41     /// Data to support canonicalization.
42     aliases: DataPayload<LocaleAliasesV1>,
43     /// Likely subtags implementation for delegation.
44     expander: Expander,
45 }
46 
uts35_rule_matches<'a, I>( source: &LanguageIdentifier, language: Language, script: Option<Script>, region: Option<Region>, raw_variants: I, ) -> bool where I: Iterator<Item = &'a str>,47 fn uts35_rule_matches<'a, I>(
48     source: &LanguageIdentifier,
49     language: Language,
50     script: Option<Script>,
51     region: Option<Region>,
52     raw_variants: I,
53 ) -> bool
54 where
55     I: Iterator<Item = &'a str>,
56 {
57     (language.is_default() || language == source.language)
58         && (script.is_none() || script == source.script)
59         && (region.is_none() || region == source.region)
60         && {
61             // Checks if variants are a subset of source variants.
62             // As both iterators are sorted, this can be done linearly.
63             let mut source_variants = source.variants.iter();
64             'outer: for raw_variant in raw_variants {
65                 for source_variant in source_variants.by_ref() {
66                     match source_variant.as_str().cmp(raw_variant) {
67                         Ordering::Equal => {
68                             // The source_variant is equal, move to next raw_variant
69                             continue 'outer;
70                         }
71                         Ordering::Less => {
72                             // The source_variant is smaller, take the next source_variant
73                         }
74                         Ordering::Greater => {
75                             // The source_variant is greater,
76                             // raw_variants is not a subset of source_variants
77                             return false;
78                         }
79                     }
80                 }
81                 // There are raw_variants left after we exhausted source_variants
82                 return false;
83             }
84             true
85         }
86 }
87 
uts35_replacement<'a, I>( source: &mut LanguageIdentifier, ruletype_has_language: bool, ruletype_has_script: bool, ruletype_has_region: bool, ruletype_variants: Option<I>, replacement: &LanguageIdentifier, ) where I: Iterator<Item = &'a str>,88 fn uts35_replacement<'a, I>(
89     source: &mut LanguageIdentifier,
90     ruletype_has_language: bool,
91     ruletype_has_script: bool,
92     ruletype_has_region: bool,
93     ruletype_variants: Option<I>,
94     replacement: &LanguageIdentifier,
95 ) where
96     I: Iterator<Item = &'a str>,
97 {
98     if ruletype_has_language || (source.language.is_default() && !replacement.language.is_default())
99     {
100         source.language = replacement.language;
101     }
102     if ruletype_has_script || (source.script.is_none() && replacement.script.is_some()) {
103         source.script = replacement.script;
104     }
105     if ruletype_has_region || (source.region.is_none() && replacement.region.is_some()) {
106         source.region = replacement.region;
107     }
108     if let Some(skips) = ruletype_variants {
109         // The rule matches if the ruletype variants are a subset of the source variants.
110         // This means ja-Latn-fonipa-hepburn-heploc matches against the rule for
111         // hepburn-heploc and is canonicalized to ja-Latn-alalc97-fonipa
112 
113         // We're merging three sorted deduped iterators into a new sequence:
114         // sources - skips + replacements
115 
116         let mut sources = source.variants.iter().peekable();
117         let mut replacements = replacement.variants.iter().peekable();
118         let mut skips = skips.peekable();
119 
120         let mut variants: Vec<Variant> = Vec::new();
121 
122         loop {
123             match (sources.peek(), skips.peek(), replacements.peek()) {
124                 (Some(&source), Some(skip), _)
125                     if source.as_str().cmp(skip) == Ordering::Greater =>
126                 {
127                     skips.next();
128                 }
129                 (Some(&source), Some(skip), _) if source.as_str().cmp(skip) == Ordering::Equal => {
130                     skips.next();
131                     sources.next();
132                 }
133                 (Some(&source), _, Some(&replacement))
134                     if replacement.cmp(source) == Ordering::Less =>
135                 {
136                     variants.push(*replacement);
137                     replacements.next();
138                 }
139                 (Some(&source), _, Some(&replacement))
140                     if replacement.cmp(source) == Ordering::Equal =>
141                 {
142                     variants.push(*source);
143                     sources.next();
144                     replacements.next();
145                 }
146                 (Some(&source), _, _) => {
147                     variants.push(*source);
148                     sources.next();
149                 }
150                 (None, _, Some(&replacement)) => {
151                     variants.push(*replacement);
152                     replacements.next();
153                 }
154                 (None, _, None) => {
155                     break;
156                 }
157             }
158         }
159         source.variants = Variants::from_vec_unchecked(variants);
160     }
161 }
162 
163 #[inline]
uts35_check_language_rules( langid: &mut LanguageIdentifier, alias_data: &DataPayload<LocaleAliasesV1>, ) -> TransformResult164 fn uts35_check_language_rules(
165     langid: &mut LanguageIdentifier,
166     alias_data: &DataPayload<LocaleAliasesV1>,
167 ) -> TransformResult {
168     if !langid.language.is_default() {
169         let lang: TinyAsciiStr<3> = langid.language.into();
170         let replacement = if lang.len() == 2 {
171             alias_data
172                 .get()
173                 .language_len2
174                 .get(&lang.resize().to_unvalidated())
175         } else {
176             alias_data.get().language_len3.get(&lang.to_unvalidated())
177         };
178 
179         if let Some(replacement) = replacement {
180             if let Ok(new_langid) = replacement.parse() {
181                 uts35_replacement::<core::iter::Empty<&str>>(
182                     langid,
183                     true,
184                     false,
185                     false,
186                     None,
187                     &new_langid,
188                 );
189                 return TransformResult::Modified;
190             }
191         }
192     }
193 
194     TransformResult::Unmodified
195 }
196 
197 impl LocaleCanonicalizer<LocaleExpander> {
198     /// A constructor which creates a [`LocaleCanonicalizer`] from compiled data,
199     /// using a [`LocaleExpander`] for common locales.
200     ///
201     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
202     ///
203     /// [�� Help choosing a constructor](icu_provider::constructors)
204     #[cfg(feature = "compiled_data")]
new_common() -> Self205     pub const fn new_common() -> Self {
206         Self::new_with_expander(LocaleExpander::new_common())
207     }
208 
209     icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
210         functions: [
211             new_common: skip,
212             try_new_common_with_buffer_provider,
213             try_new_common_unstable,
214             Self,
215         ]
216     );
217 
218     #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_common)]
try_new_common_unstable<P>(provider: &P) -> Result<Self, DataError> where P: DataProvider<LocaleAliasesV1> + DataProvider<LocaleLikelySubtagsLanguageV1> + DataProvider<LocaleLikelySubtagsScriptRegionV1> + ?Sized,219     pub fn try_new_common_unstable<P>(provider: &P) -> Result<Self, DataError>
220     where
221         P: DataProvider<LocaleAliasesV1>
222             + DataProvider<LocaleLikelySubtagsLanguageV1>
223             + DataProvider<LocaleLikelySubtagsScriptRegionV1>
224             + ?Sized,
225     {
226         let expander = LocaleExpander::try_new_common_unstable(provider)?;
227         Self::try_new_with_expander_unstable(provider, expander)
228     }
229 
230     /// A constructor which creates a [`LocaleCanonicalizer`] from compiled data,
231     /// using a [`LocaleExpander`] for all locales.
232     ///
233     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
234     ///
235     /// [�� Help choosing a constructor](icu_provider::constructors)
236     #[cfg(feature = "compiled_data")]
new_extended() -> Self237     pub const fn new_extended() -> Self {
238         Self::new_with_expander(LocaleExpander::new_extended())
239     }
240 
241     icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
242         functions: [
243             new_extended: skip,
244             try_new_extended_with_buffer_provider,
245             try_new_extended_unstable,
246             Self,
247         ]
248     );
249 
250     #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_extended)]
try_new_extended_unstable<P>(provider: &P) -> Result<Self, DataError> where P: DataProvider<LocaleAliasesV1> + DataProvider<LocaleLikelySubtagsLanguageV1> + DataProvider<LocaleLikelySubtagsScriptRegionV1> + DataProvider<LocaleLikelySubtagsExtendedV1> + ?Sized,251     pub fn try_new_extended_unstable<P>(provider: &P) -> Result<Self, DataError>
252     where
253         P: DataProvider<LocaleAliasesV1>
254             + DataProvider<LocaleLikelySubtagsLanguageV1>
255             + DataProvider<LocaleLikelySubtagsScriptRegionV1>
256             + DataProvider<LocaleLikelySubtagsExtendedV1>
257             + ?Sized,
258     {
259         let expander = LocaleExpander::try_new_extended_unstable(provider)?;
260         Self::try_new_with_expander_unstable(provider, expander)
261     }
262 }
263 
264 impl<Expander: AsRef<LocaleExpander>> LocaleCanonicalizer<Expander> {
265     /// Creates a [`LocaleCanonicalizer`] with a custom [`LocaleExpander`] and compiled data.
266     ///
267     /// ✨ *Enabled with the `compiled_data` Cargo feature.*
268     ///
269     /// [�� Help choosing a constructor](icu_provider::constructors)
270     #[cfg(feature = "compiled_data")]
new_with_expander(expander: Expander) -> Self271     pub const fn new_with_expander(expander: Expander) -> Self {
272         Self {
273             aliases: DataPayload::from_static_ref(
274                 crate::provider::Baked::SINGLETON_LOCALE_ALIASES_V1,
275             ),
276             expander,
277         }
278     }
279 
280     #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_with_expander)]
try_new_with_expander_unstable<P>( provider: &P, expander: Expander, ) -> Result<Self, DataError> where P: DataProvider<LocaleAliasesV1> + ?Sized,281     pub fn try_new_with_expander_unstable<P>(
282         provider: &P,
283         expander: Expander,
284     ) -> Result<Self, DataError>
285     where
286         P: DataProvider<LocaleAliasesV1> + ?Sized,
287     {
288         let aliases: DataPayload<LocaleAliasesV1> = provider.load(Default::default())?.payload;
289 
290         Ok(Self { aliases, expander })
291     }
292 
293     icu_provider::gen_buffer_data_constructors!((options: Expander) -> error: DataError,
294         functions: [
295             new_with_expander: skip,
296             try_new_with_expander_with_buffer_provider,
297             try_new_with_expander_unstable,
298             Self,
299         ]
300     );
301 
302     /// The canonicalize method potentially updates a passed in locale in place
303     /// depending up the results of running the canonicalization algorithm
304     /// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>.
305     ///
306     /// Some BCP47 canonicalization data is not part of the CLDR json package. Because
307     /// of this, some canonicalizations are not performed, e.g. the canonicalization of
308     /// `und-u-ca-islamicc` to `und-u-ca-islamic-civil`. This will be fixed in a future
309     /// release once the missing data has been added to the CLDR json data. See:
310     /// <https://github.com/unicode-org/icu4x/issues/746>
311     ///
312     /// # Examples
313     ///
314     /// ```
315     /// use icu::locale::{Locale, LocaleCanonicalizer, TransformResult};
316     ///
317     /// let lc = LocaleCanonicalizer::new_extended();
318     ///
319     /// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
320     /// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
321     /// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
322     /// ```
canonicalize(&self, locale: &mut Locale) -> TransformResult323     pub fn canonicalize(&self, locale: &mut Locale) -> TransformResult {
324         let mut result = TransformResult::Unmodified;
325 
326         // This loops until we get a 'fixed point', where applying the rules do not
327         // result in any more changes.
328         loop {
329             // These are linear searches due to the ordering imposed by the canonicalization
330             // rules, where rules with more variants should be considered first. With the
331             // current data in CLDR, we will only do this for locales which have variants,
332             // or new rules which we haven't special-cased yet (of which there are fewer
333             // than 20).
334             let modified = if locale.id.variants.is_empty() {
335                 self.canonicalize_absolute_language_fallbacks(&mut locale.id)
336             } else {
337                 self.canonicalize_language_variant_fallbacks(&mut locale.id)
338             };
339             if modified {
340                 result = TransformResult::Modified;
341                 continue;
342             }
343 
344             if !locale.id.language.is_default() {
345                 // If the region is specified, check sgn-region rules first
346                 if let Some(region) = locale.id.region {
347                     if locale.id.language == language!("sgn") {
348                         if let Some(&sgn_lang) = self
349                             .aliases
350                             .get()
351                             .sgn_region
352                             .get(&region.to_tinystr().to_unvalidated())
353                         {
354                             uts35_replacement::<core::iter::Empty<&str>>(
355                                 &mut locale.id,
356                                 true,
357                                 false,
358                                 true,
359                                 None,
360                                 &sgn_lang.into(),
361                             );
362                             result = TransformResult::Modified;
363                             continue;
364                         }
365                     }
366                 }
367 
368                 if uts35_check_language_rules(&mut locale.id, &self.aliases)
369                     == TransformResult::Modified
370                 {
371                     result = TransformResult::Modified;
372                     continue;
373                 }
374             }
375 
376             if let Some(script) = locale.id.script {
377                 if let Some(&replacement) = self
378                     .aliases
379                     .get()
380                     .script
381                     .get(&script.to_tinystr().to_unvalidated())
382                 {
383                     locale.id.script = Some(replacement);
384                     result = TransformResult::Modified;
385                     continue;
386                 }
387             }
388 
389             if let Some(region) = locale.id.region {
390                 let replacement = if region.is_alphabetic() {
391                     self.aliases
392                         .get()
393                         .region_alpha
394                         .get(&region.to_tinystr().resize().to_unvalidated())
395                 } else {
396                     self.aliases
397                         .get()
398                         .region_num
399                         .get(&region.to_tinystr().to_unvalidated())
400                 };
401                 if let Some(&replacement) = replacement {
402                     locale.id.region = Some(replacement);
403                     result = TransformResult::Modified;
404                     continue;
405                 }
406 
407                 if let Some(regions) = self
408                     .aliases
409                     .get()
410                     .complex_region
411                     .get(&region.to_tinystr().to_unvalidated())
412                 {
413                     // Skip if regions are empty
414                     if let Some(default_region) = regions.get(0) {
415                         let mut maximized = LanguageIdentifier {
416                             language: locale.id.language,
417                             script: locale.id.script,
418                             region: None,
419                             variants: Variants::default(),
420                         };
421 
422                         locale.id.region = Some(
423                             match (
424                                 self.expander.as_ref().maximize(&mut maximized),
425                                 maximized.region,
426                             ) {
427                                 (TransformResult::Modified, Some(candidate))
428                                     if regions.iter().any(|x| x == candidate) =>
429                                 {
430                                     candidate
431                                 }
432                                 _ => default_region,
433                             },
434                         );
435                         result = TransformResult::Modified;
436                         continue;
437                     }
438                 }
439             }
440 
441             if !locale.id.variants.is_empty() {
442                 let mut modified = Vec::with_capacity(0);
443                 for (idx, &variant) in locale.id.variants.iter().enumerate() {
444                     if let Some(&updated) = self
445                         .aliases
446                         .get()
447                         .variant
448                         .get(&variant.to_tinystr().to_unvalidated())
449                     {
450                         if modified.is_empty() {
451                             modified = locale.id.variants.to_vec();
452                         }
453                         #[allow(clippy::indexing_slicing)]
454                         let _ = core::mem::replace(&mut modified[idx], updated);
455                     }
456                 }
457 
458                 if !modified.is_empty() {
459                     modified.sort();
460                     modified.dedup();
461                     locale.id.variants = Variants::from_vec_unchecked(modified);
462                     result = TransformResult::Modified;
463                     continue;
464                 }
465             }
466 
467             // Nothing matched in this iteration, we're done.
468             break;
469         }
470 
471         if !locale.extensions.transform.is_empty() || !locale.extensions.unicode.is_empty() {
472             self.canonicalize_extensions(&mut locale.extensions, &mut result);
473         }
474         result
475     }
476 
canonicalize_extensions(&self, extensions: &mut Extensions, result: &mut TransformResult)477     fn canonicalize_extensions(&self, extensions: &mut Extensions, result: &mut TransformResult) {
478         // Handle Locale extensions in their own loops, because these rules do not interact
479         // with each other.
480         if let Some(ref mut lang) = extensions.transform.lang {
481             while uts35_check_language_rules(lang, &self.aliases) == TransformResult::Modified {
482                 *result = TransformResult::Modified;
483             }
484         }
485 
486         if !extensions.unicode.keywords.is_empty() {
487             for key in [key!("rg"), key!("sd")] {
488                 if let Some(value) = extensions.unicode.keywords.get_mut(&key) {
489                     if let Some(only_value) = value.as_single_subtag() {
490                         if let Some(modified_value) = self
491                             .aliases
492                             .get()
493                             .subdivision
494                             .get(&only_value.to_tinystr().resize().to_unvalidated())
495                         {
496                             if let Ok(modified_value) = modified_value.parse() {
497                                 *value = modified_value;
498                                 *result = TransformResult::Modified;
499                             }
500                         }
501                     }
502                 }
503             }
504         }
505     }
506 
canonicalize_language_variant_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool507     fn canonicalize_language_variant_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
508         // These language/variant comibnations have around 20 rules
509         for LanguageStrStrPair(lang, raw_variants, raw_to) in self
510             .aliases
511             .get()
512             .language_variants
513             .iter()
514             .map(zerofrom::ZeroFrom::zero_from)
515         {
516             let raw_variants = raw_variants.split('-');
517             // if is_iter_sorted(raw_variants.clone()) { // can we sort at construction?
518             if uts35_rule_matches(lid, lang, None, None, raw_variants.clone()) {
519                 if let Ok(to) = raw_to.parse() {
520                     uts35_replacement(
521                         lid,
522                         !lang.is_default(),
523                         false,
524                         false,
525                         Some(raw_variants),
526                         &to,
527                     );
528                     return true;
529                 }
530             }
531         }
532         false
533     }
534 
canonicalize_absolute_language_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool535     fn canonicalize_absolute_language_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
536         for StrStrPair(raw_from, raw_to) in self
537             .aliases
538             .get()
539             .language
540             .iter()
541             .map(zerofrom::ZeroFrom::zero_from)
542         {
543             if let Ok(from) = raw_from.parse::<LanguageIdentifier>() {
544                 if uts35_rule_matches(
545                     lid,
546                     from.language,
547                     from.script,
548                     from.region,
549                     from.variants.iter().map(Variant::as_str),
550                 ) {
551                     if let Ok(to) = raw_to.parse() {
552                         uts35_replacement(
553                             lid,
554                             !from.language.is_default(),
555                             from.script.is_some(),
556                             from.region.is_some(),
557                             Some(from.variants.iter().map(Variant::as_str)),
558                             &to,
559                         );
560                         return true;
561                     }
562                 }
563             }
564         }
565         false
566     }
567 }
568 
569 #[cfg(test)]
570 mod test {
571     use super::*;
572 
573     #[test]
test_uts35_rule_matches()574     fn test_uts35_rule_matches() {
575         for (source, rule, result) in [
576             ("ja", "und", true),
577             ("und-heploc-hepburn", "und-hepburn", true),
578             ("ja-heploc-hepburn", "und-hepburn", true),
579             ("ja-hepburn", "und-hepburn-heploc", false),
580         ] {
581             let source = source.parse().unwrap();
582             let rule = rule.parse::<LanguageIdentifier>().unwrap();
583             assert_eq!(
584                 uts35_rule_matches(
585                     &source,
586                     rule.language,
587                     rule.script,
588                     rule.region,
589                     rule.variants.iter().map(Variant::as_str),
590                 ),
591                 result,
592                 "{}",
593                 source
594             );
595         }
596     }
597 
598     #[test]
test_uts35_replacement()599     fn test_uts35_replacement() {
600         for (locale, rule_0, rule_1, result) in [
601             (
602                 "ja-Latn-fonipa-hepburn-heploc",
603                 "und-hepburn-heploc",
604                 "und-alalc97",
605                 "ja-Latn-alalc97-fonipa",
606             ),
607             ("sgn-DD", "und-DD", "und-DE", "sgn-DE"),
608             ("sgn-DE", "sgn-DE", "gsg", "gsg"),
609         ] {
610             let mut locale: Locale = locale.parse().unwrap();
611             let rule_0 = rule_0.parse::<LanguageIdentifier>().unwrap();
612             let rule_1 = rule_1.parse().unwrap();
613             let result = result.parse::<Locale>().unwrap();
614             uts35_replacement(
615                 &mut locale.id,
616                 !rule_0.language.is_default(),
617                 rule_0.script.is_some(),
618                 rule_0.region.is_some(),
619                 Some(rule_0.variants.iter().map(Variant::as_str)),
620                 &rule_1,
621             );
622             assert_eq!(result, locale);
623         }
624     }
625 }
626