1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5 //! The collection of code for locale canonicalization.
6
7 use crate::provider::*;
8 use alloc::vec::Vec;
9 use core::cmp::Ordering;
10
11 use crate::LocaleExpander;
12 use crate::TransformResult;
13 use icu_locale_core::extensions::Extensions;
14 use icu_locale_core::subtags::{Language, Region, Script};
15 use icu_locale_core::{
16 extensions::unicode::key,
17 subtags::{language, Variant, Variants},
18 LanguageIdentifier, Locale,
19 };
20 use icu_provider::prelude::*;
21 use tinystr::TinyAsciiStr;
22
23 /// Implements the algorithm defined in *[UTS #35: Annex C, LocaleId Canonicalization]*.
24 ///
25 /// # Examples
26 ///
27 /// ```
28 /// use icu::locale::Locale;
29 /// use icu::locale::{LocaleCanonicalizer, TransformResult};
30 ///
31 /// let lc = LocaleCanonicalizer::new_extended();
32 ///
33 /// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
34 /// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
35 /// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
36 /// ```
37 ///
38 /// [UTS #35: Annex C, LocaleId Canonicalization]: http://unicode.org/reports/tr35/#LocaleId_Canonicalization
39 #[derive(Debug)]
40 pub struct LocaleCanonicalizer<Expander = LocaleExpander> {
41 /// Data to support canonicalization.
42 aliases: DataPayload<LocaleAliasesV1>,
43 /// Likely subtags implementation for delegation.
44 expander: Expander,
45 }
46
uts35_rule_matches<'a, I>( source: &LanguageIdentifier, language: Language, script: Option<Script>, region: Option<Region>, raw_variants: I, ) -> bool where I: Iterator<Item = &'a str>,47 fn uts35_rule_matches<'a, I>(
48 source: &LanguageIdentifier,
49 language: Language,
50 script: Option<Script>,
51 region: Option<Region>,
52 raw_variants: I,
53 ) -> bool
54 where
55 I: Iterator<Item = &'a str>,
56 {
57 (language.is_default() || language == source.language)
58 && (script.is_none() || script == source.script)
59 && (region.is_none() || region == source.region)
60 && {
61 // Checks if variants are a subset of source variants.
62 // As both iterators are sorted, this can be done linearly.
63 let mut source_variants = source.variants.iter();
64 'outer: for raw_variant in raw_variants {
65 for source_variant in source_variants.by_ref() {
66 match source_variant.as_str().cmp(raw_variant) {
67 Ordering::Equal => {
68 // The source_variant is equal, move to next raw_variant
69 continue 'outer;
70 }
71 Ordering::Less => {
72 // The source_variant is smaller, take the next source_variant
73 }
74 Ordering::Greater => {
75 // The source_variant is greater,
76 // raw_variants is not a subset of source_variants
77 return false;
78 }
79 }
80 }
81 // There are raw_variants left after we exhausted source_variants
82 return false;
83 }
84 true
85 }
86 }
87
uts35_replacement<'a, I>( source: &mut LanguageIdentifier, ruletype_has_language: bool, ruletype_has_script: bool, ruletype_has_region: bool, ruletype_variants: Option<I>, replacement: &LanguageIdentifier, ) where I: Iterator<Item = &'a str>,88 fn uts35_replacement<'a, I>(
89 source: &mut LanguageIdentifier,
90 ruletype_has_language: bool,
91 ruletype_has_script: bool,
92 ruletype_has_region: bool,
93 ruletype_variants: Option<I>,
94 replacement: &LanguageIdentifier,
95 ) where
96 I: Iterator<Item = &'a str>,
97 {
98 if ruletype_has_language || (source.language.is_default() && !replacement.language.is_default())
99 {
100 source.language = replacement.language;
101 }
102 if ruletype_has_script || (source.script.is_none() && replacement.script.is_some()) {
103 source.script = replacement.script;
104 }
105 if ruletype_has_region || (source.region.is_none() && replacement.region.is_some()) {
106 source.region = replacement.region;
107 }
108 if let Some(skips) = ruletype_variants {
109 // The rule matches if the ruletype variants are a subset of the source variants.
110 // This means ja-Latn-fonipa-hepburn-heploc matches against the rule for
111 // hepburn-heploc and is canonicalized to ja-Latn-alalc97-fonipa
112
113 // We're merging three sorted deduped iterators into a new sequence:
114 // sources - skips + replacements
115
116 let mut sources = source.variants.iter().peekable();
117 let mut replacements = replacement.variants.iter().peekable();
118 let mut skips = skips.peekable();
119
120 let mut variants: Vec<Variant> = Vec::new();
121
122 loop {
123 match (sources.peek(), skips.peek(), replacements.peek()) {
124 (Some(&source), Some(skip), _)
125 if source.as_str().cmp(skip) == Ordering::Greater =>
126 {
127 skips.next();
128 }
129 (Some(&source), Some(skip), _) if source.as_str().cmp(skip) == Ordering::Equal => {
130 skips.next();
131 sources.next();
132 }
133 (Some(&source), _, Some(&replacement))
134 if replacement.cmp(source) == Ordering::Less =>
135 {
136 variants.push(*replacement);
137 replacements.next();
138 }
139 (Some(&source), _, Some(&replacement))
140 if replacement.cmp(source) == Ordering::Equal =>
141 {
142 variants.push(*source);
143 sources.next();
144 replacements.next();
145 }
146 (Some(&source), _, _) => {
147 variants.push(*source);
148 sources.next();
149 }
150 (None, _, Some(&replacement)) => {
151 variants.push(*replacement);
152 replacements.next();
153 }
154 (None, _, None) => {
155 break;
156 }
157 }
158 }
159 source.variants = Variants::from_vec_unchecked(variants);
160 }
161 }
162
163 #[inline]
uts35_check_language_rules( langid: &mut LanguageIdentifier, alias_data: &DataPayload<LocaleAliasesV1>, ) -> TransformResult164 fn uts35_check_language_rules(
165 langid: &mut LanguageIdentifier,
166 alias_data: &DataPayload<LocaleAliasesV1>,
167 ) -> TransformResult {
168 if !langid.language.is_default() {
169 let lang: TinyAsciiStr<3> = langid.language.into();
170 let replacement = if lang.len() == 2 {
171 alias_data
172 .get()
173 .language_len2
174 .get(&lang.resize().to_unvalidated())
175 } else {
176 alias_data.get().language_len3.get(&lang.to_unvalidated())
177 };
178
179 if let Some(replacement) = replacement {
180 if let Ok(new_langid) = replacement.parse() {
181 uts35_replacement::<core::iter::Empty<&str>>(
182 langid,
183 true,
184 false,
185 false,
186 None,
187 &new_langid,
188 );
189 return TransformResult::Modified;
190 }
191 }
192 }
193
194 TransformResult::Unmodified
195 }
196
197 impl LocaleCanonicalizer<LocaleExpander> {
198 /// A constructor which creates a [`LocaleCanonicalizer`] from compiled data,
199 /// using a [`LocaleExpander`] for common locales.
200 ///
201 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
202 ///
203 /// [ Help choosing a constructor](icu_provider::constructors)
204 #[cfg(feature = "compiled_data")]
new_common() -> Self205 pub const fn new_common() -> Self {
206 Self::new_with_expander(LocaleExpander::new_common())
207 }
208
209 icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
210 functions: [
211 new_common: skip,
212 try_new_common_with_buffer_provider,
213 try_new_common_unstable,
214 Self,
215 ]
216 );
217
218 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_common)]
try_new_common_unstable<P>(provider: &P) -> Result<Self, DataError> where P: DataProvider<LocaleAliasesV1> + DataProvider<LocaleLikelySubtagsLanguageV1> + DataProvider<LocaleLikelySubtagsScriptRegionV1> + ?Sized,219 pub fn try_new_common_unstable<P>(provider: &P) -> Result<Self, DataError>
220 where
221 P: DataProvider<LocaleAliasesV1>
222 + DataProvider<LocaleLikelySubtagsLanguageV1>
223 + DataProvider<LocaleLikelySubtagsScriptRegionV1>
224 + ?Sized,
225 {
226 let expander = LocaleExpander::try_new_common_unstable(provider)?;
227 Self::try_new_with_expander_unstable(provider, expander)
228 }
229
230 /// A constructor which creates a [`LocaleCanonicalizer`] from compiled data,
231 /// using a [`LocaleExpander`] for all locales.
232 ///
233 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
234 ///
235 /// [ Help choosing a constructor](icu_provider::constructors)
236 #[cfg(feature = "compiled_data")]
new_extended() -> Self237 pub const fn new_extended() -> Self {
238 Self::new_with_expander(LocaleExpander::new_extended())
239 }
240
241 icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
242 functions: [
243 new_extended: skip,
244 try_new_extended_with_buffer_provider,
245 try_new_extended_unstable,
246 Self,
247 ]
248 );
249
250 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_extended)]
try_new_extended_unstable<P>(provider: &P) -> Result<Self, DataError> where P: DataProvider<LocaleAliasesV1> + DataProvider<LocaleLikelySubtagsLanguageV1> + DataProvider<LocaleLikelySubtagsScriptRegionV1> + DataProvider<LocaleLikelySubtagsExtendedV1> + ?Sized,251 pub fn try_new_extended_unstable<P>(provider: &P) -> Result<Self, DataError>
252 where
253 P: DataProvider<LocaleAliasesV1>
254 + DataProvider<LocaleLikelySubtagsLanguageV1>
255 + DataProvider<LocaleLikelySubtagsScriptRegionV1>
256 + DataProvider<LocaleLikelySubtagsExtendedV1>
257 + ?Sized,
258 {
259 let expander = LocaleExpander::try_new_extended_unstable(provider)?;
260 Self::try_new_with_expander_unstable(provider, expander)
261 }
262 }
263
264 impl<Expander: AsRef<LocaleExpander>> LocaleCanonicalizer<Expander> {
265 /// Creates a [`LocaleCanonicalizer`] with a custom [`LocaleExpander`] and compiled data.
266 ///
267 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
268 ///
269 /// [ Help choosing a constructor](icu_provider::constructors)
270 #[cfg(feature = "compiled_data")]
new_with_expander(expander: Expander) -> Self271 pub const fn new_with_expander(expander: Expander) -> Self {
272 Self {
273 aliases: DataPayload::from_static_ref(
274 crate::provider::Baked::SINGLETON_LOCALE_ALIASES_V1,
275 ),
276 expander,
277 }
278 }
279
280 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_with_expander)]
try_new_with_expander_unstable<P>( provider: &P, expander: Expander, ) -> Result<Self, DataError> where P: DataProvider<LocaleAliasesV1> + ?Sized,281 pub fn try_new_with_expander_unstable<P>(
282 provider: &P,
283 expander: Expander,
284 ) -> Result<Self, DataError>
285 where
286 P: DataProvider<LocaleAliasesV1> + ?Sized,
287 {
288 let aliases: DataPayload<LocaleAliasesV1> = provider.load(Default::default())?.payload;
289
290 Ok(Self { aliases, expander })
291 }
292
293 icu_provider::gen_buffer_data_constructors!((options: Expander) -> error: DataError,
294 functions: [
295 new_with_expander: skip,
296 try_new_with_expander_with_buffer_provider,
297 try_new_with_expander_unstable,
298 Self,
299 ]
300 );
301
302 /// The canonicalize method potentially updates a passed in locale in place
303 /// depending up the results of running the canonicalization algorithm
304 /// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>.
305 ///
306 /// Some BCP47 canonicalization data is not part of the CLDR json package. Because
307 /// of this, some canonicalizations are not performed, e.g. the canonicalization of
308 /// `und-u-ca-islamicc` to `und-u-ca-islamic-civil`. This will be fixed in a future
309 /// release once the missing data has been added to the CLDR json data. See:
310 /// <https://github.com/unicode-org/icu4x/issues/746>
311 ///
312 /// # Examples
313 ///
314 /// ```
315 /// use icu::locale::{Locale, LocaleCanonicalizer, TransformResult};
316 ///
317 /// let lc = LocaleCanonicalizer::new_extended();
318 ///
319 /// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
320 /// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
321 /// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
322 /// ```
canonicalize(&self, locale: &mut Locale) -> TransformResult323 pub fn canonicalize(&self, locale: &mut Locale) -> TransformResult {
324 let mut result = TransformResult::Unmodified;
325
326 // This loops until we get a 'fixed point', where applying the rules do not
327 // result in any more changes.
328 loop {
329 // These are linear searches due to the ordering imposed by the canonicalization
330 // rules, where rules with more variants should be considered first. With the
331 // current data in CLDR, we will only do this for locales which have variants,
332 // or new rules which we haven't special-cased yet (of which there are fewer
333 // than 20).
334 let modified = if locale.id.variants.is_empty() {
335 self.canonicalize_absolute_language_fallbacks(&mut locale.id)
336 } else {
337 self.canonicalize_language_variant_fallbacks(&mut locale.id)
338 };
339 if modified {
340 result = TransformResult::Modified;
341 continue;
342 }
343
344 if !locale.id.language.is_default() {
345 // If the region is specified, check sgn-region rules first
346 if let Some(region) = locale.id.region {
347 if locale.id.language == language!("sgn") {
348 if let Some(&sgn_lang) = self
349 .aliases
350 .get()
351 .sgn_region
352 .get(®ion.to_tinystr().to_unvalidated())
353 {
354 uts35_replacement::<core::iter::Empty<&str>>(
355 &mut locale.id,
356 true,
357 false,
358 true,
359 None,
360 &sgn_lang.into(),
361 );
362 result = TransformResult::Modified;
363 continue;
364 }
365 }
366 }
367
368 if uts35_check_language_rules(&mut locale.id, &self.aliases)
369 == TransformResult::Modified
370 {
371 result = TransformResult::Modified;
372 continue;
373 }
374 }
375
376 if let Some(script) = locale.id.script {
377 if let Some(&replacement) = self
378 .aliases
379 .get()
380 .script
381 .get(&script.to_tinystr().to_unvalidated())
382 {
383 locale.id.script = Some(replacement);
384 result = TransformResult::Modified;
385 continue;
386 }
387 }
388
389 if let Some(region) = locale.id.region {
390 let replacement = if region.is_alphabetic() {
391 self.aliases
392 .get()
393 .region_alpha
394 .get(®ion.to_tinystr().resize().to_unvalidated())
395 } else {
396 self.aliases
397 .get()
398 .region_num
399 .get(®ion.to_tinystr().to_unvalidated())
400 };
401 if let Some(&replacement) = replacement {
402 locale.id.region = Some(replacement);
403 result = TransformResult::Modified;
404 continue;
405 }
406
407 if let Some(regions) = self
408 .aliases
409 .get()
410 .complex_region
411 .get(®ion.to_tinystr().to_unvalidated())
412 {
413 // Skip if regions are empty
414 if let Some(default_region) = regions.get(0) {
415 let mut maximized = LanguageIdentifier {
416 language: locale.id.language,
417 script: locale.id.script,
418 region: None,
419 variants: Variants::default(),
420 };
421
422 locale.id.region = Some(
423 match (
424 self.expander.as_ref().maximize(&mut maximized),
425 maximized.region,
426 ) {
427 (TransformResult::Modified, Some(candidate))
428 if regions.iter().any(|x| x == candidate) =>
429 {
430 candidate
431 }
432 _ => default_region,
433 },
434 );
435 result = TransformResult::Modified;
436 continue;
437 }
438 }
439 }
440
441 if !locale.id.variants.is_empty() {
442 let mut modified = Vec::with_capacity(0);
443 for (idx, &variant) in locale.id.variants.iter().enumerate() {
444 if let Some(&updated) = self
445 .aliases
446 .get()
447 .variant
448 .get(&variant.to_tinystr().to_unvalidated())
449 {
450 if modified.is_empty() {
451 modified = locale.id.variants.to_vec();
452 }
453 #[allow(clippy::indexing_slicing)]
454 let _ = core::mem::replace(&mut modified[idx], updated);
455 }
456 }
457
458 if !modified.is_empty() {
459 modified.sort();
460 modified.dedup();
461 locale.id.variants = Variants::from_vec_unchecked(modified);
462 result = TransformResult::Modified;
463 continue;
464 }
465 }
466
467 // Nothing matched in this iteration, we're done.
468 break;
469 }
470
471 if !locale.extensions.transform.is_empty() || !locale.extensions.unicode.is_empty() {
472 self.canonicalize_extensions(&mut locale.extensions, &mut result);
473 }
474 result
475 }
476
canonicalize_extensions(&self, extensions: &mut Extensions, result: &mut TransformResult)477 fn canonicalize_extensions(&self, extensions: &mut Extensions, result: &mut TransformResult) {
478 // Handle Locale extensions in their own loops, because these rules do not interact
479 // with each other.
480 if let Some(ref mut lang) = extensions.transform.lang {
481 while uts35_check_language_rules(lang, &self.aliases) == TransformResult::Modified {
482 *result = TransformResult::Modified;
483 }
484 }
485
486 if !extensions.unicode.keywords.is_empty() {
487 for key in [key!("rg"), key!("sd")] {
488 if let Some(value) = extensions.unicode.keywords.get_mut(&key) {
489 if let Some(only_value) = value.as_single_subtag() {
490 if let Some(modified_value) = self
491 .aliases
492 .get()
493 .subdivision
494 .get(&only_value.to_tinystr().resize().to_unvalidated())
495 {
496 if let Ok(modified_value) = modified_value.parse() {
497 *value = modified_value;
498 *result = TransformResult::Modified;
499 }
500 }
501 }
502 }
503 }
504 }
505 }
506
canonicalize_language_variant_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool507 fn canonicalize_language_variant_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
508 // These language/variant comibnations have around 20 rules
509 for LanguageStrStrPair(lang, raw_variants, raw_to) in self
510 .aliases
511 .get()
512 .language_variants
513 .iter()
514 .map(zerofrom::ZeroFrom::zero_from)
515 {
516 let raw_variants = raw_variants.split('-');
517 // if is_iter_sorted(raw_variants.clone()) { // can we sort at construction?
518 if uts35_rule_matches(lid, lang, None, None, raw_variants.clone()) {
519 if let Ok(to) = raw_to.parse() {
520 uts35_replacement(
521 lid,
522 !lang.is_default(),
523 false,
524 false,
525 Some(raw_variants),
526 &to,
527 );
528 return true;
529 }
530 }
531 }
532 false
533 }
534
canonicalize_absolute_language_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool535 fn canonicalize_absolute_language_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
536 for StrStrPair(raw_from, raw_to) in self
537 .aliases
538 .get()
539 .language
540 .iter()
541 .map(zerofrom::ZeroFrom::zero_from)
542 {
543 if let Ok(from) = raw_from.parse::<LanguageIdentifier>() {
544 if uts35_rule_matches(
545 lid,
546 from.language,
547 from.script,
548 from.region,
549 from.variants.iter().map(Variant::as_str),
550 ) {
551 if let Ok(to) = raw_to.parse() {
552 uts35_replacement(
553 lid,
554 !from.language.is_default(),
555 from.script.is_some(),
556 from.region.is_some(),
557 Some(from.variants.iter().map(Variant::as_str)),
558 &to,
559 );
560 return true;
561 }
562 }
563 }
564 }
565 false
566 }
567 }
568
#[cfg(test)]
mod test {
    use super::*;

    /// Checks rule matching, including variant-subset handling.
    #[test]
    fn test_uts35_rule_matches() {
        // (source locale, rule pattern, whether the rule is expected to match)
        let cases = [
            ("ja", "und", true),
            ("und-heploc-hepburn", "und-hepburn", true),
            ("ja-heploc-hepburn", "und-hepburn", true),
            ("ja-hepburn", "und-hepburn-heploc", false),
        ];

        for (source, rule, result) in cases {
            let source: LanguageIdentifier = source.parse().unwrap();
            let rule: LanguageIdentifier = rule.parse().unwrap();

            let matched = uts35_rule_matches(
                &source,
                rule.language,
                rule.script,
                rule.region,
                rule.variants.iter().map(Variant::as_str),
            );

            assert_eq!(matched, result, "{}", source);
        }
    }

    /// Checks that applying a rule's replacement yields the expected locale.
    #[test]
    fn test_uts35_replacement() {
        // (input locale, rule source, rule replacement, expected locale)
        let cases = [
            (
                "ja-Latn-fonipa-hepburn-heploc",
                "und-hepburn-heploc",
                "und-alalc97",
                "ja-Latn-alalc97-fonipa",
            ),
            ("sgn-DD", "und-DD", "und-DE", "sgn-DE"),
            ("sgn-DE", "sgn-DE", "gsg", "gsg"),
        ];

        for (locale, rule_0, rule_1, result) in cases {
            let mut actual: Locale = locale.parse().unwrap();
            let from: LanguageIdentifier = rule_0.parse().unwrap();
            let to: LanguageIdentifier = rule_1.parse().unwrap();
            let expected: Locale = result.parse().unwrap();

            uts35_replacement(
                &mut actual.id,
                !from.language.is_default(),
                from.script.is_some(),
                from.region.is_some(),
                Some(from.variants.iter().map(Variant::as_str)),
                &to,
            );

            assert_eq!(expected, actual);
        }
    }
}
626