• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 use super::LocaleFallbackPriority;
6 use icu_locale_core::subtags::{Language, Region, Script};
7 
8 use super::*;
9 
10 impl LocaleFallbackerWithConfig<'_> {
normalize(&self, locale: &mut DataLocale, default_script: &mut Option<Script>)11     pub(crate) fn normalize(&self, locale: &mut DataLocale, default_script: &mut Option<Script>) {
12         // 0. If there is an invalid "sd" subtag, drop it
13         if let Some(subdivision) = locale.subdivision.take() {
14             if let Some(region) = locale.region {
15                 if subdivision
16                     .as_str()
17                     .starts_with(region.to_tinystr().to_ascii_lowercase().as_str())
18                 {
19                     locale.subdivision = Some(subdivision);
20                 }
21             }
22         }
23         let language = locale.language;
24         // 1. Populate the region (required for region fallback only)
25         if self.config.priority == LocaleFallbackPriority::Region && locale.region.is_none() {
26             // 1a. First look for region based on language+script
27             if let Some(script) = locale.script {
28                 locale.region = self
29                     .likely_subtags
30                     .language_script
31                     .get(&(
32                         language.to_tinystr().to_unvalidated(),
33                         script.to_tinystr().to_unvalidated(),
34                     ))
35                     .copied();
36             }
37             // 1b. If that fails, try language only
38             if locale.region.is_none() {
39                 locale.region = self
40                     .likely_subtags
41                     .language
42                     .get_copied(&language.to_tinystr().to_unvalidated())
43                     .map(|(_s, r)| r);
44             }
45         }
46         // 2. Remove the script if it is implied by the other subtags
47         if locale.script.is_some() || self.config.priority == LocaleFallbackPriority::Script {
48             *default_script = locale
49                 .region
50                 .and_then(|region| {
51                     self.likely_subtags.language_region.get_copied(&(
52                         language.to_tinystr().to_unvalidated(),
53                         region.to_tinystr().to_unvalidated(),
54                     ))
55                 })
56                 .or_else(|| {
57                     self.likely_subtags
58                         .language
59                         .get_copied(&language.to_tinystr().to_unvalidated())
60                         .map(|(s, _r)| s)
61                 });
62             if locale.script == *default_script {
63                 locale.script = None;
64             }
65         }
66     }
67 }
68 
69 impl LocaleFallbackIteratorInner<'_> {
step(&mut self, locale: &mut DataLocale)70     pub fn step(&mut self, locale: &mut DataLocale) {
71         match self.config.priority {
72             LocaleFallbackPriority::Language => self.step_language(locale),
73             LocaleFallbackPriority::Script => self.step_script(locale),
74             LocaleFallbackPriority::Region => self.step_region(locale),
75             // This case should not normally happen, but `LocaleFallbackPriority` is non_exhaustive.
76             // Make it go directly to `und`.
77             _ => {
78                 debug_assert!(
79                     false,
80                     "Unknown LocaleFallbackPriority: {:?}",
81                     self.config.priority
82                 );
83                 *locale = Default::default()
84             }
85         }
86     }
87 
step_language(&mut self, locale: &mut DataLocale)88     fn step_language(&mut self, locale: &mut DataLocale) {
89         // 2. Remove the subdivision keyword
90         if let Some(value) = locale.subdivision.take() {
91             self.backup_subdivision = Some(value);
92             return;
93         }
94         // 4. Remove variants
95         if let Some(single_variant) = locale.variant.take() {
96             self.backup_variant = Some(single_variant);
97             return;
98         }
99         // 5. Check for parent override
100         if let Some((language, script, region)) = self.get_explicit_parent(locale) {
101             locale.language = language;
102             locale.script = script;
103             locale.region = region;
104             locale.variant = self.backup_variant.take();
105             return;
106         }
107         // 7. Remove region
108         if let Some(region) = locale.region {
109             // 6. Add the script subtag if necessary
110             if locale.script.is_none() {
111                 let language = locale.language;
112                 if let Some(script) = self.likely_subtags.language_region.get_copied(&(
113                     language.to_tinystr().to_unvalidated(),
114                     region.to_tinystr().to_unvalidated(),
115                 )) {
116                     locale.script = Some(script);
117                 }
118             }
119             locale.region = None;
120             locale.variant = self.backup_variant.take();
121             return;
122         }
123         // 8. Remove language+script
124         debug_assert!(!locale.language.is_default() || locale.script.is_some()); // don't call .step() on und
125         locale.script = None;
126         locale.language = Language::UND;
127     }
128 
step_region(&mut self, locale: &mut DataLocale)129     fn step_region(&mut self, locale: &mut DataLocale) {
130         // TODO(#4413): -u-rg is not yet supported
131         // 2. Remove the subdivision keyword
132         if let Some(value) = locale.subdivision.take() {
133             self.backup_subdivision = Some(value);
134             return;
135         }
136         // 4. Remove variants
137         if let Some(variant) = locale.variant.take() {
138             self.backup_variant = Some(variant);
139             return;
140         }
141         // 5. Remove language+script
142         if !locale.language.is_default() || locale.script.is_some() {
143             locale.script = None;
144             locale.language = Language::UND;
145             // Don't produce und-variant
146             if locale.region.is_some() {
147                 locale.variant = self.backup_variant.take();
148                 locale.subdivision = self.backup_subdivision.take();
149             }
150             return;
151         }
152         // 6. Remove region
153         debug_assert!(locale.region.is_some()); // don't call .step() on und
154         locale.region = None;
155     }
156 
step_script(&mut self, locale: &mut DataLocale)157     fn step_script(&mut self, locale: &mut DataLocale) {
158         // Remove the subdivision keyword
159         if let Some(value) = locale.subdivision.take() {
160             self.backup_subdivision = Some(value);
161             return;
162         }
163         // Remove variants
164         if let Some(variant) = locale.variant.take() {
165             self.backup_variant = Some(variant);
166             return;
167         }
168         // Check for parent override
169         if let Some((language, script, region)) = self.get_explicit_parent(locale) {
170             locale.language = language;
171             locale.script = script;
172             locale.region = region;
173             locale.variant = self.backup_variant.take();
174             return;
175         }
176         // Remove the region
177         if let Some(region) = locale.region {
178             self.backup_region = Some(region);
179             let language_implied_script = self
180                 .likely_subtags
181                 .language
182                 .get_copied(&locale.language.to_tinystr().to_unvalidated())
183                 .map(|(s, _r)| s);
184             if language_implied_script != self.max_script {
185                 locale.script = self.max_script;
186             }
187             locale.region = None;
188             locale.variant = self.backup_variant.take();
189             return;
190         }
191 
192         // Remove the script if we have a language
193         if !locale.language.is_default() {
194             let language_implied_script = self
195                 .likely_subtags
196                 .language
197                 .get_copied(&locale.language.to_tinystr().to_unvalidated())
198                 .map(|(s, _r)| s);
199             if locale.script.is_some() && language_implied_script == locale.script {
200                 locale.script = None;
201                 if let Some(region) = self.backup_region.take() {
202                     locale.region = Some(region);
203                     locale.subdivision = self.backup_subdivision.take();
204                     locale.variant = self.backup_variant.take();
205                 }
206                 // needed if more fallback is added at the end
207                 #[allow(clippy::needless_return)]
208                 return;
209             } else {
210                 // 3. Remove the language and apply the maximized script
211                 locale.language = Language::UND;
212                 locale.script = self.max_script;
213                 // Don't produce und-variant
214                 if locale.script.is_some() {
215                     locale.variant = self.backup_variant.take();
216                 }
217                 // needed if more fallback is added at the end
218                 #[allow(clippy::needless_return)]
219                 return;
220             }
221         }
222 
223         // note: UTS #35 wants us to apply "other associated scripts" now. ICU4C/J does not do this,
224         // so we don't either. They would be found here if they are ever needed:
225         // https://github.com/unicode-cldr/cldr-core/blob/master/supplemental/languageData.json
226 
227         // 6. Remove script
228         if locale.script.is_some() {
229             locale.script = None;
230         }
231     }
232 
get_explicit_parent( &self, locale: &DataLocale, ) -> Option<(Language, Option<Script>, Option<Region>)>233     fn get_explicit_parent(
234         &self,
235         locale: &DataLocale,
236     ) -> Option<(Language, Option<Script>, Option<Region>)> {
237         self.parents
238             .parents
239             .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse())
240     }
241 }
242 
#[cfg(test)]
mod tests {
    use super::*;
    use writeable::Writeable;

    /// One fallback scenario: an input locale and the chain expected under each priority.
    struct TestCase {
        /// Locale string that is parsed and handed to the fallbacker.
        input: &'static str,
        /// Whether the case is run against the fallbacker built with data (`true`) or the one
        /// built without data (`false`).
        requires_data: bool,
        // Note: The first entry in the chain is the normalized locale
        expected_language_chain: &'static [&'static str],
        expected_script_chain: &'static [&'static str],
        expected_region_chain: &'static [&'static str],
    }

    // TODO: Consider loading these from a JSON file
    const TEST_CASES: &[TestCase] = &[
        TestCase {
            input: "en-fonipa",
            requires_data: false,
            expected_language_chain: &["en-fonipa", "en"],
            expected_script_chain: &["en-fonipa", "en"],
            expected_region_chain: &["en-fonipa", "en"],
        },
        TestCase {
            input: "en-US-u-sd-usca",
            requires_data: false,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en"],
            expected_script_chain: &["en-US-u-sd-usca", "en-US", "en"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "en-US-fonipa-u-sd-usca",
            requires_data: false,
            expected_language_chain: &[
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "en-fonipa",
                "en",
            ],
            expected_script_chain: &[
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "en-fonipa",
                "en",
            ],
            expected_region_chain: &[
                "en-US-fonipa-u-sd-usca",
                "en-US-fonipa",
                "en-US",
                "und-US-fonipa-u-sd-usca",
                "und-US-fonipa",
                "und-US",
            ],
        },
        TestCase {
            input: "en-fonipa",
            requires_data: true,
            expected_language_chain: &["en-fonipa", "en"],
            expected_script_chain: &["en-fonipa", "en", "und-Latn-fonipa", "und-Latn"],
            expected_region_chain: &["en-US-fonipa", "en-US", "und-US-fonipa", "und-US"],
        },
        TestCase {
            input: "en-Latn-fonipa",
            requires_data: true,
            expected_language_chain: &["en-fonipa", "en"],
            expected_script_chain: &["en-fonipa", "en", "und-Latn-fonipa", "und-Latn"],
            expected_region_chain: &["en-US-fonipa", "en-US", "und-US-fonipa", "und-US"],
        },
        TestCase {
            input: "en-Latn-US-u-sd-usca",
            requires_data: true,
            expected_language_chain: &["en-US-u-sd-usca", "en-US", "en"],
            expected_script_chain: &["en-US-u-sd-usca", "en-US", "en", "und-Latn"],
            expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"],
        },
        TestCase {
            input: "sr-ME",
            requires_data: true,
            expected_language_chain: &["sr-ME", "sr-Latn"],
            expected_script_chain: &["sr-ME", "sr-Latn", "und-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-Latn-ME",
            requires_data: true,
            expected_language_chain: &["sr-ME", "sr-Latn"],
            expected_script_chain: &["sr-ME", "sr-Latn", "und-Latn"],
            expected_region_chain: &["sr-ME", "und-ME"],
        },
        TestCase {
            input: "sr-ME-fonipa",
            requires_data: true,
            expected_language_chain: &["sr-ME-fonipa", "sr-ME", "sr-Latn-fonipa", "sr-Latn"],
            expected_script_chain: &[
                "sr-ME-fonipa",
                "sr-ME",
                "sr-Latn-fonipa",
                "sr-Latn",
                "und-Latn-fonipa",
                "und-Latn",
            ],
            expected_region_chain: &["sr-ME-fonipa", "sr-ME", "und-ME-fonipa", "und-ME"],
        },
        TestCase {
            input: "sr-RS",
            requires_data: true,
            expected_language_chain: &["sr-RS", "sr"],
            expected_script_chain: &["sr-RS", "sr", "und-Cyrl"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Cyrl-RS",
            requires_data: true,
            expected_language_chain: &["sr-RS", "sr"],
            expected_script_chain: &["sr-RS", "sr", "und-Cyrl"],
            expected_region_chain: &["sr-RS", "und-RS"],
        },
        TestCase {
            input: "sr-Latn-RS",
            requires_data: true,
            expected_language_chain: &["sr-Latn-RS", "sr-Latn"],
            expected_script_chain: &["sr-Latn-RS", "sr-Latn", "und-Latn"],
            expected_region_chain: &["sr-Latn-RS", "und-RS"],
        },
        TestCase {
            input: "de-Latn-LI",
            requires_data: true,
            expected_language_chain: &["de-LI", "de"],
            expected_script_chain: &["de-LI", "de", "und-Latn"],
            expected_region_chain: &["de-LI", "und-LI"],
        },
        TestCase {
            input: "ca-ES-valencia",
            requires_data: true,
            expected_language_chain: &["ca-ES-valencia", "ca-ES", "ca-valencia", "ca"],
            expected_script_chain: &[
                "ca-ES-valencia",
                "ca-ES",
                "ca-valencia",
                "ca",
                "und-Latn-valencia",
                "und-Latn",
            ],
            expected_region_chain: &["ca-ES-valencia", "ca-ES", "und-ES-valencia", "und-ES"],
        },
        TestCase {
            input: "es-AR",
            requires_data: true,
            expected_language_chain: &["es-AR", "es-419", "es"],
            expected_script_chain: &["es-AR", "es-419", "es", "und-Latn"],
            expected_region_chain: &["es-AR", "und-AR"],
        },
        TestCase {
            input: "hi-IN",
            requires_data: true,
            expected_language_chain: &["hi-IN", "hi"],
            expected_script_chain: &["hi-IN", "hi", "und-Deva"],
            expected_region_chain: &["hi-IN", "und-IN"],
        },
        TestCase {
            input: "hi-Latn-IN",
            requires_data: true,
            expected_language_chain: &["hi-Latn-IN", "hi-Latn", "en-IN", "en-001", "en"],
            expected_script_chain: &["hi-Latn-IN", "hi-Latn", "en-IN", "en-001", "en", "und-Latn"],
            expected_region_chain: &["hi-Latn-IN", "und-IN"],
        },
        TestCase {
            input: "zh-CN",
            requires_data: true,
            // Note: "zh-Hans" is not reachable because it is the default script for "zh".
            // The fallback algorithm does not visit the language-script bundle when the
            // script is the default for the language
            expected_language_chain: &["zh-CN", "zh"],
            expected_script_chain: &["zh-CN", "zh", "und-Hans", "und-Hani"],
            expected_region_chain: &["zh-CN", "und-CN"],
        },
        TestCase {
            input: "zh-TW",
            requires_data: true,
            expected_language_chain: &["zh-TW", "zh-Hant"],
            expected_script_chain: &["zh-TW", "zh-Hant", "und-Hant", "und-Hani"],
            expected_region_chain: &["zh-TW", "und-TW"],
        },
        TestCase {
            input: "yue-HK",
            requires_data: true,
            expected_language_chain: &["yue-HK", "yue"],
            expected_script_chain: &["yue-HK", "yue", "und-Hant", "und-Hani"],
            expected_region_chain: &["yue-HK", "und-HK"],
        },
        TestCase {
            input: "yue-CN",
            requires_data: true,
            expected_language_chain: &["yue-CN", "yue-Hans"],
            expected_script_chain: &["yue-CN", "yue-Hans", "und-Hans", "und-Hani"],
            expected_region_chain: &["yue-CN", "und-CN"],
        },
        TestCase {
            input: "az-Arab-IR",
            requires_data: true,
            expected_language_chain: &["az-IR", "az-Arab"],
            expected_script_chain: &["az-IR", "az-Arab", "und-Arab"],
            expected_region_chain: &["az-IR", "und-IR"],
        },
        TestCase {
            input: "az-IR",
            requires_data: true,
            expected_language_chain: &["az-IR", "az-Arab"],
            expected_script_chain: &["az-IR", "az-Arab", "und-Arab"],
            expected_region_chain: &["az-IR", "und-IR"],
        },
        TestCase {
            input: "az-Arab",
            requires_data: true,
            expected_language_chain: &["az-Arab"],
            expected_script_chain: &["az-Arab", "und-Arab"],
            expected_region_chain: &["az-IR", "und-IR"],
        },
    ];

    /// Walks the fallback chain of every test case under each of the three priorities and
    /// compares it with the expected chain.
    #[test]
    fn test_fallback() {
        // Upper bound on steps per chain, so a chain that never reaches the default locale
        // cannot hang the test. Every expected chain is much shorter than this, so hitting the
        // bound also makes the final assertion fail.
        const MAX_STEPS: usize = 20;

        let fallbacker_no_data = LocaleFallbacker::new_without_data();
        let fallbacker_no_data = fallbacker_no_data.as_borrowed();
        let fallbacker_with_data = LocaleFallbacker::new();
        for cas in TEST_CASES {
            let fallbacker = if cas.requires_data {
                fallbacker_with_data
            } else {
                fallbacker_no_data
            };
            let expectations = [
                (
                    LocaleFallbackPriority::Language,
                    cas.expected_language_chain,
                ),
                (LocaleFallbackPriority::Script, cas.expected_script_chain),
                (LocaleFallbackPriority::Region, cas.expected_region_chain),
            ];
            for (priority, expected_chain) in expectations {
                let mut config = LocaleFallbackConfig::default();
                config.priority = priority;
                let mut it = fallbacker
                    .for_config(config)
                    .fallback_for(cas.input.parse().unwrap());
                let mut actual_chain = Vec::new();
                for i in 0..MAX_STEPS {
                    if i + 1 == MAX_STEPS {
                        eprintln!("20 iterations reached!");
                    }
                    // The chain ends once the iterator reaches the default locale, which is
                    // not recorded.
                    if it.get().is_default() {
                        break;
                    }
                    actual_chain.push(it.get().write_to_string().into_owned());
                    it.step();
                }
                assert_eq!(
                    expected_chain, &actual_chain,
                    "{:?} ({:?})",
                    cas.input, priority
                );
            }
        }
    }
}
516