1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 use super::LocaleFallbackPriority; 6 use icu_locale_core::subtags::{Language, Region, Script}; 7 8 use super::*; 9 10 impl LocaleFallbackerWithConfig<'_> { normalize(&self, locale: &mut DataLocale, default_script: &mut Option<Script>)11 pub(crate) fn normalize(&self, locale: &mut DataLocale, default_script: &mut Option<Script>) { 12 // 0. If there is an invalid "sd" subtag, drop it 13 if let Some(subdivision) = locale.subdivision.take() { 14 if let Some(region) = locale.region { 15 if subdivision 16 .as_str() 17 .starts_with(region.to_tinystr().to_ascii_lowercase().as_str()) 18 { 19 locale.subdivision = Some(subdivision); 20 } 21 } 22 } 23 let language = locale.language; 24 // 1. Populate the region (required for region fallback only) 25 if self.config.priority == LocaleFallbackPriority::Region && locale.region.is_none() { 26 // 1a. First look for region based on language+script 27 if let Some(script) = locale.script { 28 locale.region = self 29 .likely_subtags 30 .language_script 31 .get(&( 32 language.to_tinystr().to_unvalidated(), 33 script.to_tinystr().to_unvalidated(), 34 )) 35 .copied(); 36 } 37 // 1b. If that fails, try language only 38 if locale.region.is_none() { 39 locale.region = self 40 .likely_subtags 41 .language 42 .get_copied(&language.to_tinystr().to_unvalidated()) 43 .map(|(_s, r)| r); 44 } 45 } 46 // 2. Remove the script if it is implied by the other subtags 47 if locale.script.is_some() || self.config.priority == LocaleFallbackPriority::Script { 48 *default_script = locale 49 .region 50 .and_then(|region| { 51 self.likely_subtags.language_region.get_copied(&( 52 language.to_tinystr().to_unvalidated(), 53 region.to_tinystr().to_unvalidated(), 54 )) 55 }) 56 .or_else(|| { 57 self.likely_subtags 58 .language 59 .get_copied(&language.to_tinystr().to_unvalidated()) 60 .map(|(s, _r)| s) 61 }); 62 if locale.script == *default_script { 63 locale.script = None; 64 } 65 } 66 } 67 } 68 69 impl LocaleFallbackIteratorInner<'_> { step(&mut self, locale: &mut DataLocale)70 pub fn step(&mut self, locale: &mut DataLocale) { 71 match self.config.priority { 72 LocaleFallbackPriority::Language => self.step_language(locale), 73 LocaleFallbackPriority::Script => self.step_script(locale), 74 LocaleFallbackPriority::Region => self.step_region(locale), 75 // This case should not normally happen, but `LocaleFallbackPriority` is non_exhaustive. 76 // Make it go directly to `und`. 77 _ => { 78 debug_assert!( 79 false, 80 "Unknown LocaleFallbackPriority: {:?}", 81 self.config.priority 82 ); 83 *locale = Default::default() 84 } 85 } 86 } 87 step_language(&mut self, locale: &mut DataLocale)88 fn step_language(&mut self, locale: &mut DataLocale) { 89 // 2. Remove the subdivision keyword 90 if let Some(value) = locale.subdivision.take() { 91 self.backup_subdivision = Some(value); 92 return; 93 } 94 // 4. Remove variants 95 if let Some(single_variant) = locale.variant.take() { 96 self.backup_variant = Some(single_variant); 97 return; 98 } 99 // 5. Check for parent override 100 if let Some((language, script, region)) = self.get_explicit_parent(locale) { 101 locale.language = language; 102 locale.script = script; 103 locale.region = region; 104 locale.variant = self.backup_variant.take(); 105 return; 106 } 107 // 7. Remove region 108 if let Some(region) = locale.region { 109 // 6. Add the script subtag if necessary 110 if locale.script.is_none() { 111 let language = locale.language; 112 if let Some(script) = self.likely_subtags.language_region.get_copied(&( 113 language.to_tinystr().to_unvalidated(), 114 region.to_tinystr().to_unvalidated(), 115 )) { 116 locale.script = Some(script); 117 } 118 } 119 locale.region = None; 120 locale.variant = self.backup_variant.take(); 121 return; 122 } 123 // 8. Remove language+script 124 debug_assert!(!locale.language.is_default() || locale.script.is_some()); // don't call .step() on und 125 locale.script = None; 126 locale.language = Language::UND; 127 } 128 step_region(&mut self, locale: &mut DataLocale)129 fn step_region(&mut self, locale: &mut DataLocale) { 130 // TODO(#4413): -u-rg is not yet supported 131 // 2. Remove the subdivision keyword 132 if let Some(value) = locale.subdivision.take() { 133 self.backup_subdivision = Some(value); 134 return; 135 } 136 // 4. Remove variants 137 if let Some(variant) = locale.variant.take() { 138 self.backup_variant = Some(variant); 139 return; 140 } 141 // 5. Remove language+script 142 if !locale.language.is_default() || locale.script.is_some() { 143 locale.script = None; 144 locale.language = Language::UND; 145 // Don't produce und-variant 146 if locale.region.is_some() { 147 locale.variant = self.backup_variant.take(); 148 locale.subdivision = self.backup_subdivision.take(); 149 } 150 return; 151 } 152 // 6. Remove region 153 debug_assert!(locale.region.is_some()); // don't call .step() on und 154 locale.region = None; 155 } 156 step_script(&mut self, locale: &mut DataLocale)157 fn step_script(&mut self, locale: &mut DataLocale) { 158 // Remove the subdivision keyword 159 if let Some(value) = locale.subdivision.take() { 160 self.backup_subdivision = Some(value); 161 return; 162 } 163 // Remove variants 164 if let Some(variant) = locale.variant.take() { 165 self.backup_variant = Some(variant); 166 return; 167 } 168 // Check for parent override 169 if let Some((language, script, region)) = self.get_explicit_parent(locale) { 170 locale.language = language; 171 locale.script = script; 172 locale.region = region; 173 locale.variant = self.backup_variant.take(); 174 return; 175 } 176 // Remove the region 177 if let Some(region) = locale.region { 178 self.backup_region = Some(region); 179 let language_implied_script = self 180 .likely_subtags 181 .language 182 .get_copied(&locale.language.to_tinystr().to_unvalidated()) 183 .map(|(s, _r)| s); 184 if language_implied_script != self.max_script { 185 locale.script = self.max_script; 186 } 187 locale.region = None; 188 locale.variant = self.backup_variant.take(); 189 return; 190 } 191 192 // Remove the script if we have a language 193 if !locale.language.is_default() { 194 let language_implied_script = self 195 .likely_subtags 196 .language 197 .get_copied(&locale.language.to_tinystr().to_unvalidated()) 198 .map(|(s, _r)| s); 199 if locale.script.is_some() && language_implied_script == locale.script { 200 locale.script = None; 201 if let Some(region) = self.backup_region.take() { 202 locale.region = Some(region); 203 locale.subdivision = self.backup_subdivision.take(); 204 locale.variant = self.backup_variant.take(); 205 } 206 // needed if more fallback is added at the end 207 #[allow(clippy::needless_return)] 208 return; 209 } else { 210 // 3. Remove the language and apply the maximized script 211 locale.language = Language::UND; 212 locale.script = self.max_script; 213 // Don't produce und-variant 214 if locale.script.is_some() { 215 locale.variant = self.backup_variant.take(); 216 } 217 // needed if more fallback is added at the end 218 #[allow(clippy::needless_return)] 219 return; 220 } 221 } 222 223 // note: UTS #35 wants us to apply "other associated scripts" now. ICU4C/J does not do this, 224 // so we don't either. They would be found here if they are ever needed: 225 // https://github.com/unicode-cldr/cldr-core/blob/master/supplemental/languageData.json 226 227 // 6. Remove script 228 if locale.script.is_some() { 229 locale.script = None; 230 } 231 } 232 get_explicit_parent( &self, locale: &DataLocale, ) -> Option<(Language, Option<Script>, Option<Region>)>233 fn get_explicit_parent( 234 &self, 235 locale: &DataLocale, 236 ) -> Option<(Language, Option<Script>, Option<Region>)> { 237 self.parents 238 .parents 239 .get_copied_by(|uvstr| locale.strict_cmp(uvstr).reverse()) 240 } 241 } 242 243 #[cfg(test)] 244 mod tests { 245 use super::*; 246 use writeable::Writeable; 247 248 struct TestCase { 249 input: &'static str, 250 requires_data: bool, 251 // Note: The first entry in the chain is the normalized locale 252 expected_language_chain: &'static [&'static str], 253 expected_script_chain: &'static [&'static str], 254 expected_region_chain: &'static [&'static str], 255 } 256 257 // TODO: Consider loading these from a JSON file 258 const TEST_CASES: &[TestCase] = &[ 259 TestCase { 260 input: "en-fonipa", 261 requires_data: false, 262 expected_language_chain: &["en-fonipa", "en"], 263 expected_script_chain: &["en-fonipa", "en"], 264 expected_region_chain: &["en-fonipa", "en"], 265 }, 266 TestCase { 267 input: "en-US-u-sd-usca", 268 requires_data: false, 269 expected_language_chain: &["en-US-u-sd-usca", "en-US", "en"], 270 expected_script_chain: &["en-US-u-sd-usca", "en-US", "en"], 271 expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"], 272 }, 273 TestCase { 274 input: "en-US-fonipa-u-sd-usca", 275 requires_data: false, 276 expected_language_chain: &[ 277 "en-US-fonipa-u-sd-usca", 278 "en-US-fonipa", 279 "en-US", 280 "en-fonipa", 281 "en", 282 ], 283 expected_script_chain: &[ 284 "en-US-fonipa-u-sd-usca", 285 "en-US-fonipa", 286 "en-US", 287 "en-fonipa", 288 "en", 289 ], 290 expected_region_chain: &[ 291 "en-US-fonipa-u-sd-usca", 292 "en-US-fonipa", 293 "en-US", 294 "und-US-fonipa-u-sd-usca", 295 "und-US-fonipa", 296 "und-US", 297 ], 298 }, 299 TestCase { 300 input: "en-fonipa", 301 requires_data: true, 302 expected_language_chain: &["en-fonipa", "en"], 303 expected_script_chain: &["en-fonipa", "en", "und-Latn-fonipa", "und-Latn"], 304 expected_region_chain: &["en-US-fonipa", "en-US", "und-US-fonipa", "und-US"], 305 }, 306 TestCase { 307 input: "en-Latn-fonipa", 308 requires_data: true, 309 expected_language_chain: &["en-fonipa", "en"], 310 expected_script_chain: &["en-fonipa", "en", "und-Latn-fonipa", "und-Latn"], 311 expected_region_chain: &["en-US-fonipa", "en-US", "und-US-fonipa", "und-US"], 312 }, 313 TestCase { 314 input: "en-Latn-US-u-sd-usca", 315 requires_data: true, 316 expected_language_chain: &["en-US-u-sd-usca", "en-US", "en"], 317 expected_script_chain: &["en-US-u-sd-usca", "en-US", "en", "und-Latn"], 318 expected_region_chain: &["en-US-u-sd-usca", "en-US", "und-US-u-sd-usca", "und-US"], 319 }, 320 TestCase { 321 input: "sr-ME", 322 requires_data: true, 323 expected_language_chain: &["sr-ME", "sr-Latn"], 324 expected_script_chain: &["sr-ME", "sr-Latn", "und-Latn"], 325 expected_region_chain: &["sr-ME", "und-ME"], 326 }, 327 TestCase { 328 input: "sr-Latn-ME", 329 requires_data: true, 330 expected_language_chain: &["sr-ME", "sr-Latn"], 331 expected_script_chain: &["sr-ME", "sr-Latn", "und-Latn"], 332 expected_region_chain: &["sr-ME", "und-ME"], 333 }, 334 TestCase { 335 input: "sr-ME-fonipa", 336 requires_data: true, 337 expected_language_chain: &["sr-ME-fonipa", "sr-ME", "sr-Latn-fonipa", "sr-Latn"], 338 expected_script_chain: &[ 339 "sr-ME-fonipa", 340 "sr-ME", 341 "sr-Latn-fonipa", 342 "sr-Latn", 343 "und-Latn-fonipa", 344 "und-Latn", 345 ], 346 expected_region_chain: &["sr-ME-fonipa", "sr-ME", "und-ME-fonipa", "und-ME"], 347 }, 348 TestCase { 349 input: "sr-RS", 350 requires_data: true, 351 expected_language_chain: &["sr-RS", "sr"], 352 expected_script_chain: &["sr-RS", "sr", "und-Cyrl"], 353 expected_region_chain: &["sr-RS", "und-RS"], 354 }, 355 TestCase { 356 input: "sr-Cyrl-RS", 357 requires_data: true, 358 expected_language_chain: &["sr-RS", "sr"], 359 expected_script_chain: &["sr-RS", "sr", "und-Cyrl"], 360 expected_region_chain: &["sr-RS", "und-RS"], 361 }, 362 TestCase { 363 input: "sr-Latn-RS", 364 requires_data: true, 365 expected_language_chain: &["sr-Latn-RS", "sr-Latn"], 366 expected_script_chain: &["sr-Latn-RS", "sr-Latn", "und-Latn"], 367 expected_region_chain: &["sr-Latn-RS", "und-RS"], 368 }, 369 TestCase { 370 input: "de-Latn-LI", 371 requires_data: true, 372 expected_language_chain: &["de-LI", "de"], 373 expected_script_chain: &["de-LI", "de", "und-Latn"], 374 expected_region_chain: &["de-LI", "und-LI"], 375 }, 376 TestCase { 377 input: "ca-ES-valencia", 378 requires_data: true, 379 expected_language_chain: &["ca-ES-valencia", "ca-ES", "ca-valencia", "ca"], 380 expected_script_chain: &[ 381 "ca-ES-valencia", 382 "ca-ES", 383 "ca-valencia", 384 "ca", 385 "und-Latn-valencia", 386 "und-Latn", 387 ], 388 expected_region_chain: &["ca-ES-valencia", "ca-ES", "und-ES-valencia", "und-ES"], 389 }, 390 TestCase { 391 input: "es-AR", 392 requires_data: true, 393 expected_language_chain: &["es-AR", "es-419", "es"], 394 expected_script_chain: &["es-AR", "es-419", "es", "und-Latn"], 395 expected_region_chain: &["es-AR", "und-AR"], 396 }, 397 TestCase { 398 input: "hi-IN", 399 requires_data: true, 400 expected_language_chain: &["hi-IN", "hi"], 401 expected_script_chain: &["hi-IN", "hi", "und-Deva"], 402 expected_region_chain: &["hi-IN", "und-IN"], 403 }, 404 TestCase { 405 input: "hi-Latn-IN", 406 requires_data: true, 407 expected_language_chain: &["hi-Latn-IN", "hi-Latn", "en-IN", "en-001", "en"], 408 expected_script_chain: &["hi-Latn-IN", "hi-Latn", "en-IN", "en-001", "en", "und-Latn"], 409 expected_region_chain: &["hi-Latn-IN", "und-IN"], 410 }, 411 TestCase { 412 input: "zh-CN", 413 requires_data: true, 414 // Note: "zh-Hans" is not reachable because it is the default script for "zh". 415 // The fallback algorithm does not visit the language-script bundle when the 416 // script is the default for the language 417 expected_language_chain: &["zh-CN", "zh"], 418 expected_script_chain: &["zh-CN", "zh", "und-Hans", "und-Hani"], 419 expected_region_chain: &["zh-CN", "und-CN"], 420 }, 421 TestCase { 422 input: "zh-TW", 423 requires_data: true, 424 expected_language_chain: &["zh-TW", "zh-Hant"], 425 expected_script_chain: &["zh-TW", "zh-Hant", "und-Hant", "und-Hani"], 426 expected_region_chain: &["zh-TW", "und-TW"], 427 }, 428 TestCase { 429 input: "yue-HK", 430 requires_data: true, 431 expected_language_chain: &["yue-HK", "yue"], 432 expected_script_chain: &["yue-HK", "yue", "und-Hant", "und-Hani"], 433 expected_region_chain: &["yue-HK", "und-HK"], 434 }, 435 TestCase { 436 input: "yue-HK", 437 requires_data: true, 438 expected_language_chain: &["yue-HK", "yue"], 439 expected_script_chain: &["yue-HK", "yue", "und-Hant", "und-Hani"], 440 expected_region_chain: &["yue-HK", "und-HK"], 441 }, 442 TestCase { 443 input: "yue-CN", 444 requires_data: true, 445 expected_language_chain: &["yue-CN", "yue-Hans"], 446 expected_script_chain: &["yue-CN", "yue-Hans", "und-Hans", "und-Hani"], 447 expected_region_chain: &["yue-CN", "und-CN"], 448 }, 449 TestCase { 450 input: "az-Arab-IR", 451 requires_data: true, 452 expected_language_chain: &["az-IR", "az-Arab"], 453 expected_script_chain: &["az-IR", "az-Arab", "und-Arab"], 454 expected_region_chain: &["az-IR", "und-IR"], 455 }, 456 TestCase { 457 input: "az-IR", 458 requires_data: true, 459 expected_language_chain: &["az-IR", "az-Arab"], 460 expected_script_chain: &["az-IR", "az-Arab", "und-Arab"], 461 expected_region_chain: &["az-IR", "und-IR"], 462 }, 463 TestCase { 464 input: "az-Arab", 465 requires_data: true, 466 expected_language_chain: &["az-Arab"], 467 expected_script_chain: &["az-Arab", "und-Arab"], 468 expected_region_chain: &["az-IR", "und-IR"], 469 }, 470 ]; 471 472 #[test] test_fallback()473 fn test_fallback() { 474 let fallbacker_no_data = LocaleFallbacker::new_without_data(); 475 let fallbacker_no_data = fallbacker_no_data.as_borrowed(); 476 let fallbacker_with_data = LocaleFallbacker::new(); 477 for cas in TEST_CASES { 478 for (priority, expected_chain) in [ 479 ( 480 LocaleFallbackPriority::Language, 481 cas.expected_language_chain, 482 ), 483 (LocaleFallbackPriority::Script, cas.expected_script_chain), 484 (LocaleFallbackPriority::Region, cas.expected_region_chain), 485 ] { 486 let mut config = LocaleFallbackConfig::default(); 487 config.priority = priority; 488 let fallbacker = if cas.requires_data { 489 fallbacker_with_data 490 } else { 491 fallbacker_no_data 492 }; 493 let mut it = fallbacker 494 .for_config(config) 495 .fallback_for(cas.input.parse().unwrap()); 496 let mut actual_chain = Vec::new(); 497 for i in 0..20 { 498 if i == 19 { 499 eprintln!("20 iterations reached!"); 500 } 501 if it.get().is_default() { 502 break; 503 } 504 actual_chain.push(it.get().write_to_string().into_owned()); 505 it.step(); 506 } 507 assert_eq!( 508 expected_chain, &actual_chain, 509 "{:?} ({:?})", 510 cas.input, priority 511 ); 512 } 513 } 514 } 515 } 516