package org.unicode.cldr.draft;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.SupplementalDataInfo;

import com.google.common.base.Splitter;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.ULocale.Minimize;

/**
 * Table-driven "add likely subtags" / "minimize subtags" implementation.
 *
 * <p>The likely-subtags data is loaded into a three-level map
 * (language, then script, then region, giving the maximized {@link LSR}). At every level
 * the empty string (or {@code "und"} for language) is the fallback key.
 */
public class XLikelySubtags {
    private static final SupplementalDataInfo SDI = CLDRConfig.getInstance().getSupplementalDataInfo();
    private static final Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo();
    private static final Map<String, R2<List<String>, String>> REGION_ALIASES = aliasInfo.get("territory");
    private static final Map<String, R2<List<String>, String>> LANGUAGE_ALIASES = aliasInfo.get("language");
    private static final XLikelySubtags DEFAULT = new XLikelySubtags(SDI.getLikelySubtags(), true);

    /**
     * @return the shared instance built from the supplemental data's likely subtags
     */
    public static final XLikelySubtags getDefault() {
        return DEFAULT;
    }

    private static final boolean SHORT = false;

    /**
     * Factory for the nested map levels. {@link #getSubtable} returns the value stored
     * under a key, creating and storing a fresh one via {@link #make()} when absent.
     */
    static abstract class Maker {
        abstract <V> V make();

        /**
         * Returns the sub-table stored under {@code language}, creating it if needed.
         *
         * @param langTable the table to look in (and insert into)
         * @param language the key of the wanted sub-table
         * @return the existing or newly created sub-table
         */
        @SuppressWarnings("unchecked")
        public <K, V> V getSubtable(Map<K, V> langTable, final K language) {
            V scriptTable = langTable.get(language);
            if (scriptTable == null) {
                langTable.put(language, scriptTable = (V) make());
            }
            return scriptTable;
        }

        static final Maker HASHMAP = new Maker() {
            @SuppressWarnings("unchecked")
            public Map<Object, Object> make() {
                return new HashMap<>();
            }
        };

        static final Maker TREEMAP = new Maker() {
            @SuppressWarnings("unchecked")
            public Map<Object, Object> make() {
                return new TreeMap<>();
            }
        };
    }

    /**
     * Immutable language/script/region triple. Absent fields are represented by the
     * empty string, not {@code null}.
     */
    public static class LSR {
        public final String language;
        public final String script;
        public final String region;

        public static LSR from(String language, String script, String region) {
            return new LSR(language, script, region);
        }

        public static LSR from(ULocale locale) {
            return new LSR(locale.getLanguage(), locale.getScript(), locale.getCountry());
        }

        /**
         * Maximizes the locale's language/script/region with the default table.
         */
        public static LSR fromMaximalized(ULocale locale) {
            return fromMaximalized(locale.getLanguage(), locale.getScript(), locale.getCountry());
        }

        /**
         * Replaces simple language and region aliases by their canonical form, then
         * maximizes the result with the default table.
         */
        public static LSR fromMaximalized(String language, String script, String region) {
            String canonicalLanguage = getCanonical(LANGUAGE_ALIASES.get(language));
            // hack
            if (language.equals("mo")) {
                canonicalLanguage = "ro";
            }
            String canonicalRegion = getCanonical(REGION_ALIASES.get(region));

            return DEFAULT.maximize(
                canonicalLanguage == null ? language : canonicalLanguage,
                script,
                canonicalRegion == null ? region : canonicalRegion);
        }

        public LSR(String language, String script, String region) {
            this.language = language;
            this.script = script;
            this.region = region;
        }

        /**
         * @return the fields joined by '-', skipping an empty script or region
         */
        @Override
        public String toString() {
            StringBuilder result = new StringBuilder(language);
            if (!script.isEmpty()) {
                result.append('-').append(script);
            }
            if (!region.isEmpty()) {
                result.append('-').append(region);
            }
            return result.toString();
        }

        /**
         * Returns a copy with each non-null argument substituted for the corresponding
         * field; returns {@code this} when all arguments are null.
         */
        public LSR replace(String language2, String script2, String region2) {
            if (language2 == null && script2 == null && region2 == null) {
                return this;
            }
            return new LSR(
                language2 == null ? language : language2,
                script2 == null ? script : script2,
                region2 == null ? region : region2);
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            // Per the Object.equals contract, null and foreign types compare unequal
            // instead of throwing.
            if (!(obj instanceof LSR)) {
                return false;
            }
            LSR other = (LSR) obj;
            return language.equals(other.language)
                && script.equals(other.script)
                && region.equals(other.region);
        }

        @Override
        public int hashCode() {
            return Objects.hash(language, script, region);
        }
    }

    /** language, then script, then region, giving the maximized value. */
    final Map<String, Map<String, Map<String, LSR>>> langTable;

    /**
     * @param rawData likely-subtags mappings, source locale ID to target locale ID
     * @param skipNoncanonical passed through to the table builder, which currently does not read it
     */
    public XLikelySubtags(Map<String, String> rawData, boolean skipNoncanonical) {
        this.langTable = init(rawData, skipNoncanonical);
    }

    /**
     * Builds the three-level lookup table from the raw mappings, adds entries for
     * simple language/region aliases, and verifies that every level has a fallback key.
     *
     * @throws IllegalArgumentException if a level lacks its fallback ("und" or "") entry
     */
    private Map<String, Map<String, Map<String, LSR>>> init(final Map<String, String> rawData, boolean skipNoncanonical) {
        // prepare alias info. We want a mapping from the canonical form to all aliases

        Multimap<String, String> canonicalToAliasLanguage = HashMultimap.create();
        getAliasInfo(LANGUAGE_ALIASES, canonicalToAliasLanguage);

        // Don't bother with script; there are none

        Multimap<String, String> canonicalToAliasRegion = HashMultimap.create();
        getAliasInfo(REGION_ALIASES, canonicalToAliasRegion);

        Maker maker = Maker.TREEMAP;
        Map<String, Map<String, Map<String, LSR>>> result = maker.make();
        LanguageTagParser ltp = new LanguageTagParser();
        // set the base data
        Map<LSR, LSR> internCache = new HashMap<>();
        for (Entry<String, String> sourceTarget : rawData.entrySet()) {
            ltp.set(sourceTarget.getKey());
            final String language = ltp.getLanguage();
            final String script = ltp.getScript();
            final String region = ltp.getRegion();

            ltp.set(sourceTarget.getValue());
            String languageTarget = ltp.getLanguage();
            final String scriptTarget = ltp.getScript();
            final String regionTarget = ltp.getRegion();

            set(result, language, script, region, languageTarget, scriptTarget, regionTarget, internCache);
            // now add aliases
            // NOTE(review): when an alias set is non-empty it does not contain the canonical
            // code itself, so mixed combinations such as (canonical language, region alias)
            // are not generated when both sides have aliases. Confirm this is intended.
            Collection<String> languageAliases = canonicalToAliasLanguage.get(language);
            if (languageAliases.isEmpty()) {
                languageAliases = Collections.singleton(language);
            }
            Collection<String> regionAliases = canonicalToAliasRegion.get(region);
            if (regionAliases.isEmpty()) {
                regionAliases = Collections.singleton(region);
            }
            for (String languageAlias : languageAliases) {
                for (String regionAlias : regionAliases) {
                    if (languageAlias.equals(language) && regionAlias.equals(region)) {
                        continue;
                    }
                    set(result, languageAlias, script, regionAlias, languageTarget, scriptTarget, regionTarget, internCache);
                }
            }
        }
        // hack
        set(result, "und", "Latn", "", "en", "Latn", "US", internCache);

        // hack, ensure that if und-YY => und-Xxxx-YY, then we add Xxxx=>YY to the table
        // <likelySubtag from="und_GH" to="ak_Latn_GH"/>
        // so und-Latn-GH => ak-Latn-GH

        // Validate the fallback levels before dereferencing them, so that bad data is
        // reported as an IllegalArgumentException rather than a NullPointerException.
        Map<String, Map<String, LSR>> undScriptMap = result.get("und");
        if (undScriptMap == null) {
            throw new IllegalArgumentException("failure: base");
        }
        Map<String, LSR> undEmptyRegionMap = undScriptMap.get("");
        if (undEmptyRegionMap == null) {
            throw new IllegalArgumentException("failure: und");
        }
        // Iterate over a snapshot: set() may insert into the very map being walked
        // when a target has an empty script.
        for (LSR value : new ArrayList<>(undEmptyRegionMap.values())) {
            set(result, "und", value.script, value.region, value);
        }

        // check that every level has "" (or "und")
        for (Entry<String, Map<String, Map<String, LSR>>> langEntry : result.entrySet()) {
            String lang = langEntry.getKey();
            final Map<String, Map<String, LSR>> scriptMap = langEntry.getValue();
            if (!scriptMap.containsKey("")) {
                throw new IllegalArgumentException("failure: " + lang);
            }
            for (Entry<String, Map<String, LSR>> scriptEntry : scriptMap.entrySet()) {
                String script = scriptEntry.getKey();
                final Map<String, LSR> regionMap = scriptEntry.getValue();
                if (!regionMap.containsKey("")) {
                    throw new IllegalArgumentException("failure: " + lang + "-" + script);
                }
            }
        }
        return result;
    }

    /**
     * Inverts the alias data: for every simple alias (no '_') with a usable canonical
     * form, records canonical to alias in {@code canonicalToAlias}.
     */
    private void getAliasInfo(Map<String, R2<List<String>, String>> aliasInfo, Multimap<String, String> canonicalToAlias) {
        for (Entry<String, R2<List<String>, String>> e : aliasInfo.entrySet()) {
            final String alias = e.getKey();
            if (alias.contains("_")) {
                continue; // only do simple aliasing
            }
            String canonical = getCanonical(e.getValue());
            if (canonical == null) {
                continue; // no simple canonical form; don't collect these under a null key
            }
            canonicalToAlias.put(canonical, alias);
        }
    }

    /**
     * Extracts the single simple replacement from an alias record.
     *
     * @return the canonical code, or {@code null} when the record is null, its reason is
     *         "overlong", it does not have exactly one replacement, or the replacement
     *         contains '_'
     */
    private static String getCanonical(R2<List<String>, String> aliasAndReason) {
        if (aliasAndReason == null) {
            return null;
        }
        if (aliasAndReason.get1().equals("overlong")) {
            return null;
        }
        List<String> value = aliasAndReason.get0();
        if (value.size() != 1) {
            return null;
        }
        final String canonical = value.iterator().next();
        if (canonical.contains("_")) {
            return null; // only do simple aliasing
        }
        return canonical;
    }

    /**
     * Stores source to target in the table, sharing one {@link LSR} instance per distinct
     * target through {@code internCache}.
     */
    private void set(Map<String, Map<String, Map<String, LSR>>> langTable, final String language, final String script, final String region,
        final String languageTarget, final String scriptTarget, final String regionTarget, Map<LSR, LSR> internCache) {
        LSR newValue = new LSR(languageTarget, scriptTarget, regionTarget);
        LSR interned = internCache.get(newValue);
        if (interned == null) {
            internCache.put(newValue, newValue);
            interned = newValue;
        }
        set(langTable, language, script, region, interned);
    }

    /**
     * Stores {@code newValue} under language/script/region, creating intermediate
     * tables as needed. An existing value is overwritten.
     */
    private void set(Map<String, Map<String, Map<String, LSR>>> langTable, final String language, final String script, final String region, LSR newValue) {
        Map<String, Map<String, LSR>> scriptTable = Maker.TREEMAP.getSubtable(langTable, language);
        Map<String, LSR> regionTable = Maker.TREEMAP.getSubtable(scriptTable, script);
        regionTable.put(region, newValue);
    }

    /**
     * Convenience method: parses {@code source} as a BCP 47 language tag and maximizes it.
     *
     * @param source a language tag
     * @return the maximized language/script/region
     */
    public LSR maximize(String source) {
        return maximize(ULocale.forLanguageTag(source));
    }

    /**
     * Convenience method: maximizes the language, script and country of {@code source}.
     */
    public LSR maximize(ULocale source) {
        return maximize(source.getLanguage(), source.getScript(), source.getCountry());
    }

    /**
     * Convenience method: maximizes the fields of {@code source}.
     */
    public LSR maximize(LSR source) {
        return maximize(source.language, source.script, source.region);
    }

    // public static ULocale addLikelySubtags(ULocale loc) {
    //
    // }

    /**
     * Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
     *
     * <p>Each input field is looked up at its level of the table, falling back to the
     * "und" / "" entry when absent. A field that was explicitly supplied (not "und",
     * "", "Zzzz" or "ZZ"), or that had to fall back, is retained in the result; the
     * other fields are filled in from the table.
     *
     * @param language non-null language code, "und" for unknown
     * @param script non-null script code, "" or "Zzzz" for unknown
     * @param region non-null region code, "" or "ZZ" for unknown
     * @return the maximized triple, or {@code null} if the region table has no fallback entry
     */
    public LSR maximize(String language, String script, String region) {
        Map<String, Map<String, LSR>> scriptTable = langTable.get(language);
        final boolean keepLanguage = scriptTable == null || !language.equals("und");
        if (scriptTable == null) { // cannot happen if language == "und"
            scriptTable = langTable.get("und");
        }

        if (script.equals("Zzzz")) {
            script = "";
        }
        Map<String, LSR> regionTable = scriptTable.get(script);
        final boolean keepScript = regionTable == null || !script.isEmpty();
        if (regionTable == null) { // cannot happen if script == ""
            regionTable = scriptTable.get("");
        }

        if (region.equals("ZZ")) {
            region = "";
        }
        LSR result = regionTable.get(region);
        final boolean keepRegion = result == null || !region.isEmpty();
        if (result == null) { // cannot happen if region == ""
            result = regionTable.get("");
            if (result == null) {
                return null;
            }
        }

        // replace() returns the table value itself when nothing is retained.
        return result.replace(
            keepLanguage ? language : null,
            keepScript ? script : null,
            keepRegion ? region : null);
    }

    /**
     * Maximizes the input and then drops the script and/or region when they are implied
     * by the remaining fields.
     *
     * @param fieldToFavor which field to keep when either the script or the region, but
     *        not both, could be removed
     */
    private LSR minimizeSubtags(String languageIn, String scriptIn, String regionIn, Minimize fieldToFavor) {
        LSR result = maximize(languageIn, scriptIn, regionIn);

        // We could try just a series of checks, like:
        // LSR result2 = addLikelySubtags(languageIn, "", "");
        // if result.equals(result2) return result2;
        // However, we can optimize 2 of the cases:
        // (languageIn, "", "")
        // (languageIn, "", regionIn)

        Map<String, Map<String, LSR>> scriptTable = langTable.get(result.language);
        if (scriptTable == null) {
            // Language without a table of its own: use the same fallback as maximize().
            scriptTable = langTable.get("und");
        }

        Map<String, LSR> regionTable0 = scriptTable.get("");
        LSR value00 = regionTable0.get("");
        boolean favorRegionOk = false;
        if (result.script.equals(value00.script)) { //script is default
            if (result.region.equals(value00.region)) {
                return result.replace(null, "", "");
            } else if (fieldToFavor == Minimize.FAVOR_REGION) {
                return result.replace(null, "", null);
            } else {
                favorRegionOk = true;
            }
        }

        // The last case is not as easy to optimize.
        // Maybe do later, but for now use the straightforward code.
        LSR result2 = maximize(languageIn, scriptIn, "");
        if (result2.equals(result)) {
            return result.replace(null, null, "");
        } else if (favorRegionOk) {
            return result.replace(null, "", null);
        }
        return result;
    }

    /**
     * Appends a tab-indented dump of a nested map to {@code output}. Empty keys are
     * shown as "∅"; leaf values are rendered with {@link CldrUtility#toString}.
     */
    private static <V> StringBuilder show(Map<String, V> map, String indent, StringBuilder output) {
        String first = indent.isEmpty() ? "" : "\t";
        for (Entry<String, V> e : map.entrySet()) {
            String key = e.getKey();
            V value = e.getValue();
            output.append(first).append(key.isEmpty() ? "∅" : key);
            if (value instanceof Map) {
                // Every nested level of the table is keyed by String.
                @SuppressWarnings("unchecked")
                Map<String, Object> nested = (Map<String, Object>) value;
                show(nested, indent + "\t", output);
            } else {
                output.append("\t").append(CldrUtility.toString(value)).append("\n");
            }
            first = indent;
        }
        return output;
    }

    @Override
    public String toString() {
        return show(langTable, "", new StringBuilder()).toString();
    }

    /**
     * Prints the default table. The timing and ICU-comparison code after the early
     * return is currently disabled.
     */
    public static void main(String[] args) {
        System.out.println(LSR.fromMaximalized(ULocale.ENGLISH));

        SupplementalDataInfo sdi = SDI;
        final Map<String, String> rawData = sdi.getLikelySubtags();
        XLikelySubtags ls = XLikelySubtags.getDefault();
        System.out.println(ls);
        ls.maximize(new ULocale("iw"));
        if (true) return;

        LanguageTagParser ltp = new LanguageTagParser();
        // String[][] tests = {
        // {"und", "en-Latn-US"},
        // {"und-TW", "en-Latn-US"},
        // {"und-CN", "en-Latn-US"},
        // {"und-Hans", "en-Latn-US"},
        // {"und-Hans-CN", "en-Latn-US"},
        // {"und-Hans-TW", "en-Latn-US"},
        // {"und-Hant", "en-Latn-US"},
        // {"und-Hant-TW", "en-Latn-US"},
        // {"und-Hant-CN", "en-Latn-US"},
        // {"zh-TW", "en-Latn-US"},
        // {"zh-CN", "en-Latn-US"},
        // {"zh-Hans", "en-Latn-US"},
        // {"zh-Hans-CN", "en-Latn-US"},
        // {"zh-Hans-TW", "en-Latn-US"},
        // {"zh-Hant", "en-Latn-US"},
        // {"zh-Hant-TW", "en-Latn-US"},
        // {"zh-Hant-CN", "en-Latn-US"},
        // };
        // for (String[] sourceTarget : tests) {
        // ltp.set(sourceTarget[0]);
        // LSR result = ls.addLikelySubtags(ltp.getLanguage(), ltp.getScript(), ltp.getRegion());
        // ltp.set(sourceTarget[1]);
        // ULocale sourceLocale = ULocale.forLanguageTag(sourceTarget[0]);
        // ULocale max = ULocale.addLikelySubtags(sourceLocale);
        // boolean same = max.toLanguageTag().equals(result.toString());
        // System.out.println(sourceTarget[0] + "\t" + sourceTarget[1] + "\t" + result + (same ? "" : "\t≠" + max.toLanguageTag()));
        // }

        // get all the languages, scripts, and regions
        Set<String> languages = new TreeSet<String>();
        Set<String> scripts = new TreeSet<String>();
        Set<String> regions = new TreeSet<String>();
        Counter<String> languageCounter = new Counter<>();
        Counter<String> scriptCounter = new Counter<>();
        Counter<String> regionCounter = new Counter<>();

        for (Entry<String, String> sourceTarget : rawData.entrySet()) {
            final String source = sourceTarget.getKey();
            ltp.set(source);
            languages.add(ltp.getLanguage());
            scripts.add(ltp.getScript());
            regions.add(ltp.getRegion());
            final String target = sourceTarget.getValue();
            ltp.set(target);
            add(target, languageCounter, ltp.getLanguage(), 1);
            add(target, scriptCounter, ltp.getScript(), 1);
            add(target, regionCounter, ltp.getRegion(), 1);
        }
        ltp.set("und-Zzzz-ZZ");
        languageCounter.add(ltp.getLanguage(), 1);
        scriptCounter.add(ltp.getScript(), 1);
        regionCounter.add(ltp.getRegion(), 1);

        if (SHORT) {
            removeSingletons(languages, languageCounter);
            removeSingletons(scripts, scriptCounter);
            removeSingletons(regions, regionCounter);
        }

        System.out.println("languages: " + languages.size() + "\n\t" + languages + "\n\t" + languageCounter);
        System.out.println("scripts: " + scripts.size() + "\n\t" + scripts + "\n\t" + scriptCounter);
        System.out.println("regions: " + regions.size() + "\n\t" + regions + "\n\t" + regionCounter);

        int maxCount = Integer.MAX_VALUE;

        int counter = maxCount;
        long tempTime = System.nanoTime();
        newMax: for (String language : languages) {
            for (String script : scripts) {
                for (String region : regions) {
                    if (--counter < 0) break newMax;
                    LSR result = ls.maximize(language, script, region);
                }
            }
        }
        long newMaxTime = System.nanoTime() - tempTime;
        System.out.println("newMaxTime: " + newMaxTime);

        counter = maxCount;
        tempTime = System.nanoTime();
        newMin: for (String language : languages) {
            for (String script : scripts) {
                for (String region : regions) {
                    if (--counter < 0) break newMin;
                    LSR minNewS = ls.minimizeSubtags(language, script, region, Minimize.FAVOR_SCRIPT);
                }
            }
        }
        long newMinTime = System.nanoTime() - tempTime;
        System.out.println("newMinTime: " + newMinTime);

        // *****

        tempTime = System.nanoTime();
        counter = maxCount;
        oldMax: for (String language : languages) {
            for (String script : scripts) {
                for (String region : regions) {
                    if (--counter < 0) break oldMax;
                    ULocale tempLocale = new ULocale(language, script, region);
                    ULocale max = ULocale.addLikelySubtags(tempLocale);
                }
            }
        }
        long oldMaxTime = System.nanoTime() - tempTime;
        // Math.max guards the ratio against a zero elapsed time.
        System.out.println("oldMaxTime: " + oldMaxTime + "\t" + oldMaxTime / Math.max(1, newMaxTime) + "x");

        counter = maxCount;
        tempTime = System.nanoTime();
        oldMin: for (String language : languages) {
            for (String script : scripts) {
                for (String region : regions) {
                    if (--counter < 0) break oldMin;
                    ULocale tempLocale = new ULocale(language, script, region);
                    ULocale minOldS = ULocale.minimizeSubtags(tempLocale, Minimize.FAVOR_SCRIPT);
                }
            }
        }
        long oldMinTime = System.nanoTime() - tempTime;
        System.out.println("oldMinTime: " + oldMinTime + "\t" + oldMinTime / Math.max(1, newMinTime) + "x");

        counter = maxCount;
        testMain: for (String language : languages) {
            System.out.println(language);
            int tests = 0;
            for (String script : scripts) {
                for (String region : regions) {
                    ++tests;
                    if (--counter < 0) break testMain;
                    LSR maxNew = ls.maximize(language, script, region);
                    LSR minNewS = ls.minimizeSubtags(language, script, region, Minimize.FAVOR_SCRIPT);
                    LSR minNewR = ls.minimizeSubtags(language, script, region, Minimize.FAVOR_REGION);

                    ULocale tempLocale = new ULocale(language, script, region);
                    ULocale maxOld = ULocale.addLikelySubtags(tempLocale);
                    ULocale minOldS = ULocale.minimizeSubtags(tempLocale, Minimize.FAVOR_SCRIPT);
                    ULocale minOldR = ULocale.minimizeSubtags(tempLocale, Minimize.FAVOR_REGION);

                    // check values
                    final String maxNewS = String.valueOf(maxNew);
                    final String maxOldS = maxOld.toLanguageTag();
                    boolean sameMax = maxOldS.equals(maxNewS);

                    final String minNewSS = String.valueOf(minNewS);
                    final String minOldSS = minOldS.toLanguageTag();
                    boolean sameMinS = minNewSS.equals(minOldSS);

                    final String minNewRS = String.valueOf(minNewR);
                    // Compare against the FAVOR_REGION result (was minOldS by mistake).
                    final String minOldRS = minOldR.toLanguageTag();
                    boolean sameMinR = minNewRS.equals(minOldRS);

                    if (sameMax && sameMinS && sameMinR) continue;
                    System.out.println(new LSR(language, script, region)
                        + "\tmax: " + maxNew
                        + (sameMax ? "" : "≠" + maxOldS)
                        + "\tminS: " + minNewS
                        + (sameMinS ? "" : "≠" + minOldSS)
                        + "\tminR: " + minNewR
                        + (sameMinR ? "" : "≠" + minOldRS));
                }
            }
            System.out.println(language + ": " + tests);
        }
    }

    /**
     * Adds {@code count} to the counter entry for {@code language}. {@code target} is
     * not used.
     */
    private static void add(String target, Counter<String> languageCounter, String language, int count) {
        languageCounter.add(language, count);
    }

    /**
     * Removes from {@code languages} every item whose count is at most one.
     */
    private static void removeSingletons(Set<String> languages, Counter<String> languageCounter) {
        for (String s : languageCounter) {
            final long count = languageCounter.get(s);
            if (count <= 1) {
                languages.remove(s);
            }
        }
    }
}