package org.unicode.cldr.util;

import com.google.common.base.Joiner;
import com.google.common.base.MoreObjects;
import com.google.common.base.Objects;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.impl.Row.R2;
import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.unicode.cldr.util.StandardCodes.LstrType;

/**
 * Provides Unicode Language Identifier canonicalization for use in testing. The implementation is
 * designed to be simple, and is not at all optimized for production use. It is used to verify the
 * correctness of the specification algorithm, sanity-check the supplementalMetadata.xml alias data,
 * and generate test files for use by implementations.
 */
public class LsrvCanonicalizer {

    /** The four field types handled by this class: language, script, region and variant. */
    public static final Set<LstrType> LSRV =
            ImmutableSet.of(LstrType.language, LstrType.script, LstrType.region, LstrType.variant);

    /** Joins field values with an underscore, e.g. {@code en_Latn_US}. */
    public static final Joiner UNDERBAR_JOINER = Joiner.on('_');

    /**
     * A representation of a Unicode Language Identifier in a format that makes it simple to
     * process. The LSRV fields are represented as multimaps, though the LSR fields are restricted
     * to have only 0 or 1 element.
     */
    public static class XLanguageTag {
        /** Immutable field-type to values mapping; a missing key means the field is absent. */
        final Multimap<LstrType, String> data;

        private XLanguageTag(Multimap<LstrType, String> result) {
            data = ImmutableMultimap.copyOf(result);
        }

        /** Returns the field types that have at least one value in this tag. */
        public Set<LstrType> keys() {
            return data.keySet();
        }

        /** Returns the values for the given field type; empty if the field is absent. */
        public Collection<String> get(LstrType lstrType) {
            return data.get(lstrType);
        }

        /**
         * Formats this tag as an underscore-separated locale string, e.g. {@code en_Latn_US}. A
         * missing language is written as {@code und}.
         */
        public String toLocaleString() {
            StringBuilder buffer = new StringBuilder();
            final Collection<String> language = data.get(LstrType.language);
            if (!language.isEmpty()) {
                buffer.append(UNDERBAR_JOINER.join(language));
            } else {
                buffer.append("und");
            }
            // The "_" is passed as the prefix (not the separator) because the buffer is never
            // empty at this point and every remaining field must be preceded by exactly one "_".
            addItem(buffer, LstrType.script, "", "_", UNDERBAR_JOINER);
            addItem(buffer, LstrType.region, "", "_", UNDERBAR_JOINER);
            addItem(buffer, LstrType.variant, "", "_", UNDERBAR_JOINER);

            return buffer.toString();
        }

        /**
         * Returns a debugging format that labels each present field, e.g. {@code
         * L:en;S:Latn;R:US;V:fonipa}. Absent fields are omitted.
         */
        @Override
        public String toString() {
            StringBuilder buffer = new StringBuilder();
            addItem(buffer, LstrType.language, "", "L:", UNDERBAR_JOINER);
            addItem(buffer, LstrType.script, ";", "S:", UNDERBAR_JOINER);
            addItem(buffer, LstrType.region, ";", "R:", UNDERBAR_JOINER);
            addItem(buffer, LstrType.variant, ";", "V:", UNDERBAR_JOINER);
            return buffer.toString();
        }

        /**
         * Appends the values of one field to the buffer, if the field is present.
         *
         * @param buffer the buffer to append to
         * @param lstrType the field whose values are appended
         * @param separator written first, but only when the buffer already has content
         * @param prefix written immediately before the joined values
         * @param dashJoiner joiner used to combine multiple values (any joiner, despite the name)
         */
        public void addItem(
                StringBuilder buffer,
                LstrType lstrType,
                String separator,
                String prefix,
                final Joiner dashJoiner) {
            final Collection<String> values = data.get(lstrType);
            if (!values.isEmpty()) {
                if (buffer.length() > 0) {
                    buffer.append(separator);
                }
                buffer.append(prefix).append(dashJoiner.join(values));
            }
        }

        /**
         * Parses a (possibly partial) tag into an XLanguageTag.
         *
         * @param lstrType the alias type the tag comes from; for anything other than {@link
         *     LstrType#language} the tag is prefixed with {@code und_} before parsing
         * @param tag the tag to parse, using underscores as separators
         * @return the parsed tag, or {@code null} if the parser rejects the tag
         */
        public static XLanguageTag fromTag(LstrType lstrType, String tag) {
            Multimap<LstrType, String> result = TreeMultimap.create();
            LanguageTagParser source = new LanguageTagParser();
            final boolean isLanguage = lstrType == LstrType.language;
            String prefix = isLanguage ? "" : "und_";
            try {
                source.set(prefix + tag);
            } catch (Exception e) {
                return null; // skip ill-formed for now
                //                if (lstrType == LstrType.region && tag.length() == 3) {
                //                    //result.put(LstrType.language, "und");
                //                    result.put(LstrType.region, tag);
                //                } else {
                //                    result.put(LstrType.language, tag);
                //                }
                //                //System.out.println("ILLEGAL SOURCE\t" + lstrType + ":\t" + tag +
                // " ⇒ " + result); // for debugging
                //                return new XLanguageTag(result);
            }
            // "und" is represented by the absence of a language value.
            if (!source.getLanguage().isEmpty() && !source.getLanguage().contains("und")) {
                result.put(LstrType.language, source.getLanguage());
            }
            if (!source.getScript().isEmpty()) {
                result.put(LstrType.script, source.getScript());
            }
            if (!source.getRegion().isEmpty()) {
                result.put(LstrType.region, source.getRegion());
            }
            if (!source.getVariants().isEmpty()) {
                result.putAll(LstrType.variant, source.getVariants());
            }
            return new XLanguageTag(result);
        }

        /** Two tags are equal when they hold the same values for every field. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            // Guard the cast so that null and foreign types compare unequal instead of throwing.
            if (!(obj instanceof XLanguageTag)) {
                return false;
            }
            return data.equals(((XLanguageTag) obj).data);
        }

        @Override
        public int hashCode() {
            return data.hashCode();
        }

        /**
         * Returns a copy of this tag with the given value set for the given field. For language,
         * script and region the existing value is replaced; for variant the value is added to the
         * existing ones.
         */
        public XLanguageTag set(LstrType lstrType, String string) {
            Multimap<LstrType, String> result = TreeMultimap.create(data);
            if (lstrType != LstrType.variant) {
                result.removeAll(lstrType);
            }
            result.put(lstrType, string);
            return new XLanguageTag(result);
        }

        /**
         * containsAll is used in matching a ReplacementRule.<br>
         * It is here instead of on ReplacementRule so we can use in the denormalization utility
         * used in testing.
         *
         * @return true if, for every LSRV field, this tag has all of the values that {@code type}
         *     has for that field
         */
        public boolean containsAll(XLanguageTag type) {
            for (LstrType lstrType : LSRV) {
                final Collection<String> sources = get(lstrType);
                final Collection<String> types = type.get(lstrType);
                if (!sources.containsAll(types)) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Once a rule matches, this actually does the replacement.<br>
         * It is here instead of on ReplacementRule so we can use it in the denormalization utility
         * used in testing.
         *
         * <p>Per field: if the rule type has values, they are removed from this tag's values and
         * the replacement values (if any) are added. If the rule type has no values for the field,
         * the replacement values are only used when this tag has none of its own.
         */
        public XLanguageTag replacePartsFrom(
                XLanguageTag typeParts, XLanguageTag replacementParts) {
            Multimap<LstrType, String> result = TreeMultimap.create();
            for (LstrType lstrType : LSRV) {
                Collection<String> sources = get(lstrType);
                Collection<String> types = typeParts.get(lstrType);
                Collection<String> replacements = replacementParts.get(lstrType);
                result.putAll(lstrType, sources);
                if (!types.isEmpty()) {
                    // Matched field: drop the matched values, then add any replacements.
                    removeAll(result, lstrType, types);
                    result.putAll(lstrType, replacements);
                } else if (sources.isEmpty()) {
                    // Unmatched field: the replacement only fills in a field that is missing.
                    result.putAll(lstrType, replacements);
                }
                // Otherwise the source values are kept unchanged.
            }
            return new XLanguageTag(result);
        }
    }

    /**
     * A representation of the alias data for Unicode Language Identifiers in the
     * supplementalMetadata.xml file.
     */
    public static class ReplacementRule implements Comparable<ReplacementRule> {
        private final XLanguageTag typeParts;
        final XLanguageTag replacementParts;
        final List<XLanguageTag>
                secondaryReplacementSet; // TODO, using this information in special cases to impute
        // the best language according to LDML
        final String reason;
        /** True when type and replacement have the same fields and the same number of variants. */
        final boolean regular;

        private ReplacementRule(
                LstrType lstrType,
                String type,
                XLanguageTag typeParts,
                XLanguageTag replacementParts,
                List<XLanguageTag> secondaryReplacementSet,
                String reason) {
            this.typeParts = typeParts;
            this.replacementParts = replacementParts;
            this.secondaryReplacementSet = secondaryReplacementSet;
            this.reason = reason;
            this.regular =
                    typeParts.keys().equals(replacementParts.keys())
                            && typeParts.get(LstrType.variant).size()
                                    == replacementParts.get(LstrType.variant).size();
        }

        /**
         * Builds a rule from raw alias data.
         *
         * @param lstrType the alias type the data comes from
         * @param type the tag to be replaced
         * @param replacement the replacement tags; the first is the primary replacement, any others
         *     are kept as secondary replacements
         * @param reason the alias reason
         * @return the rule, or {@code null} if the type or the primary replacement is ill-formed or
         *     there is no replacement at all
         */
        static ReplacementRule from(
                LstrType lstrType, String type, List<String> replacement, String reason) {
            XLanguageTag typeParts = XLanguageTag.fromTag(lstrType, type);
            if (typeParts == null) {
                return null; // skip ill-formed for now
            }
            if (replacement.isEmpty()) {
                return null; // nothing to replace with; treat like ill-formed data
            }
            XLanguageTag replacementParts = XLanguageTag.fromTag(lstrType, replacement.get(0));
            if (replacementParts == null) {
                return null; // skip ill-formed for now
            }
            List<XLanguageTag> secondaryReplacementSet = new ArrayList<>();
            for (int i = 1; i < replacement.size(); ++i) {
                XLanguageTag secondary = XLanguageTag.fromTag(lstrType, replacement.get(i));
                if (secondary != null) { // skip ill-formed rather than storing null
                    secondaryReplacementSet.add(secondary);
                }
            }
            return new ReplacementRule(
                    lstrType, type, typeParts, replacementParts, secondaryReplacementSet, reason);
        }

        /**
         * Orders rules so that those whose type has the most fields come first; ties are broken by
         * the string form of the type.
         */
        @Override
        public int compareTo(ReplacementRule o) {
            return ComparisonChain.start()
                    // arguments reversed: sort most keys first
                    .compare(o.getType().keys().size(), getType().keys().size())
                    .compare(getType().toString(), o.getType().toString())
                    .result();
        }

        /** Consistent with {@link #compareTo}: only the rule type is significant. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            // Guard the cast so that null and foreign types compare unequal instead of throwing.
            if (!(obj instanceof ReplacementRule)) {
                return false;
            }
            return compareTo((ReplacementRule) obj) == 0;
        }

        @Override
        public int hashCode() {
            return Objects.hashCode(getType());
        }

        @Override
        public String toString() {
            return MoreObjects.toStringHelper(getClass())
                    .add("type", getType())
                    .add("replacement", replacementParts)
                    .toString();
        }

        /** Returns the tag parts that this rule matches. */
        public XLanguageTag getType() {
            return typeParts;
        }

        /** Returns the primary replacement for the matched parts. */
        public XLanguageTag getReplacement() {
            return replacementParts;
        }
    }

    /** Utility to remove multiple items from Multimap */
    public static <K, V> Multimap<K, V> removeAll(Multimap<K, V> result, K key, Iterable<V> value) {
        for (V type : value) {
            result.remove(key, type);
        }
        return result;
    }

    // These three fields are filled in by load() and then replaced with immutable copies.
    private Set<ReplacementRule> rules = new TreeSet<>();
    private Multimap<LstrType, String> inType = TreeMultimap.create();
    private Map<LstrType, String> irrelevant = new TreeMap<>();

    private void add(ReplacementRule replacementRule) {
        getRules().add(replacementRule);
    }

    /**
     * Canonicalize a Unicode Language Identifier (LSRV - language, script, region, variants)
     *
     * @param lstrType This is a special flag used to indicate which supplementalMetadata alias type
     *     the languageTag is from. That determines whether to extend the type and replacement to be
     *     full LSRVs if they are partial, by adding "und_", for example.
     * @param languageTag May be partial, if the lstrType is not LstrType.language.
     * @return the canonicalized tag in the labelled {@link XLanguageTag#toString()} format
     * @throws IllegalArgumentException if languageTag cannot be parsed
     */
    public String canonicalize(LstrType lstrType, String languageTag) {
        XLanguageTag source = XLanguageTag.fromTag(lstrType, languageTag);
        if (source == null) {
            // fromTag signals ill-formed input with null; fail with a message that names the
            // input rather than with a NullPointerException further down.
            throw new IllegalArgumentException(
                    "Ill-formed language tag for " + lstrType + ": " + languageTag);
        }
        XLanguageTag newTag = canonicalizeToX(source, null);
        return newTag.toString();
    }

    /**
     * Canonicalize a Unicode Language Identifier (LSRV - language, script, region, variants) in the
     * XLanguageTag format. Also returns the rules used in the canonicalization.<br>
     * NOT OPTIMIZED: just uses a linear search for simplicity; production code would use more
     * efficient mechanisms
     *
     * @param fromTag the tag to canonicalize; must not be null
     * @param rulesUsed if not null, cleared and then filled with each rule that changed the tag,
     *     in the order applied
     */
    public XLanguageTag canonicalizeToX(XLanguageTag fromTag, List<ReplacementRule> rulesUsed) {
        if (rulesUsed != null) {
            rulesUsed.clear();
        }
        XLanguageTag newTag = fromTag;
        startAtTheTop:
        while (true) {
            for (ReplacementRule rule : getRules()) {
                if (newTag.containsAll(rule.getType())) {
                    XLanguageTag temp =
                            newTag.replacePartsFrom(rule.getType(), rule.getReplacement());
                    if (!temp.equals(newTag)) {
                        newTag = temp;
                        if (rulesUsed != null) {
                            rulesUsed.add(rule);
                        }
                        // The tag changed, so earlier rules may now match: rescan from the start.
                        continue startAtTheTop;
                    }
                }
            }
            // A full pass changed nothing; the tag is stable.
            return newTag;
        }
    }

    /**
     * Decanonicalize a Unicode Language Identifier (LSRV - language, script, region, variants) in
     * the XLanguageTag format: applies the rules in reverse (replacement to type) repeatedly until
     * no new tags are produced. Rules whose type contains variants are skipped. The starting tag
     * itself is not included in the result. Used in test case generation.<br>
     * NOT OPTIMIZED: just for testing
     */
    public Set<XLanguageTag> decanonicalizeToX(XLanguageTag fromTag) {
        Set<XLanguageTag> result = new HashSet<>();
        result.add(fromTag);
        // New tags are collected separately so that result is not modified while iterating it.
        Set<XLanguageTag> intermediate = new HashSet<>();
        while (true) {
            for (ReplacementRule rule : getRules()) {
                if (!rule.getType().get(LstrType.variant).isEmpty()) {
                    continue;
                }
                for (XLanguageTag newTag : result) {
                    if (newTag.containsAll(rule.getReplacement())) { // reverse normal order
                        XLanguageTag changed =
                                newTag.replacePartsFrom(
                                        rule.getReplacement(),
                                        rule.getType()); // reverse normal order
                        if (!result.contains(changed)) {
                            intermediate.add(changed);
                        }
                    }
                }
            }
            if (intermediate.isEmpty()) {
                result.remove(fromTag);
                return result;
            }
            result.addAll(intermediate);
            intermediate.clear();
        }
    }

    /**
     * Utility for getting a filtered list of rules, mostly useful in debugging.
     *
     * @param lstrType the field of the rule type to inspect
     * @param value if null, selects rules whose type has any value for the field; otherwise selects
     *     rules whose type has this value for the field
     */
    public List<ReplacementRule> filter(LstrType lstrType, String value) {
        List<ReplacementRule> result = new ArrayList<>();
        for (ReplacementRule rule : getRules()) {
            final Collection<String> items = rule.getType().get(lstrType);
            final boolean matches = value == null ? !items.isEmpty() : items.contains(value);
            if (matches) {
                result.add(rule);
            }
        }
        return result;
    }

    /** Returns the shared instance, built from the supplemental alias data. */
    public static final LsrvCanonicalizer getInstance() {
        return SINGLETON;
    }

    private static final LsrvCanonicalizer SINGLETON = load();

    /**
     * Builds the canonicalizer from the locale alias info: creates one rule per well-formed LSRV
     * alias, records every field value used by a rule type or replacement, and picks for each field
     * one regular valid code that no rule mentions (the "irrelevant" value).
     */
    private static LsrvCanonicalizer load() {
        SupplementalDataInfo SDI = CLDRConfig.getInstance().getSupplementalDataInfo();
        Map<String, Map<String, R2<List<String>, String>>> aliases = SDI.getLocaleAliasInfo();
        // type -> tag -> , like "language" -> "sh" -> <{"sr_Latn"}, reason>

        LsrvCanonicalizer rrs = new LsrvCanonicalizer();
        for (Entry<String, Map<String, R2<List<String>, String>>> typeTagReplacement :
                aliases.entrySet()) {
            String type = typeTagReplacement.getKey();
            if (type.contains("-")) {
                throw new IllegalArgumentException(
                        "Bad format for alias: should have _ instead of -.");
            }
            LstrType lstrType = LstrType.fromString(type);
            if (!LSRV.contains(lstrType)) {
                continue;
            }
            for (Entry<String, R2<List<String>, String>> tagReplacementReason :
                    typeTagReplacement.getValue().entrySet()) {
                String tag = tagReplacementReason.getKey();
                if (tag.contains("-")) {
                    throw new IllegalArgumentException(
                            "Bad format for alias: should have _ instead of -.");
                }
                List<String> replacement = tagReplacementReason.getValue().get0();
                if (replacement == null) {
                    System.out.println("No replacement: " + tagReplacementReason);
                    continue;
                }
                String reason = tagReplacementReason.getValue().get1();
                final ReplacementRule replacementRule =
                        ReplacementRule.from(lstrType, tag, replacement, reason);
                if (replacementRule == null) {
                    // System.out.println("No rule: " + tagReplacementReason);
                    continue;
                }
                rrs.add(replacementRule);
            }
        }
        rrs.rules = ImmutableSet.copyOf(rrs.rules);
        for (ReplacementRule rule : rrs.rules) {
            XLanguageTag type = rule.getType();
            XLanguageTag replacement = rule.getReplacement();
            for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
                rrs.inType.putAll(lstrType, type.get(lstrType));
                rrs.inType.putAll(lstrType, replacement.get(lstrType));
            }
        }
        rrs.inType = ImmutableMultimap.copyOf(rrs.inType);

        for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
            Set<String> all =
                    new LinkedHashSet<>(
                            Validity.getInstance()
                                    .getStatusToCodes(lstrType)
                                    .get(Validity.Status.regular));
            all.removeAll(rrs.inType.get(lstrType));
            // Prefer "fonipa" as the sample variant when no rule mentions it; otherwise take
            // the first remaining code.
            if (lstrType == LstrType.variant && all.contains("fonipa")) {
                rrs.irrelevant.put(lstrType, "fonipa");
            } else {
                rrs.irrelevant.put(lstrType, all.iterator().next());
            }
        }
        rrs.irrelevant = ImmutableMap.copyOf(rrs.irrelevant);
        return rrs;
    }

    /** Returns the set of all the Replacement rules in the canonicalizer. */
    public Set<ReplacementRule> getRules() {
        return rules;
    }

    /** Types of test data */
    public enum TestDataTypes {
        explicit,
        fromAliases,
        decanonicalized,
        withIrrelevants
    }

    /**
     * Returns test data for the rules, used to generate test data files.
     *
     * @param testDataTypes if null, returns all the data; otherwise the specified set.
     * @return an immutable map from each requested test data type to its (source locale string to
     *     expected locale string) cases; a source already produced for an earlier type is not
     *     repeated under a later one
     */
    public Map<TestDataTypes, Map<String, String>> getTestData(Set<TestDataTypes> testDataTypes) {
        Map<TestDataTypes, Map<String, String>> result = new TreeMap<>();

        if (testDataTypes == null) {
            testDataTypes = EnumSet.allOf(TestDataTypes.class);
        }
        // Every source string emitted so far; later sections build on and de-duplicate against it.
        Set<String> allToTest = new TreeSet<>();
        if (testDataTypes.contains(TestDataTypes.explicit)) {
            Map<String, String> testData2 = new TreeMap<>();
            String[][] tests = {
                {"hye_arevmda", "hyw"},
                {"art_lojban", "jbo"},
                {"en_arevela", "en"},
                {"hy_arevela", "hy"},
                {"en_arevmda_arevela", "en"},
                {"hy_arevmda", "hyw"},
                {"hy_arevmda_arevela", "hyw"},
                {"en_lojban", "en"},
                {"en_US_polytoni", "en_US_polyton"},
                {"en_US_heploc", "en_US_alalc97"},
                {"en_US_aaland", "en_US"},
                {"en_aaland", "en_AX"},
                {"no_nynorsk_bokmal", "nb"},
                {"no_bokmal_nynorsk", "nb"},
                {"zh_guoyu_hakka_xiang", "hak"},
                {"zh_hakka_xiang", "hak"},
            };
            for (String[] row : tests) {
                String toTest = row[0];
                String expected = row[1];
                testData2.put(toTest, expected);
            }
            allToTest.addAll(testData2.keySet());
            result.put(TestDataTypes.explicit, ImmutableMap.copyOf(testData2));
        }

        if (testDataTypes.contains(TestDataTypes.fromAliases)) {
            Map<String, String> testData2 = new TreeMap<>();
            for (ReplacementRule rule : getRules()) {
                String toTest = rule.getType().toLocaleString();
                String expected = rule.getReplacement().toLocaleString();
                if (!allToTest.contains(toTest)) {
                    testData2.put(toTest, expected);
                }
            }
            allToTest.addAll(testData2.keySet());
            result.put(TestDataTypes.fromAliases, ImmutableMap.copyOf(testData2));
        }

        if (testDataTypes.contains(TestDataTypes.decanonicalized)) {
            Map<String, String> testData2 = new TreeMap<>();
            for (String testItem : allToTest) {
                for (XLanguageTag decon :
                        decanonicalizeToX(XLanguageTag.fromTag(LstrType.language, testItem))) {
                    XLanguageTag newTag = canonicalizeToX(decon, null);
                    final String toTest = decon.toLocaleString();
                    if (!allToTest.contains(toTest)) {
                        testData2.put(toTest, newTag.toLocaleString());
                    }
                }
            }
            allToTest.addAll(testData2.keySet());
            result.put(TestDataTypes.decanonicalized, ImmutableMap.copyOf(testData2));
        }

        if (testDataTypes.contains(TestDataTypes.withIrrelevants)) {
            Map<String, String> testData2 = new TreeMap<>();
            for (String testItem : allToTest) {
                XLanguageTag fluffedUp =
                        fluff(XLanguageTag.fromTag(LstrType.language, testItem), irrelevant);
                XLanguageTag newTag = canonicalizeToX(fluffedUp, null);
                final String toTest = fluffedUp.toLocaleString();
                if (!allToTest.contains(toTest)) {
                    testData2.put(toTest, newTag.toLocaleString());
                }
            }
            allToTest.addAll(testData2.keySet());
            result.put(TestDataTypes.withIrrelevants, ImmutableMap.copyOf(testData2));
        }

        return ImmutableMap.copyOf(result);
    }

    /**
     * Returns a copy of the tag in which every missing language, script or region is filled in from
     * toAddIfMissing, and the variant from toAddIfMissing is always added.
     */
    private static XLanguageTag fluff(XLanguageTag type, Map<LstrType, String> toAddIfMissing) {
        XLanguageTag newTag = type;
        for (LstrType lstrType : LsrvCanonicalizer.LSRV) {
            if (type.get(lstrType).isEmpty() || lstrType == LstrType.variant) {
                newTag = newTag.set(lstrType, toAddIfMissing.get(lstrType));
            }
        }
        return newTag;
    }

    /**
     * Returns all the values of the given field that are used in the type or the replacement of any
     * alias rule.
     */
    public Collection<String> getInType(LstrType language) {
        return inType.get(language);
    }

    /**
     * Returns a sample value for the given field that does not appear in the type or replacement of
     * any alias rule, used for testing.
     */
    public String getIrrelevantField(LstrType language) {
        return irrelevant.get(language);
    }
}