1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package com.ibm.icu.dev.tool.locale; 4 5 import java.io.BufferedWriter; 6 import java.io.File; 7 import java.io.FileOutputStream; 8 import java.io.IOException; 9 import java.io.OutputStreamWriter; 10 import java.io.PrintWriter; 11 import java.nio.ByteBuffer; 12 import java.nio.charset.StandardCharsets; 13 import java.util.ArrayList; 14 import java.util.Collection; 15 import java.util.Collections; 16 import java.util.HashSet; 17 import java.util.LinkedHashMap; 18 import java.util.LinkedHashSet; 19 import java.util.List; 20 import java.util.Map; 21 import java.util.Set; 22 import java.util.TreeMap; 23 import java.util.TreeSet; 24 25 import com.ibm.icu.impl.ICUData; 26 import com.ibm.icu.impl.ICUResourceBundle; 27 import com.ibm.icu.impl.UResource; 28 import com.ibm.icu.impl.locale.LSR; 29 import com.ibm.icu.impl.locale.LocaleDistance; 30 import com.ibm.icu.impl.locale.XCldrStub.Multimap; 31 import com.ibm.icu.impl.locale.XCldrStub.Predicate; 32 import com.ibm.icu.impl.locale.XCldrStub.Splitter; 33 import com.ibm.icu.impl.locale.XCldrStub.TreeMultimap; 34 import com.ibm.icu.impl.locale.XLikelySubtags; 35 import com.ibm.icu.util.BytesTrieBuilder; 36 import com.ibm.icu.util.Output; 37 import com.ibm.icu.util.ULocale; 38 39 public final class LocaleDistanceBuilder { 40 private static final String ANY = "�"; // matches any character. Uses value above any subtag. 41 42 private static final boolean DEBUG_OUTPUT = LSR.DEBUG_OUTPUT; 43 fixAny(String string)44 private static String fixAny(String string) { 45 return "*".equals(string) ? ANY : string; 46 } 47 getSupplementalDataBundle(String name)48 private static ICUResourceBundle getSupplementalDataBundle(String name) { 49 return ICUResourceBundle.getBundleInstance( 50 ICUData.ICU_BASE_NAME, name, 51 ICUResourceBundle.ICU_DATA_CLASS_LOADER, ICUResourceBundle.OpenType.DIRECT); 52 } 53 54 private static final class TerritoryContainment { 55 /** Directed, acyclic containment graph. Maps each container to its direct contents. */ 56 final Multimap<String, String> graph = TreeMultimap.create(); 57 /** Maps each container to all of its contents, direct and indirect. */ 58 final Multimap<String, String> resolved = TreeMultimap.create(); 59 /** Maps each container only to its leaf contents. */ 60 final Multimap<String, String> toLeavesOnly = TreeMultimap.create(); 61 /** The leaves of the graph. */ 62 final Set<String> leaves; 63 TerritoryContainment(ICUResourceBundle supplementalData)64 TerritoryContainment(ICUResourceBundle supplementalData) { 65 UResource.Value value = supplementalData.getValueWithFallback("territoryContainment"); 66 UResource.Key key = new UResource.Key(); 67 addContainments(key, value); 68 resolve("001"); 69 70 for (Map.Entry<String, Set<String>> entry : resolved.asMap().entrySet()) { 71 String container = entry.getKey(); 72 for (String contained : entry.getValue()) { 73 if (resolved.get(contained) == null) { // a leaf node (usually a country) 74 toLeavesOnly.put(container, contained); 75 } 76 } 77 } 78 leaves = toLeavesOnly.get("001"); 79 } 80 addContainments(UResource.Key key, UResource.Value value)81 private void addContainments(UResource.Key key, UResource.Value value) { 82 UResource.Table containers = value.getTable(); 83 for (int i = 0; containers.getKeyAndValue(i, key, value); ++i) { 84 if (key.length() <= 3) { 85 String container = key.toString(); 86 String[] contents = value.getStringArrayOrStringAsArray(); 87 for (String s : contents) { 88 graph.put(container, s); 89 } 90 } else { 91 addContainments(key, value); // containedGroupings etc. 92 } 93 } 94 } 95 resolve(String region)96 private Set<String> resolve(String region) { 97 Set<String> contained = graph.get(region); 98 if (contained == null) { 99 return Collections.emptySet(); 100 } 101 resolved.putAll(region, contained); // do top level 102 // then recursively 103 for (String subregion : contained) { 104 resolved.putAll(region, resolve(subregion)); 105 } 106 return resolved.get(region); 107 } 108 } 109 110 private static final class Rule { 111 final List<String> desired; 112 final List<String> supported; 113 final int distance; 114 final boolean oneway; 115 Rule(List<String> desired, List<String> supported, int distance, boolean oneway)116 Rule(List<String> desired, List<String> supported, int distance, boolean oneway) { 117 this.desired = desired; 118 this.supported = supported; 119 this.distance = distance; 120 this.oneway = oneway; 121 } 122 } 123 makeUniqueIndex(Map<T, Integer> objectToInt, T source)124 private static final <T> int makeUniqueIndex(Map<T, Integer> objectToInt, T source) { 125 Integer result = objectToInt.get(source); 126 if (result == null) { 127 int newResult = objectToInt.size(); 128 objectToInt.put(source, newResult); 129 return newResult; 130 } else { 131 return result; 132 } 133 } 134 135 private static final class TrieBuilder { 136 byte[] bytes = new byte[24]; 137 int length = 0; 138 BytesTrieBuilder tb = new BytesTrieBuilder(); 139 addStar(int value)140 void addStar(int value) { 141 assert value >= 0; 142 bytes[length++] = '*'; 143 tb.add(bytes, length, value); 144 } 145 addSubtag(String s, int value)146 void addSubtag(String s, int value) { 147 assert !s.isEmpty(); 148 assert !s.equals(ANY); 149 int end = s.length() - 1; 150 for (int i = 0;; ++i) { 151 char c = s.charAt(i); 152 assert c <= 0x7f; 153 if (i < end) { 154 bytes[length++] = (byte) c; 155 } else { 156 // Mark the last character as a terminator to avoid overlap matches. 157 bytes[length++] = (byte) (c | LocaleDistance.END_OF_SUBTAG); 158 break; 159 } 160 } 161 if (value >= 0) { 162 tb.add(bytes, length, value); 163 } 164 } 165 build()166 byte[] build() { 167 ByteBuffer buffer = tb.buildByteBuffer(BytesTrieBuilder.Option.SMALL); 168 // Allocate an array with just the necessary capacity, 169 // so that we do not hold on to a larger array for a long time. 170 byte[] bytes = new byte[buffer.remaining()]; 171 buffer.get(bytes); 172 if (DEBUG_OUTPUT) { 173 System.out.println("distance trie size: " + bytes.length + " bytes"); 174 } 175 return bytes; 176 } 177 } 178 179 private static final class DistanceTable { 180 int nodeDistance; // distance for the lookup so far 181 final Map<String, Map<String, DistanceTable>> subtables; 182 DistanceTable(int distance)183 DistanceTable(int distance) { 184 nodeDistance = distance; 185 subtables = new TreeMap<>(); 186 } 187 188 @Override equals(Object obj)189 public boolean equals(Object obj) { 190 DistanceTable other; 191 return this == obj || 192 (obj != null 193 && obj.getClass() == this.getClass() 194 && nodeDistance == (other = (DistanceTable) obj).nodeDistance 195 && subtables.equals(other.subtables)); 196 } 197 @Override hashCode()198 public int hashCode() { 199 return nodeDistance ^ subtables.hashCode(); 200 } 201 getDistance(String desired, String supported, Output<DistanceTable> distanceTable, boolean starEquals)202 private int getDistance(String desired, String supported, 203 Output<DistanceTable> distanceTable, boolean starEquals) { 204 boolean star = false; 205 Map<String, DistanceTable> sub2 = subtables.get(desired); 206 if (sub2 == null) { 207 sub2 = subtables.get(ANY); // <*, supported> 208 star = true; 209 } 210 DistanceTable value = sub2.get(supported); // <*/desired, supported> 211 if (value == null) { 212 value = sub2.get(ANY); // <*/desired, *> 213 if (value == null && !star) { 214 sub2 = subtables.get(ANY); // <*, supported> 215 value = sub2.get(supported); 216 if (value == null) { 217 value = sub2.get(ANY); // <*, *> 218 } 219 } 220 star = true; 221 } 222 if (distanceTable != null) { 223 distanceTable.value = value; 224 } 225 int result = starEquals && star && desired.equals(supported) ? 0 : value.nodeDistance; 226 return result; 227 } 228 getAnyAnyNode()229 private DistanceTable getAnyAnyNode() { 230 return subtables.get(ANY).get(ANY); 231 } 232 copy(DistanceTable other)233 void copy(DistanceTable other) { 234 for (Map.Entry<String, Map<String, DistanceTable>> e1 : other.subtables.entrySet()) { 235 for (Map.Entry<String, DistanceTable> e2 : e1.getValue().entrySet()) { 236 DistanceTable value = e2.getValue(); 237 addSubtable(e1.getKey(), e2.getKey(), value.nodeDistance); 238 } 239 } 240 } 241 addSubtable(String desired, String supported, int distance)242 DistanceTable addSubtable(String desired, String supported, int distance) { 243 Map<String, DistanceTable> sub2 = subtables.get(desired); 244 if (sub2 == null) { 245 subtables.put(desired, sub2 = new TreeMap<>()); 246 } 247 DistanceTable oldNode = sub2.get(supported); 248 if (oldNode != null) { 249 return oldNode; 250 } 251 252 final DistanceTable newNode = new DistanceTable(distance); 253 sub2.put(supported, newNode); 254 return newNode; 255 } 256 257 /** 258 * Return null if value doesn't exist 259 */ getNode(String desired, String supported)260 private DistanceTable getNode(String desired, String supported) { 261 Map<String, DistanceTable> sub2 = subtables.get(desired); 262 if (sub2 == null) { 263 return null; 264 } 265 return sub2.get(supported); 266 } 267 268 269 /** add table for each subitem that matches and doesn't have a table already 270 */ addSubtables( String desired, String supported, Predicate<DistanceTable> action)271 void addSubtables( 272 String desired, String supported, 273 Predicate<DistanceTable> action) { 274 DistanceTable node = getNode(desired, supported); 275 if (node == null) { 276 // get the distance it would have 277 Output<DistanceTable> node2 = new Output<>(); 278 int distance = getDistance(desired, supported, node2, true); 279 // now add it 280 node = addSubtable(desired, supported, distance); 281 if (node2.value != null) { 282 DistanceTable nextTable = node2.value; 283 node.copy(nextTable); 284 } 285 } 286 action.test(node); 287 } 288 addSubtables(String desiredLang, String supportedLang, String desiredScript, String supportedScript, int percentage)289 void addSubtables(String desiredLang, String supportedLang, 290 String desiredScript, String supportedScript, 291 int percentage) { 292 293 // add to all the values that have the matching desiredLang and supportedLang 294 @SuppressWarnings("unused") 295 boolean haveKeys = false; 296 for (Map.Entry<String, Map<String, DistanceTable>> e1 : subtables.entrySet()) { 297 String key1 = e1.getKey(); 298 final boolean desiredIsKey = desiredLang.equals(key1); 299 if (desiredIsKey || desiredLang.equals(ANY)) { 300 for (Map.Entry<String, DistanceTable> e2 : e1.getValue().entrySet()) { 301 String key2 = e2.getKey(); 302 final boolean supportedIsKey = supportedLang.equals(key2); 303 haveKeys |= (desiredIsKey && supportedIsKey); 304 if (supportedIsKey || supportedLang.equals(ANY)) { 305 DistanceTable value = e2.getValue(); 306 value.addSubtable(desiredScript, supportedScript, percentage); 307 } 308 } 309 } 310 } 311 // now add the sequence explicitly 312 DistanceTable dt = new DistanceTable(-1); 313 dt.addSubtable(desiredScript, supportedScript, percentage); 314 CopyIfEmpty r = new CopyIfEmpty(dt); 315 addSubtables(desiredLang, supportedLang, r); 316 } 317 addSubtables(String desiredLang, String supportedLang, String desiredScript, String supportedScript, String desiredRegion, String supportedRegion, int percentage)318 void addSubtables(String desiredLang, String supportedLang, 319 String desiredScript, String supportedScript, 320 String desiredRegion, String supportedRegion, 321 int percentage) { 322 323 // add to all the values that have the matching desiredLang and supportedLang 324 @SuppressWarnings("unused") 325 boolean haveKeys = false; 326 for (Map.Entry<String, Map<String, DistanceTable>> e1 : subtables.entrySet()) { 327 String key1 = e1.getKey(); 328 final boolean desiredIsKey = desiredLang.equals(key1); 329 if (desiredIsKey || desiredLang.equals(ANY)) { 330 for (Map.Entry<String, DistanceTable> e2 : e1.getValue().entrySet()) { 331 String key2 = e2.getKey(); 332 final boolean supportedIsKey = supportedLang.equals(key2); 333 haveKeys |= (desiredIsKey && supportedIsKey); 334 if (supportedIsKey || supportedLang.equals(ANY)) { 335 DistanceTable value = e2.getValue(); 336 value.addSubtables(desiredScript, supportedScript, desiredRegion, supportedRegion, percentage); 337 } 338 } 339 } 340 } 341 // now add the sequence explicitly 342 343 DistanceTable dt = new DistanceTable(-1); 344 dt.addSubtable(desiredRegion, supportedRegion, percentage); 345 AddSub r = new AddSub(desiredScript, supportedScript, dt); 346 addSubtables(desiredLang, supportedLang, r); 347 } 348 prune(int level, int[] distances)349 void prune(int level, int[] distances) { 350 for (Map<String, DistanceTable> suppNodeMap : subtables.values()) { 351 for (DistanceTable node : suppNodeMap.values()) { 352 node.prune(level + 1, distances); 353 } 354 } 355 if (subtables.size() == 1) { 356 DistanceTable next = getAnyAnyNode(); 357 if (level == 1) { 358 // Remove script table -*-*-50 where there are no other script rules 359 // and no following region rules. 360 // If there are region rules, then mark this table for skipping. 361 if (next.nodeDistance == distances[LocaleDistance.IX_DEF_SCRIPT_DISTANCE]) { 362 if (next.subtables.isEmpty()) { 363 subtables.clear(); 364 } else { 365 nodeDistance |= LocaleDistance.DISTANCE_SKIP_SCRIPT; 366 } 367 } 368 } else if (level == 2) { 369 // Remove region table -*-*-4 where there are no other region rules. 370 if (next.nodeDistance == distances[LocaleDistance.IX_DEF_REGION_DISTANCE]) { 371 subtables.clear(); 372 } 373 } 374 } 375 } 376 377 @Override toString()378 public String toString() { 379 StringBuilder sb = new StringBuilder("distance: ").append(nodeDistance).append('\n'); 380 return toString("", sb).toString(); 381 } 382 toString(String indent, StringBuilder buffer)383 private StringBuilder toString(String indent, StringBuilder buffer) { 384 String indent2 = indent.isEmpty() ? "" : "\t"; 385 for (Map.Entry<String, Map<String, DistanceTable>> e1 : subtables.entrySet()) { 386 final Map<String, DistanceTable> subsubtable = e1.getValue(); 387 buffer.append(indent2).append(e1.getKey()); 388 String indent3 = "\t"; 389 for (Map.Entry<String, DistanceTable> e2 : subsubtable.entrySet()) { 390 DistanceTable value = e2.getValue(); 391 buffer.append(indent3).append(e2.getKey()); 392 buffer.append('\t').append(value.nodeDistance); 393 value.toString(indent+"\t\t\t", buffer); 394 buffer.append('\n'); 395 indent3 = indent+'\t'; 396 } 397 indent2 = indent; 398 } 399 return buffer; 400 } 401 toTrie(TrieBuilder builder)402 void toTrie(TrieBuilder builder) { 403 if (nodeDistance >= 0 && (nodeDistance & LocaleDistance.DISTANCE_SKIP_SCRIPT) != 0) { 404 getAnyAnyNode().toTrie(builder); 405 return; 406 } 407 int startLength = builder.length; 408 for (Map.Entry<String, Map<String, DistanceTable>> desSuppNode : subtables.entrySet()) { 409 String desired = desSuppNode.getKey(); 410 Map<String, DistanceTable> suppNodeMap = desSuppNode.getValue(); 411 // Collapse ANY-ANY into one single *. 412 if (desired.equals(ANY)) { 413 assert suppNodeMap.size() == 1; 414 DistanceTable node = suppNodeMap.get(ANY); 415 builder.addStar(node.nodeDistance); 416 node.toTrie(builder); 417 } else { 418 builder.addSubtag(desired, -1); 419 int desiredLength = builder.length; 420 for (Map.Entry<String, DistanceTable> suppNode : suppNodeMap.entrySet()) { 421 String supported = suppNode.getKey(); 422 assert !supported.equals(ANY); 423 DistanceTable node = suppNode.getValue(); 424 builder.addSubtag(supported, node.nodeDistance); 425 node.toTrie(builder); 426 builder.length = desiredLength; 427 } 428 } 429 builder.length = startLength; 430 } 431 } 432 } 433 434 private static final class CopyIfEmpty implements Predicate<DistanceTable> { 435 private final DistanceTable toCopy; CopyIfEmpty(DistanceTable resetIfNotNull)436 CopyIfEmpty(DistanceTable resetIfNotNull) { 437 this.toCopy = resetIfNotNull; 438 } 439 @Override test(DistanceTable node)440 public boolean test(DistanceTable node) { 441 if (node.subtables.isEmpty()) { 442 node.copy(toCopy); 443 } 444 return true; 445 } 446 } 447 448 private static final class AddSub implements Predicate<DistanceTable> { 449 private final String desiredSub; 450 private final String supportedSub; 451 private final CopyIfEmpty r; 452 AddSub(String desiredSub, String supportedSub, DistanceTable distanceTableToCopy)453 AddSub(String desiredSub, String supportedSub, DistanceTable distanceTableToCopy) { 454 this.r = new CopyIfEmpty(distanceTableToCopy); 455 this.desiredSub = desiredSub; 456 this.supportedSub = supportedSub; 457 } 458 @Override test(DistanceTable node)459 public boolean test(DistanceTable node) { 460 if (node == null) { 461 throw new IllegalArgumentException("bad structure"); 462 } else { 463 node.addSubtables(desiredSub, supportedSub, r); 464 } 465 return true; 466 } 467 } 468 getIdsFromVariable( Multimap<String, String> variableToPartition, String variable)469 private static Collection<String> getIdsFromVariable( 470 Multimap<String, String> variableToPartition, String variable) { 471 if (variable.equals("*")) { 472 return Collections.singleton("*"); 473 } 474 Collection<String> result = variableToPartition.get(variable); 475 if (result == null || result.isEmpty()) { 476 throw new IllegalArgumentException("Variable not defined: " + variable); 477 } 478 return result; 479 } 480 481 // VisibleForTesting build()482 public static LocaleDistance.Data build() { 483 // From CLDR supplementalData/languageMatching/languageMatches type="written_new"/ 484 // and then paradigmLocales, matchVariable, and the last languageMatch items. 485 ICUResourceBundle supplementalData = getSupplementalDataBundle("supplementalData"); 486 String[] paradigms = supplementalData.getValueWithFallback( 487 "languageMatchingInfo/written/paradigmLocales").getStringArray(); 488 // LinkedHashSet for stable order; otherwise a unit test is flaky. 489 Set<LSR> paradigmLSRs = new LinkedHashSet<>(); // could be TreeSet if LSR were Comparable 490 for (String paradigm : paradigms) { 491 ULocale pl = new ULocale(paradigm); 492 LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl); 493 // Clear the LSR flags to make the data equality test in 494 // LocaleDistanceTest happy. 495 paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS)); 496 } 497 498 TerritoryContainment tc = new TerritoryContainment(supplementalData); 499 500 RegionMapperBuilder rmb = new RegionMapperBuilder(tc); 501 UResource.Value value = supplementalData.getValueWithFallback( 502 "languageMatchingInfo/written/matchVariable"); 503 UResource.Table variables = value.getTable(); 504 UResource.Key key = new UResource.Key(); 505 for (int i = 0; variables.getKeyAndValue(i, key, value); ++i) { 506 String variable = "$" + key.toString(); 507 String regions = value.getString(); 508 rmb.add(variable, regions); 509 } 510 511 // Parse the rules. 512 // We could almost process them while reading them from the source data, 513 // but a rule may contain a region code rather than a variable. 514 // We need to create a variable for each such region code 515 // before rmb.build() and before processing the rules. 516 Splitter bar = Splitter.on('_'); 517 518 int prevSize = 0; 519 value = supplementalData.getValueWithFallback("languageMatchingNew/written"); 520 UResource.Array matches = value.getArray(); 521 List<Rule> rules = new ArrayList<>(matches.getSize()); 522 for (int i = 0; matches.getValue(i, value); ++i) { 523 String[] tuple = value.getStringArray(); 524 int distance = Integer.parseInt(tuple[2]); 525 boolean oneway = tuple.length >= 4 && tuple[3].equals("1"); 526 List<String> desired = new ArrayList<>(bar.splitToList(tuple[0])); 527 List<String> supported = new ArrayList<>(bar.splitToList(tuple[1])); 528 int size = desired.size(); 529 if (size != supported.size()) { 530 throw new IllegalArgumentException("uneven languageMatches pair"); 531 } 532 if (size < prevSize) { 533 throw new IllegalArgumentException("languageMatches out of order"); 534 } 535 prevSize = size; 536 // Implementation shortcuts assume: 537 // - At any level, either both or neither rule subtags are *. 538 // - If the rule language subtags are *, the other-level subtags must also be *. 539 // If there are rules that do not fit these constraints, 540 // then we need to revise the implementation. 541 int langStars = checkStars(desired.get(0), supported.get(0), false); 542 if (size >= 2) { 543 checkStars(desired.get(1), supported.get(1), langStars == 2); 544 } 545 if (size == 3) { 546 checkStars(desired.get(2), supported.get(2), langStars == 2); 547 rmb.ensureRegionIsVariable(desired); 548 rmb.ensureRegionIsVariable(supported); 549 } 550 rules.add(new Rule(desired, supported, distance, oneway)); 551 } 552 553 rmb.build(); 554 555 /** 556 * Used for processing rules. At the start we have a variable setting like $A1=US+CA+MX. 557 * We generate a mapping from $A1 to a set of partitions {P1, P2} 558 * When we hit a rule that contains a variable, 559 * we replace that rule by multiple rules for the partitions. 560 */ 561 final Multimap<String, String> variableToPartition = rmb.variableToPartitions; 562 563 final DistanceTable defaultDistanceTable = new DistanceTable(-1); 564 int minRegionDistance = 100; 565 for (Rule rule : rules) { 566 List<String> desired = rule.desired; 567 List<String> supported = rule.supported; 568 if (rule.desired.size() <= 2) { 569 // language-only or language-script 570 add(defaultDistanceTable, desired, supported, rule.distance); 571 if (!rule.oneway && !desired.equals(supported)) { 572 add(defaultDistanceTable, supported, desired, rule.distance); 573 } 574 } else { 575 // language-script-region 576 if (rule.distance < minRegionDistance) { 577 minRegionDistance = rule.distance; 578 } 579 Collection<String> desiredRegions = getIdsFromVariable(variableToPartition, desired.get(2)); 580 Collection<String> supportedRegions = getIdsFromVariable(variableToPartition, supported.get(2)); 581 for (String desiredRegion2 : desiredRegions) { 582 desired.set(2, desiredRegion2.toString()); // fix later 583 for (String supportedRegion2 : supportedRegions) { 584 supported.set(2, supportedRegion2.toString()); // fix later 585 add(defaultDistanceTable, desired, supported, rule.distance); 586 if (!rule.oneway) { 587 add(defaultDistanceTable, supported, desired, rule.distance); 588 } 589 } 590 } 591 } 592 } 593 594 int[] distances = new int[LocaleDistance.IX_LIMIT]; 595 DistanceTable node = defaultDistanceTable.getAnyAnyNode(); 596 distances[LocaleDistance.IX_DEF_LANG_DISTANCE] = node.nodeDistance; 597 node = node.getAnyAnyNode(); 598 distances[LocaleDistance.IX_DEF_SCRIPT_DISTANCE] = node.nodeDistance; 599 node = node.getAnyAnyNode(); 600 distances[LocaleDistance.IX_DEF_REGION_DISTANCE] = node.nodeDistance; 601 distances[LocaleDistance.IX_MIN_REGION_DISTANCE] = minRegionDistance; 602 603 defaultDistanceTable.prune(0, distances); 604 assert defaultDistanceTable.getAnyAnyNode().subtables.isEmpty(); 605 defaultDistanceTable.subtables.remove(ANY); 606 607 TrieBuilder trieBuilder = new TrieBuilder(); 608 defaultDistanceTable.toTrie(trieBuilder); 609 byte[] trie = trieBuilder.build(); 610 return new LocaleDistance.Data( 611 trie, rmb.regionToPartitionsIndex, rmb.partitionArrays, 612 paradigmLSRs, distances); 613 } 614 checkStars(String desired, String supported, boolean allStars)615 private static int checkStars(String desired, String supported, boolean allStars) { 616 int stars = (desired.equals("*") ? 1 : 0) + (supported.equals("*") ? 1 : 0); 617 if (stars == 1) { 618 throw new IllegalArgumentException("either both or neither rule subtags must be *: " + 619 desired + ", " + supported); 620 } 621 if (allStars && stars != 2) { 622 throw new IllegalArgumentException("both language subtags are * --> " + 623 "both rule subtags on all levels must be *: " + 624 desired + ", " + supported); 625 } 626 return stars; 627 } 628 add(DistanceTable languageDesired2Supported, List<String> desired, List<String> supported, int percentage)629 private static void add(DistanceTable languageDesired2Supported, 630 List<String> desired, List<String> supported, int percentage) { 631 int size = desired.size(); 632 if (size != supported.size() || size < 1 || size > 3) { 633 throw new IllegalArgumentException(); 634 } 635 final String desiredLang = fixAny(desired.get(0)); 636 final String supportedLang = fixAny(supported.get(0)); 637 if (size == 1) { 638 languageDesired2Supported.addSubtable(desiredLang, supportedLang, percentage); 639 } else { 640 final String desiredScript = fixAny(desired.get(1)); 641 final String supportedScript = fixAny(supported.get(1)); 642 if (size == 2) { 643 languageDesired2Supported.addSubtables(desiredLang, supportedLang, desiredScript, supportedScript, percentage); 644 } else { 645 final String desiredRegion = fixAny(desired.get(2)); 646 final String supportedRegion = fixAny(supported.get(2)); 647 languageDesired2Supported.addSubtables(desiredLang, supportedLang, desiredScript, supportedScript, desiredRegion, supportedRegion, percentage); 648 } 649 } 650 } 651 652 private static final class RegionMapperBuilder { 653 private final Set<String> variables = new HashSet<>(); 654 final private Multimap<String, String> regionToRawPartition = TreeMultimap.create(); 655 final private RegionSet regionSet; 656 private final TerritoryContainment tc; 657 658 // build() output 659 Multimap<String, String> variableToPartitions; 660 private byte[] regionToPartitionsIndex; 661 private String[] partitionArrays; 662 RegionMapperBuilder(TerritoryContainment tc)663 RegionMapperBuilder(TerritoryContainment tc) { 664 regionSet = new RegionSet(tc); 665 this.tc = tc; 666 } 667 isKnownVariable(String variable)668 private boolean isKnownVariable(String variable) { 669 return variables.contains(variable) || variable.equals("*"); 670 } 671 add(String variable, String barString)672 void add(String variable, String barString) { 673 assert !isKnownVariable(variable); 674 assert variable.startsWith("$"); 675 assert !variable.startsWith("$!"); 676 variables.add(variable); 677 Set<String> tempRegions = regionSet.parseSet(barString); 678 679 for (String region : tempRegions) { 680 regionToRawPartition.put(region, variable); 681 } 682 683 // now add the inverse variable 684 685 Set<String> inverse = regionSet.inverse(); 686 String inverseVariable = "$!" + variable.substring(1); 687 assert !isKnownVariable(inverseVariable); 688 variables.add(inverseVariable); 689 for (String region : inverse) { 690 regionToRawPartition.put(region, inverseVariable); 691 } 692 } 693 ensureRegionIsVariable(List<String> lsrList)694 void ensureRegionIsVariable(List<String> lsrList) { 695 String region = lsrList.get(2); 696 if (!isKnownVariable(region)) { 697 assert LSR.indexForRegion(region) > 0; // well-formed region subtag 698 String variable = "$" + region; 699 add(variable, region); 700 lsrList.set(2, variable); 701 } 702 } 703 build()704 void build() { 705 // Partitions as sets of variables. 706 // LinkedHashMap to store & number unique sets. 707 // Example: {"$!cnsar", "$!enUS", "$!maghreb", "$americas"} 708 Map<Collection<String>, Integer> partitionVariables = new LinkedHashMap<>(); 709 // Partitions as sets of lookup ID strings. 710 // Example: {"1", "5"} 711 Map<Collection<String>, Integer> partitionStrings = new LinkedHashMap<>(); 712 // pIndex 0: default value in regionToPartitionsIndex 713 Collection<String> noPartitions = Collections.singleton("."); 714 makeUniqueIndex(partitionStrings, noPartitions); 715 716 // Example: "$americas" -> {"1", "5"} 717 variableToPartitions = TreeMultimap.create(); 718 // Maps the index of each region code to a pIndex into partitionStrings. 719 regionToPartitionsIndex = new byte[LSR.REGION_INDEX_LIMIT]; 720 // Maps a partition string to the set of region codes in that partition. 721 // Example: "5" -> {"PR", "US", "VI"} 722 Multimap<String, String> partitionToRegions = TreeMultimap.create(); 723 724 for (Map.Entry<String, Set<String>> e : regionToRawPartition.asMap().entrySet()) { 725 final String region = e.getKey(); 726 final Collection<String> rawPartition = e.getValue(); 727 // Single-character string. 728 // Must be an ASCII character and must not be '*'. 729 // Used to start with α. 730 char partitionChar = (char) ('0' + makeUniqueIndex(partitionVariables, rawPartition)); 731 assert partitionChar <= 0x7f; 732 String partition = String.valueOf(partitionChar); 733 int pIndex = makeUniqueIndex(partitionStrings, Collections.singleton(partition)); 734 // The pIndex must fit into a byte. 735 // For Java code simplicity, we want it to also be non-negative. 736 assert pIndex <= 0x7f; 737 738 regionToPartitionsIndex[LSR.indexForRegion(region)] = (byte) pIndex; 739 partitionToRegions.put(partition, region); 740 741 for (String variable : rawPartition) { 742 variableToPartitions.put(variable, partition); 743 } 744 } 745 746 // We get a mapping of each macro to the partitions it intersects with. 747 // Example: "419" -> {"1", "5"} 748 Multimap<String,String> macroToPartitions = TreeMultimap.create(); 749 for (Map.Entry<String, Set<String>> e : tc.resolved.asMap().entrySet()) { 750 String macro = e.getKey(); 751 for (Map.Entry<String, Set<String>> e2 : partitionToRegions.asMap().entrySet()) { 752 String partition = e2.getKey(); 753 if (!Collections.disjoint(e.getValue(), e2.getValue())) { 754 macroToPartitions.put(macro, partition); 755 } 756 } 757 } 758 759 // Create a combined mapping from a region code, which can be a macro region, 760 // via the getRegionIndex() of that region code, 761 // to a set of single-character partition strings. 762 for (Map.Entry<String, Set<String>> m2p : macroToPartitions.asMap().entrySet()) { 763 String macro = m2p.getKey(); 764 int regionIndex = LSR.indexForRegion(macro); 765 if (regionToPartitionsIndex[regionIndex] == 0) { 766 Set<String> partitions = m2p.getValue(); 767 int pIndex = makeUniqueIndex(partitionStrings, partitions); 768 regionToPartitionsIndex[regionIndex] = (byte) pIndex; 769 } 770 } 771 // LSR.indexForRegion(ill-formed region) returns 0. 772 // Its regionToPartitionsIndex must also be 0 for the noPartitions value. 773 assert regionToPartitionsIndex[0] == 0; 774 775 // Turn the Collection of Collections of single-character strings 776 // into an array of strings. 777 Collection<Collection<String>> list = partitionStrings.keySet(); 778 partitionArrays = new String[list.size()]; 779 StringBuilder sb = new StringBuilder(); 780 int i = 0; 781 for (Collection<String> partitions : list) { 782 assert !partitions.isEmpty(); 783 sb.setLength(0); 784 for (String p : partitions) { 785 assert p.length() == 1; 786 sb.append(p); 787 } 788 partitionArrays[i++] = sb.toString(); 789 } 790 } 791 } 792 793 /** 794 * Parses a string of regions like "US+005-BR" and produces a set of resolved regions. 795 * All macroregions are fully resolved to sets of non-macro regions. 796 * <br>Syntax is simple for now: 797 * <pre>regionSet := region ([-+] region)*</pre> 798 * No precedence, so "x+y-y+z" is (((x+y)-y)+z) NOT (x+y)-(y+z) 799 */ 800 private static final class RegionSet { 801 private enum Operation {add, remove} 802 private final TerritoryContainment tc; 803 // temporaries used in processing 804 final private Set<String> tempRegions = new TreeSet<>(); 805 private Operation operation = null; 806 RegionSet(TerritoryContainment tc)807 RegionSet(TerritoryContainment tc) { 808 this.tc = tc; 809 } 810 parseSet(String barString)811 private Set<String> parseSet(String barString) { 812 operation = Operation.add; 813 int last = 0; 814 tempRegions.clear(); 815 int i = 0; 816 for (; i < barString.length(); ++i) { 817 char c = barString.charAt(i); // UTF16 is ok, since syntax is only ascii 818 switch(c) { 819 case '+': 820 add(barString, last, i); 821 last = i+1; 822 operation = Operation.add; 823 break; 824 case '-': 825 add(barString, last, i); 826 last = i+1; 827 operation = Operation.remove; 828 break; 829 } 830 } 831 add(barString, last, i); 832 return tempRegions; 833 } 834 inverse()835 private Set<String> inverse() { 836 TreeSet<String> result = new TreeSet<>(tc.leaves); 837 result.removeAll(tempRegions); 838 return result; 839 } 840 add(String barString, int last, int i)841 private void add(String barString, int last, int i) { 842 if (i > last) { 843 String region = barString.substring(last,i); 844 changeSet(operation, region); 845 } 846 } 847 changeSet(Operation operation, String region)848 private void changeSet(Operation operation, String region) { 849 Collection<String> contained = tc.toLeavesOnly.get(region); 850 if (contained != null && !contained.isEmpty()) { 851 if (Operation.add == operation) { 852 tempRegions.addAll(contained); 853 } else { 854 tempRegions.removeAll(contained); 855 } 856 } else if (Operation.add == operation) { 857 tempRegions.add(region); 858 } else { 859 tempRegions.remove(region); 860 } 861 } 862 } 863 864 private static final String TXT_PATH = "/tmp"; 865 private static final String TXT_FILE_BASE_NAME = "langInfo"; 866 private static final String TXT_FILE_NAME = TXT_FILE_BASE_NAME + ".txt"; 867 openWriter()868 private static PrintWriter openWriter() throws IOException { 869 File file = new File(TXT_PATH, TXT_FILE_NAME); 870 return new PrintWriter( 871 new BufferedWriter( 872 new OutputStreamWriter( 873 new FileOutputStream(file), StandardCharsets.UTF_8), 4096)); 874 } 875 printManyHexBytes(PrintWriter out, byte[] bytes)876 private static void printManyHexBytes(PrintWriter out, byte[] bytes) { 877 for (int i = 0;; ++i) { 878 if (i == bytes.length) { 879 out.println(); 880 break; 881 } 882 if (i != 0 && (i & 0xf) == 0) { 883 out.println(); 884 } 885 out.format("%02x", bytes[i] & 0xff); 886 } 887 } 888 main(String[] args)889 public static final void main(String[] args) throws IOException { 890 XLikelySubtags.Data likelyData = LikelySubtagsBuilder.build(); 891 LocaleDistance.Data distanceData = build(); 892 System.out.println("Writing LocaleDistance.Data to " + TXT_PATH + '/' + TXT_FILE_NAME); 893 try (PrintWriter out = openWriter()) { 894 out.println("// © 2019 and later: Unicode, Inc. and others.\n" + 895 "// License & terms of use: http://www.unicode.org/copyright.html\n" + 896 "// Generated by ICU4J LocaleDistanceBuilder.\n" + 897 TXT_FILE_BASE_NAME + ":table(nofallback){"); 898 out.println(" likely{"); 899 out.println(" languageAliases{ // " + likelyData.languageAliases.size()); 900 for (Map.Entry<String, String> entry : 901 new TreeMap<>(likelyData.languageAliases).entrySet()) { 902 out.println(" \"" + entry.getKey() + "\",\"" + entry.getValue() + "\","); 903 } 904 out.println(" } // languageAliases"); 905 906 out.println(" regionAliases{ // " + likelyData.regionAliases.size()); 907 for (Map.Entry<String, String> entry : 908 new TreeMap<>(likelyData.regionAliases).entrySet()) { 909 out.println(" \"" + entry.getKey() + "\",\"" + entry.getValue() + "\","); 910 } 911 out.println(" } // regionAliases"); 912 913 out.println(" trie:bin{ // BytesTrie: " + likelyData.trie.length + " bytes"); 914 printManyHexBytes(out, likelyData.trie); 915 out.println(" } // trie"); 916 917 out.println(" lsrs{ // " + likelyData.lsrs.length); 918 for (LSR lsr : likelyData.lsrs) { 919 out.println(" \"" + lsr.language + "\",\"" + 920 lsr.script + "\",\"" + lsr.region + "\","); 921 } 922 out.println(" } // lsrs"); 923 out.println(" } // likely"); 924 925 out.println(" match{"); 926 out.println(" trie:bin{ // BytesTrie: " + distanceData.trie.length + " bytes"); 927 printManyHexBytes(out, distanceData.trie); 928 out.println(" } // trie"); 929 930 out.println(" regionToPartitions:bin{ // " + 931 distanceData.regionToPartitionsIndex.length + " bytes"); 932 printManyHexBytes(out, distanceData.regionToPartitionsIndex); 933 out.println(" } // regionToPartitions"); 934 935 out.print(" partitions{"); 936 boolean first = true; 937 for (String p : distanceData.partitionArrays) { 938 if (first) { 939 first = false; 940 } else { 941 out.append(','); 942 } 943 out.append('"').print(p); 944 out.append('"'); 945 } 946 out.println("}"); 947 948 out.println(" paradigms{"); 949 for (LSR lsr : distanceData.paradigmLSRs) { 950 out.println(" \"" + lsr.language + "\",\"" + 951 lsr.script + "\",\"" + lsr.region + "\","); 952 } 953 out.println(" }"); 954 955 out.print(" distances:intvector{"); 956 first = true; 957 for (int d : distanceData.distances) { 958 if (first) { 959 first = false; 960 } else { 961 out.append(','); 962 } 963 out.print(d); 964 } 965 out.println("}"); 966 967 out.println(" } // match"); 968 out.println("}"); 969 } 970 } 971 } 972