• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package com.ibm.icu.dev.tool.locale;
4 
5 import java.io.BufferedWriter;
6 import java.io.File;
7 import java.io.FileOutputStream;
8 import java.io.IOException;
9 import java.io.OutputStreamWriter;
10 import java.io.PrintWriter;
11 import java.nio.ByteBuffer;
12 import java.nio.charset.StandardCharsets;
13 import java.util.ArrayList;
14 import java.util.Collection;
15 import java.util.Collections;
16 import java.util.HashSet;
17 import java.util.LinkedHashMap;
18 import java.util.LinkedHashSet;
19 import java.util.List;
20 import java.util.Map;
21 import java.util.Set;
22 import java.util.TreeMap;
23 import java.util.TreeSet;
24 
25 import com.ibm.icu.impl.ICUData;
26 import com.ibm.icu.impl.ICUResourceBundle;
27 import com.ibm.icu.impl.UResource;
28 import com.ibm.icu.impl.locale.LSR;
29 import com.ibm.icu.impl.locale.LocaleDistance;
30 import com.ibm.icu.impl.locale.XCldrStub.Multimap;
31 import com.ibm.icu.impl.locale.XCldrStub.Predicate;
32 import com.ibm.icu.impl.locale.XCldrStub.Splitter;
33 import com.ibm.icu.impl.locale.XCldrStub.TreeMultimap;
34 import com.ibm.icu.impl.locale.XLikelySubtags;
35 import com.ibm.icu.util.BytesTrieBuilder;
36 import com.ibm.icu.util.Output;
37 import com.ibm.icu.util.ULocale;
38 
39 public final class LocaleDistanceBuilder {
40     private static final String ANY = "�"; // matches any character. Uses value above any subtag.
41 
42     private static final boolean DEBUG_OUTPUT = LSR.DEBUG_OUTPUT;
43 
fixAny(String string)44     private static String fixAny(String string) {
45         return "*".equals(string) ? ANY : string;
46     }
47 
getSupplementalDataBundle(String name)48     private static ICUResourceBundle getSupplementalDataBundle(String name) {
49         return ICUResourceBundle.getBundleInstance(
50             ICUData.ICU_BASE_NAME, name,
51             ICUResourceBundle.ICU_DATA_CLASS_LOADER, ICUResourceBundle.OpenType.DIRECT);
52     }
53 
54     private static final class TerritoryContainment {
55         /** Directed, acyclic containment graph. Maps each container to its direct contents. */
56         final Multimap<String, String> graph = TreeMultimap.create();
57         /** Maps each container to all of its contents, direct and indirect. */
58         final Multimap<String, String> resolved = TreeMultimap.create();
59         /** Maps each container only to its leaf contents. */
60         final Multimap<String, String> toLeavesOnly = TreeMultimap.create();
61         /** The leaves of the graph. */
62         final Set<String> leaves;
63 
TerritoryContainment(ICUResourceBundle supplementalData)64         TerritoryContainment(ICUResourceBundle supplementalData) {
65             UResource.Value value = supplementalData.getValueWithFallback("territoryContainment");
66             UResource.Key key = new UResource.Key();
67             addContainments(key, value);
68             resolve("001");
69 
70             for (Map.Entry<String, Set<String>> entry : resolved.asMap().entrySet()) {
71                 String container = entry.getKey();
72                 for (String contained : entry.getValue()) {
73                     if (resolved.get(contained) == null) {  // a leaf node (usually a country)
74                         toLeavesOnly.put(container, contained);
75                     }
76                 }
77             }
78             leaves = toLeavesOnly.get("001");
79         }
80 
addContainments(UResource.Key key, UResource.Value value)81         private void addContainments(UResource.Key key, UResource.Value value) {
82             UResource.Table containers = value.getTable();
83             for (int i = 0; containers.getKeyAndValue(i, key, value); ++i) {
84                 if (key.length() <= 3) {
85                     String container = key.toString();
86                     String[] contents = value.getStringArrayOrStringAsArray();
87                     for (String s : contents) {
88                         graph.put(container, s);
89                     }
90                 } else {
91                     addContainments(key, value);  // containedGroupings etc.
92                 }
93             }
94         }
95 
resolve(String region)96         private Set<String> resolve(String region) {
97             Set<String> contained = graph.get(region);
98             if (contained == null) {
99                 return Collections.emptySet();
100             }
101             resolved.putAll(region, contained); // do top level
102             // then recursively
103             for (String subregion : contained) {
104                 resolved.putAll(region, resolve(subregion));
105             }
106             return resolved.get(region);
107         }
108     }
109 
110     private static final class Rule {
111         final List<String> desired;
112         final List<String> supported;
113         final int distance;
114         final boolean oneway;
115 
Rule(List<String> desired, List<String> supported, int distance, boolean oneway)116         Rule(List<String> desired, List<String> supported, int distance, boolean oneway) {
117             this.desired = desired;
118             this.supported = supported;
119             this.distance = distance;
120             this.oneway = oneway;
121         }
122     }
123 
makeUniqueIndex(Map<T, Integer> objectToInt, T source)124     private static final <T> int makeUniqueIndex(Map<T, Integer> objectToInt, T source) {
125         Integer result = objectToInt.get(source);
126         if (result == null) {
127             int newResult = objectToInt.size();
128             objectToInt.put(source, newResult);
129             return newResult;
130         } else {
131             return result;
132         }
133     }
134 
135     private static final class TrieBuilder {
136         byte[] bytes = new byte[24];
137         int length = 0;
138         BytesTrieBuilder tb = new BytesTrieBuilder();
139 
addStar(int value)140         void addStar(int value) {
141             assert value >= 0;
142             bytes[length++] = '*';
143             tb.add(bytes, length, value);
144         }
145 
addSubtag(String s, int value)146         void addSubtag(String s, int value) {
147             assert !s.isEmpty();
148             assert !s.equals(ANY);
149             int end = s.length() - 1;
150             for (int i = 0;; ++i) {
151                 char c = s.charAt(i);
152                 assert c <= 0x7f;
153                 if (i < end) {
154                     bytes[length++] = (byte) c;
155                 } else {
156                     // Mark the last character as a terminator to avoid overlap matches.
157                     bytes[length++] = (byte) (c | LocaleDistance.END_OF_SUBTAG);
158                     break;
159                 }
160             }
161             if (value >= 0) {
162                 tb.add(bytes, length, value);
163             }
164         }
165 
build()166         byte[] build() {
167             ByteBuffer buffer = tb.buildByteBuffer(BytesTrieBuilder.Option.SMALL);
168             // Allocate an array with just the necessary capacity,
169             // so that we do not hold on to a larger array for a long time.
170             byte[] bytes = new byte[buffer.remaining()];
171             buffer.get(bytes);
172             if (DEBUG_OUTPUT) {
173                 System.out.println("distance trie size: " + bytes.length + " bytes");
174             }
175             return bytes;
176         }
177     }
178 
179     private static final class DistanceTable {
180         int nodeDistance;  // distance for the lookup so far
181         final Map<String, Map<String, DistanceTable>> subtables;
182 
DistanceTable(int distance)183         DistanceTable(int distance) {
184             nodeDistance = distance;
185             subtables = new TreeMap<>();
186         }
187 
188         @Override
equals(Object obj)189         public boolean equals(Object obj) {
190             DistanceTable other;
191             return this == obj ||
192                     (obj != null
193                     && obj.getClass() == this.getClass()
194                     && nodeDistance == (other = (DistanceTable) obj).nodeDistance
195                     && subtables.equals(other.subtables));
196         }
197         @Override
hashCode()198         public int hashCode() {
199             return nodeDistance ^ subtables.hashCode();
200         }
201 
getDistance(String desired, String supported, Output<DistanceTable> distanceTable, boolean starEquals)202         private int getDistance(String desired, String supported,
203                 Output<DistanceTable> distanceTable, boolean starEquals) {
204             boolean star = false;
205             Map<String, DistanceTable> sub2 = subtables.get(desired);
206             if (sub2 == null) {
207                 sub2 = subtables.get(ANY); // <*, supported>
208                 star = true;
209             }
210             DistanceTable value = sub2.get(supported);   // <*/desired, supported>
211             if (value == null) {
212                 value = sub2.get(ANY);  // <*/desired, *>
213                 if (value == null && !star) {
214                     sub2 = subtables.get(ANY);   // <*, supported>
215                     value = sub2.get(supported);
216                     if (value == null) {
217                         value = sub2.get(ANY);   // <*, *>
218                     }
219                 }
220                 star = true;
221             }
222             if (distanceTable != null) {
223                 distanceTable.value = value;
224             }
225             int result = starEquals && star && desired.equals(supported) ? 0 : value.nodeDistance;
226             return result;
227         }
228 
getAnyAnyNode()229         private DistanceTable getAnyAnyNode() {
230             return subtables.get(ANY).get(ANY);
231         }
232 
copy(DistanceTable other)233         void copy(DistanceTable other) {
234             for (Map.Entry<String, Map<String, DistanceTable>> e1 : other.subtables.entrySet()) {
235                 for (Map.Entry<String, DistanceTable> e2 : e1.getValue().entrySet()) {
236                     DistanceTable value = e2.getValue();
237                     addSubtable(e1.getKey(), e2.getKey(), value.nodeDistance);
238                 }
239             }
240         }
241 
addSubtable(String desired, String supported, int distance)242         DistanceTable addSubtable(String desired, String supported, int distance) {
243             Map<String, DistanceTable> sub2 = subtables.get(desired);
244             if (sub2 == null) {
245                 subtables.put(desired, sub2 = new TreeMap<>());
246             }
247             DistanceTable oldNode = sub2.get(supported);
248             if (oldNode != null) {
249                 return oldNode;
250             }
251 
252             final DistanceTable newNode = new DistanceTable(distance);
253             sub2.put(supported, newNode);
254             return newNode;
255         }
256 
257         /**
258          * Return null if value doesn't exist
259          */
getNode(String desired, String supported)260         private DistanceTable getNode(String desired, String supported) {
261             Map<String, DistanceTable> sub2 = subtables.get(desired);
262             if (sub2 == null) {
263                 return null;
264             }
265             return sub2.get(supported);
266         }
267 
268 
269         /** add table for each subitem that matches and doesn't have a table already
270          */
addSubtables( String desired, String supported, Predicate<DistanceTable> action)271         void addSubtables(
272                 String desired, String supported,
273                 Predicate<DistanceTable> action) {
274             DistanceTable node = getNode(desired, supported);
275             if (node == null) {
276                 // get the distance it would have
277                 Output<DistanceTable> node2 = new Output<>();
278                 int distance = getDistance(desired, supported, node2, true);
279                 // now add it
280                 node = addSubtable(desired, supported, distance);
281                 if (node2.value != null) {
282                     DistanceTable nextTable = node2.value;
283                     node.copy(nextTable);
284                 }
285             }
286             action.test(node);
287         }
288 
addSubtables(String desiredLang, String supportedLang, String desiredScript, String supportedScript, int percentage)289         void addSubtables(String desiredLang, String supportedLang,
290                 String desiredScript, String supportedScript,
291                 int percentage) {
292 
293             // add to all the values that have the matching desiredLang and supportedLang
294             @SuppressWarnings("unused")
295             boolean haveKeys = false;
296             for (Map.Entry<String, Map<String, DistanceTable>> e1 : subtables.entrySet()) {
297                 String key1 = e1.getKey();
298                 final boolean desiredIsKey = desiredLang.equals(key1);
299                 if (desiredIsKey || desiredLang.equals(ANY)) {
300                     for (Map.Entry<String, DistanceTable> e2 : e1.getValue().entrySet()) {
301                         String key2 = e2.getKey();
302                         final boolean supportedIsKey = supportedLang.equals(key2);
303                         haveKeys |= (desiredIsKey && supportedIsKey);
304                         if (supportedIsKey || supportedLang.equals(ANY)) {
305                             DistanceTable value = e2.getValue();
306                             value.addSubtable(desiredScript, supportedScript, percentage);
307                         }
308                     }
309                 }
310             }
311             // now add the sequence explicitly
312             DistanceTable dt = new DistanceTable(-1);
313             dt.addSubtable(desiredScript, supportedScript, percentage);
314             CopyIfEmpty r = new CopyIfEmpty(dt);
315             addSubtables(desiredLang, supportedLang, r);
316         }
317 
addSubtables(String desiredLang, String supportedLang, String desiredScript, String supportedScript, String desiredRegion, String supportedRegion, int percentage)318         void addSubtables(String desiredLang, String supportedLang,
319                 String desiredScript, String supportedScript,
320                 String desiredRegion, String supportedRegion,
321                 int percentage) {
322 
323             // add to all the values that have the matching desiredLang and supportedLang
324             @SuppressWarnings("unused")
325             boolean haveKeys = false;
326             for (Map.Entry<String, Map<String, DistanceTable>> e1 : subtables.entrySet()) {
327                 String key1 = e1.getKey();
328                 final boolean desiredIsKey = desiredLang.equals(key1);
329                 if (desiredIsKey || desiredLang.equals(ANY)) {
330                     for (Map.Entry<String, DistanceTable> e2 : e1.getValue().entrySet()) {
331                         String key2 = e2.getKey();
332                         final boolean supportedIsKey = supportedLang.equals(key2);
333                         haveKeys |= (desiredIsKey && supportedIsKey);
334                         if (supportedIsKey || supportedLang.equals(ANY)) {
335                             DistanceTable value = e2.getValue();
336                             value.addSubtables(desiredScript, supportedScript, desiredRegion, supportedRegion, percentage);
337                         }
338                     }
339                 }
340             }
341             // now add the sequence explicitly
342 
343             DistanceTable dt = new DistanceTable(-1);
344             dt.addSubtable(desiredRegion, supportedRegion, percentage);
345             AddSub r = new AddSub(desiredScript, supportedScript, dt);
346             addSubtables(desiredLang,  supportedLang,  r);
347         }
348 
prune(int level, int[] distances)349         void prune(int level, int[] distances) {
350             for (Map<String, DistanceTable> suppNodeMap : subtables.values()) {
351                 for (DistanceTable node : suppNodeMap.values()) {
352                     node.prune(level + 1, distances);
353                 }
354             }
355             if (subtables.size() == 1) {
356                 DistanceTable next = getAnyAnyNode();
357                 if (level == 1) {
358                     // Remove script table -*-*-50 where there are no other script rules
359                     // and no following region rules.
360                     // If there are region rules, then mark this table for skipping.
361                     if (next.nodeDistance == distances[LocaleDistance.IX_DEF_SCRIPT_DISTANCE]) {
362                         if (next.subtables.isEmpty()) {
363                             subtables.clear();
364                         } else {
365                             nodeDistance |= LocaleDistance.DISTANCE_SKIP_SCRIPT;
366                         }
367                     }
368                 } else if (level == 2) {
369                     // Remove region table -*-*-4 where there are no other region rules.
370                     if (next.nodeDistance == distances[LocaleDistance.IX_DEF_REGION_DISTANCE]) {
371                         subtables.clear();
372                     }
373                 }
374             }
375         }
376 
377         @Override
toString()378         public String toString() {
379             StringBuilder sb = new StringBuilder("distance: ").append(nodeDistance).append('\n');
380             return toString("", sb).toString();
381         }
382 
toString(String indent, StringBuilder buffer)383         private StringBuilder toString(String indent, StringBuilder buffer) {
384             String indent2 = indent.isEmpty() ? "" : "\t";
385             for (Map.Entry<String, Map<String, DistanceTable>> e1 : subtables.entrySet()) {
386                 final Map<String, DistanceTable> subsubtable = e1.getValue();
387                 buffer.append(indent2).append(e1.getKey());
388                 String indent3 = "\t";
389                 for (Map.Entry<String, DistanceTable> e2 : subsubtable.entrySet()) {
390                     DistanceTable value = e2.getValue();
391                     buffer.append(indent3).append(e2.getKey());
392                     buffer.append('\t').append(value.nodeDistance);
393                     value.toString(indent+"\t\t\t", buffer);
394                     buffer.append('\n');
395                     indent3 = indent+'\t';
396                 }
397                 indent2 = indent;
398             }
399             return buffer;
400         }
401 
toTrie(TrieBuilder builder)402         void toTrie(TrieBuilder builder) {
403             if (nodeDistance >= 0 && (nodeDistance & LocaleDistance.DISTANCE_SKIP_SCRIPT) != 0) {
404                 getAnyAnyNode().toTrie(builder);
405                 return;
406             }
407             int startLength = builder.length;
408             for (Map.Entry<String, Map<String, DistanceTable>> desSuppNode : subtables.entrySet()) {
409                 String desired = desSuppNode.getKey();
410                 Map<String, DistanceTable> suppNodeMap = desSuppNode.getValue();
411                 // Collapse ANY-ANY into one single *.
412                 if (desired.equals(ANY)) {
413                     assert suppNodeMap.size() == 1;
414                     DistanceTable node = suppNodeMap.get(ANY);
415                     builder.addStar(node.nodeDistance);
416                     node.toTrie(builder);
417                 } else {
418                     builder.addSubtag(desired, -1);
419                     int desiredLength = builder.length;
420                     for (Map.Entry<String, DistanceTable> suppNode : suppNodeMap.entrySet()) {
421                         String supported = suppNode.getKey();
422                         assert !supported.equals(ANY);
423                         DistanceTable node = suppNode.getValue();
424                         builder.addSubtag(supported, node.nodeDistance);
425                         node.toTrie(builder);
426                         builder.length = desiredLength;
427                     }
428                 }
429                 builder.length = startLength;
430             }
431         }
432     }
433 
434     private static final class CopyIfEmpty implements Predicate<DistanceTable> {
435         private final DistanceTable toCopy;
CopyIfEmpty(DistanceTable resetIfNotNull)436         CopyIfEmpty(DistanceTable resetIfNotNull) {
437             this.toCopy = resetIfNotNull;
438         }
439         @Override
test(DistanceTable node)440         public boolean test(DistanceTable node) {
441             if (node.subtables.isEmpty()) {
442                 node.copy(toCopy);
443             }
444             return true;
445         }
446     }
447 
448     private static final class AddSub implements Predicate<DistanceTable> {
449         private final String desiredSub;
450         private final String supportedSub;
451         private final CopyIfEmpty r;
452 
AddSub(String desiredSub, String supportedSub, DistanceTable distanceTableToCopy)453         AddSub(String desiredSub, String supportedSub, DistanceTable distanceTableToCopy) {
454             this.r = new CopyIfEmpty(distanceTableToCopy);
455             this.desiredSub = desiredSub;
456             this.supportedSub = supportedSub;
457         }
458         @Override
test(DistanceTable node)459         public boolean test(DistanceTable node) {
460             if (node == null) {
461                 throw new IllegalArgumentException("bad structure");
462             } else {
463                 node.addSubtables(desiredSub, supportedSub, r);
464             }
465             return true;
466         }
467     }
468 
getIdsFromVariable( Multimap<String, String> variableToPartition, String variable)469     private static Collection<String> getIdsFromVariable(
470             Multimap<String, String> variableToPartition, String variable) {
471         if (variable.equals("*")) {
472             return Collections.singleton("*");
473         }
474         Collection<String> result = variableToPartition.get(variable);
475         if (result == null || result.isEmpty()) {
476             throw new IllegalArgumentException("Variable not defined: " + variable);
477         }
478         return result;
479     }
480 
481     // VisibleForTesting
build()482     public static LocaleDistance.Data build() {
483         // From CLDR supplementalData/languageMatching/languageMatches type="written_new"/
484         //   and then paradigmLocales, matchVariable, and the last languageMatch items.
485         ICUResourceBundle supplementalData = getSupplementalDataBundle("supplementalData");
486         String[] paradigms = supplementalData.getValueWithFallback(
487                 "languageMatchingInfo/written/paradigmLocales").getStringArray();
488         // LinkedHashSet for stable order; otherwise a unit test is flaky.
489         Set<LSR> paradigmLSRs = new LinkedHashSet<>();  // could be TreeSet if LSR were Comparable
490         for (String paradigm : paradigms) {
491             ULocale pl = new ULocale(paradigm);
492             LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl);
493             // Clear the LSR flags to make the data equality test in
494             // LocaleDistanceTest happy.
495             paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS));
496         }
497 
498         TerritoryContainment tc = new TerritoryContainment(supplementalData);
499 
500         RegionMapperBuilder rmb = new RegionMapperBuilder(tc);
501         UResource.Value value = supplementalData.getValueWithFallback(
502                 "languageMatchingInfo/written/matchVariable");
503         UResource.Table variables = value.getTable();
504         UResource.Key key = new UResource.Key();
505         for (int i = 0; variables.getKeyAndValue(i, key, value); ++i) {
506             String variable = "$" + key.toString();
507             String regions = value.getString();
508             rmb.add(variable, regions);
509         }
510 
511         // Parse the rules.
512         // We could almost process them while reading them from the source data,
513         // but a rule may contain a region code rather than a variable.
514         // We need to create a variable for each such region code
515         // before rmb.build() and before processing the rules.
516         Splitter bar = Splitter.on('_');
517 
518         int prevSize = 0;
519         value = supplementalData.getValueWithFallback("languageMatchingNew/written");
520         UResource.Array matches = value.getArray();
521         List<Rule> rules = new ArrayList<>(matches.getSize());
522         for (int i = 0; matches.getValue(i, value); ++i) {
523             String[] tuple = value.getStringArray();
524             int distance = Integer.parseInt(tuple[2]);
525             boolean oneway = tuple.length >= 4 && tuple[3].equals("1");
526             List<String> desired = new ArrayList<>(bar.splitToList(tuple[0]));
527             List<String> supported = new ArrayList<>(bar.splitToList(tuple[1]));
528             int size = desired.size();
529             if (size != supported.size()) {
530                 throw new IllegalArgumentException("uneven languageMatches pair");
531             }
532             if (size < prevSize) {
533                 throw new IllegalArgumentException("languageMatches out of order");
534             }
535             prevSize = size;
536             // Implementation shortcuts assume:
537             // - At any level, either both or neither rule subtags are *.
538             // - If the rule language subtags are *, the other-level subtags must also be *.
539             // If there are rules that do not fit these constraints,
540             // then we need to revise the implementation.
541             int langStars = checkStars(desired.get(0), supported.get(0), false);
542             if (size >= 2) {
543                 checkStars(desired.get(1), supported.get(1), langStars == 2);
544             }
545             if (size == 3) {
546                 checkStars(desired.get(2), supported.get(2), langStars == 2);
547                 rmb.ensureRegionIsVariable(desired);
548                 rmb.ensureRegionIsVariable(supported);
549             }
550             rules.add(new Rule(desired, supported, distance, oneway));
551         }
552 
553         rmb.build();
554 
555         /**
556          * Used for processing rules. At the start we have a variable setting like $A1=US+CA+MX.
557          * We generate a mapping from $A1 to a set of partitions {P1, P2}
558          * When we hit a rule that contains a variable,
559          * we replace that rule by multiple rules for the partitions.
560          */
561         final Multimap<String, String> variableToPartition = rmb.variableToPartitions;
562 
563         final DistanceTable defaultDistanceTable = new DistanceTable(-1);
564         int minRegionDistance = 100;
565         for (Rule rule : rules) {
566             List<String> desired = rule.desired;
567             List<String> supported = rule.supported;
568             if (rule.desired.size() <= 2) {
569                 // language-only or language-script
570                 add(defaultDistanceTable, desired, supported, rule.distance);
571                 if (!rule.oneway && !desired.equals(supported)) {
572                     add(defaultDistanceTable, supported, desired, rule.distance);
573                 }
574             } else {
575                 // language-script-region
576                 if (rule.distance < minRegionDistance) {
577                     minRegionDistance = rule.distance;
578                 }
579                 Collection<String> desiredRegions = getIdsFromVariable(variableToPartition, desired.get(2));
580                 Collection<String> supportedRegions = getIdsFromVariable(variableToPartition, supported.get(2));
581                 for (String desiredRegion2 : desiredRegions) {
582                     desired.set(2, desiredRegion2.toString()); // fix later
583                     for (String supportedRegion2 : supportedRegions) {
584                         supported.set(2, supportedRegion2.toString()); // fix later
585                         add(defaultDistanceTable, desired, supported, rule.distance);
586                         if (!rule.oneway) {
587                             add(defaultDistanceTable, supported, desired, rule.distance);
588                         }
589                     }
590                 }
591             }
592         }
593 
594         int[] distances = new int[LocaleDistance.IX_LIMIT];
595         DistanceTable node = defaultDistanceTable.getAnyAnyNode();
596         distances[LocaleDistance.IX_DEF_LANG_DISTANCE] = node.nodeDistance;
597         node = node.getAnyAnyNode();
598         distances[LocaleDistance.IX_DEF_SCRIPT_DISTANCE] = node.nodeDistance;
599         node = node.getAnyAnyNode();
600         distances[LocaleDistance.IX_DEF_REGION_DISTANCE] = node.nodeDistance;
601         distances[LocaleDistance.IX_MIN_REGION_DISTANCE] = minRegionDistance;
602 
603         defaultDistanceTable.prune(0, distances);
604         assert defaultDistanceTable.getAnyAnyNode().subtables.isEmpty();
605         defaultDistanceTable.subtables.remove(ANY);
606 
607         TrieBuilder trieBuilder = new TrieBuilder();
608         defaultDistanceTable.toTrie(trieBuilder);
609         byte[] trie = trieBuilder.build();
610         return new LocaleDistance.Data(
611                 trie, rmb.regionToPartitionsIndex, rmb.partitionArrays,
612                 paradigmLSRs, distances);
613     }
614 
checkStars(String desired, String supported, boolean allStars)615     private static int checkStars(String desired, String supported, boolean allStars) {
616         int stars = (desired.equals("*") ? 1 : 0) + (supported.equals("*") ? 1 : 0);
617         if (stars == 1) {
618             throw new IllegalArgumentException("either both or neither rule subtags must be *: " +
619                     desired + ", " + supported);
620         }
621         if (allStars && stars != 2) {
622             throw new IllegalArgumentException("both language subtags are * --> " +
623                     "both rule subtags on all levels must be *: " +
624                     desired + ", " + supported);
625         }
626         return stars;
627     }
628 
add(DistanceTable languageDesired2Supported, List<String> desired, List<String> supported, int percentage)629     private static void add(DistanceTable languageDesired2Supported,
630             List<String> desired, List<String> supported, int percentage) {
631         int size = desired.size();
632         if (size != supported.size() || size < 1 || size > 3) {
633             throw new IllegalArgumentException();
634         }
635         final String desiredLang = fixAny(desired.get(0));
636         final String supportedLang = fixAny(supported.get(0));
637         if (size == 1) {
638             languageDesired2Supported.addSubtable(desiredLang, supportedLang, percentage);
639         } else {
640             final String desiredScript = fixAny(desired.get(1));
641             final String supportedScript = fixAny(supported.get(1));
642             if (size == 2) {
643                 languageDesired2Supported.addSubtables(desiredLang, supportedLang, desiredScript, supportedScript, percentage);
644             } else {
645                 final String desiredRegion = fixAny(desired.get(2));
646                 final String supportedRegion = fixAny(supported.get(2));
647                 languageDesired2Supported.addSubtables(desiredLang, supportedLang, desiredScript, supportedScript, desiredRegion, supportedRegion, percentage);
648             }
649         }
650     }
651 
652     private static final class RegionMapperBuilder {
653         private final Set<String> variables = new HashSet<>();
654         final private Multimap<String, String> regionToRawPartition = TreeMultimap.create();
655         final private RegionSet regionSet;
656         private final TerritoryContainment tc;
657 
658         // build() output
659         Multimap<String, String> variableToPartitions;
660         private byte[] regionToPartitionsIndex;
661         private String[] partitionArrays;
662 
RegionMapperBuilder(TerritoryContainment tc)663         RegionMapperBuilder(TerritoryContainment tc) {
664             regionSet = new RegionSet(tc);
665             this.tc = tc;
666         }
667 
isKnownVariable(String variable)668         private boolean isKnownVariable(String variable) {
669             return variables.contains(variable) || variable.equals("*");
670         }
671 
add(String variable, String barString)672         void add(String variable, String barString) {
673             assert !isKnownVariable(variable);
674             assert variable.startsWith("$");
675             assert !variable.startsWith("$!");
676             variables.add(variable);
677             Set<String> tempRegions = regionSet.parseSet(barString);
678 
679             for (String region : tempRegions) {
680                 regionToRawPartition.put(region, variable);
681             }
682 
683             // now add the inverse variable
684 
685             Set<String> inverse = regionSet.inverse();
686             String inverseVariable = "$!" + variable.substring(1);
687             assert !isKnownVariable(inverseVariable);
688             variables.add(inverseVariable);
689             for (String region : inverse) {
690                 regionToRawPartition.put(region, inverseVariable);
691             }
692         }
693 
ensureRegionIsVariable(List<String> lsrList)694         void ensureRegionIsVariable(List<String> lsrList) {
695             String region = lsrList.get(2);
696             if (!isKnownVariable(region)) {
697                 assert LSR.indexForRegion(region) > 0;  // well-formed region subtag
698                 String variable = "$" + region;
699                 add(variable, region);
700                 lsrList.set(2, variable);
701             }
702         }
703 
build()704         void build() {
705             // Partitions as sets of variables.
706             // LinkedHashMap to store & number unique sets.
707             // Example: {"$!cnsar", "$!enUS", "$!maghreb", "$americas"}
708             Map<Collection<String>, Integer> partitionVariables = new LinkedHashMap<>();
709             // Partitions as sets of lookup ID strings.
710             // Example: {"1", "5"}
711             Map<Collection<String>, Integer> partitionStrings = new LinkedHashMap<>();
712             // pIndex 0: default value in regionToPartitionsIndex
713             Collection<String> noPartitions = Collections.singleton(".");
714             makeUniqueIndex(partitionStrings, noPartitions);
715 
716             // Example: "$americas" -> {"1", "5"}
717             variableToPartitions = TreeMultimap.create();
718             // Maps the index of each region code to a pIndex into partitionStrings.
719             regionToPartitionsIndex = new byte[LSR.REGION_INDEX_LIMIT];
720             // Maps a partition string to the set of region codes in that partition.
721             // Example: "5" -> {"PR", "US", "VI"}
722             Multimap<String, String> partitionToRegions = TreeMultimap.create();
723 
724             for (Map.Entry<String, Set<String>> e : regionToRawPartition.asMap().entrySet()) {
725                 final String region = e.getKey();
726                 final Collection<String> rawPartition = e.getValue();
727                 // Single-character string.
728                 // Must be an ASCII character and must not be '*'.
729                 // Used to start with α.
730                 char partitionChar = (char) ('0' + makeUniqueIndex(partitionVariables, rawPartition));
731                 assert partitionChar <= 0x7f;
732                 String partition = String.valueOf(partitionChar);
733                 int pIndex = makeUniqueIndex(partitionStrings, Collections.singleton(partition));
734                 // The pIndex must fit into a byte.
735                 // For Java code simplicity, we want it to also be non-negative.
736                 assert pIndex <= 0x7f;
737 
738                 regionToPartitionsIndex[LSR.indexForRegion(region)] = (byte) pIndex;
739                 partitionToRegions.put(partition, region);
740 
741                 for (String variable : rawPartition) {
742                     variableToPartitions.put(variable, partition);
743                 }
744             }
745 
746             // We get a mapping of each macro to the partitions it intersects with.
747             // Example: "419" -> {"1", "5"}
748             Multimap<String,String> macroToPartitions = TreeMultimap.create();
749             for (Map.Entry<String, Set<String>> e : tc.resolved.asMap().entrySet()) {
750                 String macro = e.getKey();
751                 for (Map.Entry<String, Set<String>> e2 : partitionToRegions.asMap().entrySet()) {
752                     String partition = e2.getKey();
753                     if (!Collections.disjoint(e.getValue(), e2.getValue())) {
754                         macroToPartitions.put(macro, partition);
755                     }
756                 }
757             }
758 
759             // Create a combined mapping from a region code, which can be a macro region,
760             // via the getRegionIndex() of that region code,
761             // to a set of single-character partition strings.
762             for (Map.Entry<String, Set<String>> m2p : macroToPartitions.asMap().entrySet()) {
763                 String macro = m2p.getKey();
764                 int regionIndex = LSR.indexForRegion(macro);
765                 if (regionToPartitionsIndex[regionIndex] == 0) {
766                     Set<String> partitions = m2p.getValue();
767                     int pIndex = makeUniqueIndex(partitionStrings, partitions);
768                     regionToPartitionsIndex[regionIndex] = (byte) pIndex;
769                 }
770             }
771             // LSR.indexForRegion(ill-formed region) returns 0.
772             // Its regionToPartitionsIndex must also be 0 for the noPartitions value.
773             assert regionToPartitionsIndex[0] == 0;
774 
775             // Turn the Collection of Collections of single-character strings
776             // into an array of strings.
777             Collection<Collection<String>> list = partitionStrings.keySet();
778             partitionArrays = new String[list.size()];
779             StringBuilder sb = new StringBuilder();
780             int i = 0;
781             for (Collection<String> partitions : list) {
782                 assert !partitions.isEmpty();
783                 sb.setLength(0);
784                 for (String p : partitions) {
785                     assert p.length() == 1;
786                     sb.append(p);
787                 }
788                 partitionArrays[i++] = sb.toString();
789             }
790         }
791     }
792 
793     /**
794      * Parses a string of regions like "US+005-BR" and produces a set of resolved regions.
795      * All macroregions are fully resolved to sets of non-macro regions.
796      * <br>Syntax is simple for now:
797      * <pre>regionSet := region ([-+] region)*</pre>
798      * No precedence, so "x+y-y+z" is (((x+y)-y)+z) NOT (x+y)-(y+z)
799      */
800     private static final class RegionSet {
801         private enum Operation {add, remove}
802         private final TerritoryContainment tc;
803         // temporaries used in processing
804         final private Set<String> tempRegions = new TreeSet<>();
805         private Operation operation = null;
806 
RegionSet(TerritoryContainment tc)807         RegionSet(TerritoryContainment tc) {
808             this.tc = tc;
809         }
810 
parseSet(String barString)811         private Set<String> parseSet(String barString) {
812             operation = Operation.add;
813             int last = 0;
814             tempRegions.clear();
815             int i = 0;
816             for (; i < barString.length(); ++i) {
817                 char c = barString.charAt(i); // UTF16 is ok, since syntax is only ascii
818                 switch(c) {
819                 case '+':
820                     add(barString, last, i);
821                     last = i+1;
822                     operation = Operation.add;
823                     break;
824                 case '-':
825                     add(barString, last, i);
826                     last = i+1;
827                     operation = Operation.remove;
828                     break;
829                 }
830             }
831             add(barString, last, i);
832             return tempRegions;
833         }
834 
inverse()835         private Set<String> inverse() {
836             TreeSet<String> result = new TreeSet<>(tc.leaves);
837             result.removeAll(tempRegions);
838             return result;
839         }
840 
add(String barString, int last, int i)841         private void add(String barString, int last, int i) {
842             if (i > last) {
843                 String region = barString.substring(last,i);
844                 changeSet(operation, region);
845             }
846         }
847 
changeSet(Operation operation, String region)848         private void changeSet(Operation operation, String region) {
849             Collection<String> contained = tc.toLeavesOnly.get(region);
850             if (contained != null && !contained.isEmpty()) {
851                 if (Operation.add == operation) {
852                     tempRegions.addAll(contained);
853                 } else {
854                     tempRegions.removeAll(contained);
855                 }
856             } else if (Operation.add == operation) {
857                 tempRegions.add(region);
858             } else {
859                 tempRegions.remove(region);
860             }
861         }
862     }
863 
864     private static final String TXT_PATH = "/tmp";
865     private static final String TXT_FILE_BASE_NAME = "langInfo";
866     private static final String TXT_FILE_NAME = TXT_FILE_BASE_NAME + ".txt";
867 
openWriter()868     private static PrintWriter openWriter() throws IOException {
869         File file = new File(TXT_PATH, TXT_FILE_NAME);
870         return new PrintWriter(
871             new BufferedWriter(
872                 new OutputStreamWriter(
873                     new FileOutputStream(file), StandardCharsets.UTF_8), 4096));
874     }
875 
printManyHexBytes(PrintWriter out, byte[] bytes)876     private static void printManyHexBytes(PrintWriter out, byte[] bytes) {
877         for (int i = 0;; ++i) {
878             if (i == bytes.length) {
879                 out.println();
880                 break;
881             }
882             if (i != 0 && (i & 0xf) == 0) {
883                 out.println();
884             }
885             out.format("%02x", bytes[i] & 0xff);
886         }
887     }
888 
main(String[] args)889     public static final void main(String[] args) throws IOException {
890         XLikelySubtags.Data likelyData = LikelySubtagsBuilder.build();
891         LocaleDistance.Data distanceData = build();
892         System.out.println("Writing LocaleDistance.Data to " + TXT_PATH + '/' + TXT_FILE_NAME);
893         try (PrintWriter out = openWriter()) {
894             out.println("// © 2019 and later: Unicode, Inc. and others.\n" +
895                     "// License & terms of use: http://www.unicode.org/copyright.html\n" +
896                     "// Generated by ICU4J LocaleDistanceBuilder.\n" +
897                     TXT_FILE_BASE_NAME + ":table(nofallback){");
898             out.println("    likely{");
899             out.println("        languageAliases{  // " + likelyData.languageAliases.size());
900             for (Map.Entry<String, String> entry :
901                     new TreeMap<>(likelyData.languageAliases).entrySet()) {
902                 out.println("            \"" + entry.getKey() + "\",\"" + entry.getValue() + "\",");
903             }
904             out.println("        }  // languageAliases");
905 
906             out.println("        regionAliases{  // " + likelyData.regionAliases.size());
907             for (Map.Entry<String, String> entry :
908                     new TreeMap<>(likelyData.regionAliases).entrySet()) {
909                 out.println("            \"" + entry.getKey() + "\",\"" + entry.getValue() + "\",");
910             }
911             out.println("        }  // regionAliases");
912 
913             out.println("        trie:bin{  // BytesTrie: " + likelyData.trie.length + " bytes");
914             printManyHexBytes(out, likelyData.trie);
915             out.println("        }  // trie");
916 
917             out.println("        lsrs{  // " + likelyData.lsrs.length);
918             for (LSR lsr : likelyData.lsrs) {
919                 out.println("            \"" + lsr.language + "\",\"" +
920                         lsr.script + "\",\"" + lsr.region + "\",");
921             }
922             out.println("        }  // lsrs");
923             out.println("    }  // likely");
924 
925             out.println("    match{");
926             out.println("        trie:bin{  // BytesTrie: " + distanceData.trie.length + " bytes");
927             printManyHexBytes(out, distanceData.trie);
928             out.println("        }  // trie");
929 
930             out.println("        regionToPartitions:bin{  // " +
931                     distanceData.regionToPartitionsIndex.length + " bytes");
932             printManyHexBytes(out, distanceData.regionToPartitionsIndex);
933             out.println("        }  // regionToPartitions");
934 
935             out.print("        partitions{");
936             boolean first = true;
937             for (String p : distanceData.partitionArrays) {
938                 if (first) {
939                     first = false;
940                 } else {
941                     out.append(',');
942                 }
943                 out.append('"').print(p);
944                 out.append('"');
945             }
946             out.println("}");
947 
948             out.println("        paradigms{");
949             for (LSR lsr : distanceData.paradigmLSRs) {
950                 out.println("            \"" + lsr.language + "\",\"" +
951                         lsr.script + "\",\"" + lsr.region + "\",");
952             }
953             out.println("        }");
954 
955             out.print("        distances:intvector{");
956             first = true;
957             for (int d : distanceData.distances) {
958                 if (first) {
959                     first = false;
960                 } else {
961                     out.append(',');
962                 }
963                 out.print(d);
964             }
965             out.println("}");
966 
967             out.println("    }  // match");
968             out.println("}");
969         }
970     }
971 }
972