1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu.localedistance; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap; 7 8 import java.util.Map.Entry; 9 import java.util.Set; 10 import java.util.regex.Pattern; 11 12 import org.unicode.cldr.api.AttributeKey; 13 import org.unicode.cldr.api.CldrData; 14 import org.unicode.cldr.api.CldrPath; 15 import org.unicode.cldr.api.PathMatcher; 16 17 import com.google.common.base.Splitter; 18 import com.google.common.collect.ImmutableSet; 19 import com.google.common.collect.ImmutableSetMultimap; 20 import com.google.common.collect.SetMultimap; 21 import com.google.common.collect.Sets; 22 import com.google.common.collect.SortedSetMultimap; 23 import com.google.common.collect.TreeMultimap; 24 25 /** 26 * Territory containment graph. This is built from CLDR supplemental data and 27 * represents all territories and their containment, including macro regions 28 * such as {@code "016"}. The root node of the graph is {@code "001"}. 29 */ 30 final class TerritoryContainment { 31 // CLDR paths for containment data. 32 private static final PathMatcher CONTAINMENT_PATH = 33 PathMatcher.of("//supplementalData/territoryContainment/group[@type=*]"); 34 private static final AttributeKey TYPE = AttributeKey.keyOf("group", "type"); 35 private static final AttributeKey CONTAINS = AttributeKey.keyOf("group", "contains"); 36 37 // Standard CLDR list values are split by space. 38 // NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", ""). 39 private static final Splitter LIST_SPLITTER = 40 Splitter.on(' ').trimResults().omitEmptyStrings(); 41 // The world region must be the only root in the graph. 42 private static final String WORLD = "001"; 43 private static final Pattern REGION = Pattern.compile("[A-Z]{2}|[0-9]{3}"); 44 45 /** 46 * Returns the territory containment information described by the given CLDR 47 * supplemental data. 48 */ getContainment(CldrData supplementalData)49 public static TerritoryContainment getContainment(CldrData supplementalData) { 50 // Directed, acyclic containment graph. Maps each territory to its direct contents. 51 // Note that since things like deprecated regions are included here, this allows 52 // sub-regions to have more than one parent. 53 SortedSetMultimap<String, String> graph = TreeMultimap.create(); 54 supplementalData.accept(CldrData.PathOrder.DTD, v -> { 55 CldrPath path = v.getPath(); 56 if (CONTAINMENT_PATH.matches(path)) { 57 graph.putAll(v.get(TYPE), LIST_SPLITTER.split(v.get(CONTAINS))); 58 } 59 }); 60 return new TerritoryContainment(ImmutableSetMultimap.copyOf(graph)); 61 } 62 63 /** Maps each macro-region to all its leaf contents (direct and indirect). */ 64 private final ImmutableSetMultimap<String, String> macroToLeafRegions; 65 TerritoryContainment(ImmutableSetMultimap<String, String> graph)66 private TerritoryContainment(ImmutableSetMultimap<String, String> graph) { 67 // Do some double checking of the CLDR data. 68 graph.values().forEach( 69 r -> checkArgument(REGION.matcher(r).matches(), "bad region '%s' in: %s", r, graph)); 70 checkArgument(graph.containsKey(WORLD), "missing world region '%s'", WORLD); 71 // There should be only one "root" in the graph, so every other region should be 72 // contained by something. 73 Set<String> allContained = ImmutableSet.copyOf(graph.values()); 74 Set<String> roots = ImmutableSet.copyOf(Sets.difference(graph.keySet(), allContained)); 75 checkArgument(roots.equals(ImmutableSet.of(WORLD)), 76 "world region '%s' must be the only containment graph root (was %s)", WORLD, roots); 77 78 // Start with a copy of the direct containment graph (but still pass in the direct 79 // graph to avoid issues with concurrent modification). 80 // If the graph is cyclic, this step will never terminate and run out of memory 81 // (and since this is a build-time tool, that's probably fine). 82 SortedSetMultimap<String, String> resolved = TreeMultimap.create(graph); 83 resolve(WORLD, graph, resolved); 84 // For leaf regions (direct or indirect) just retain any sub-regions which don't 85 // have child regions from the resolved graph. 86 this.macroToLeafRegions = resolved.entries().stream() 87 // Only keep macro regions (leaf regions don't have child regions by definition). 88 .filter(e -> !graph.get(e.getKey()).isEmpty()) 89 // Only keep the single-region e.getValue() if it is a leaf region. 90 .filter(e -> graph.get(e.getValue()).isEmpty()) 91 .collect(toImmutableSetMultimap(Entry::getKey, Entry::getValue)); 92 } 93 94 // Recursively resolve the region and its child regions. resolve( String region, SetMultimap<String, String> graph, SetMultimap<String, String> resolved)95 private static Set<String> resolve( 96 String region, SetMultimap<String, String> graph, SetMultimap<String, String> resolved) { 97 graph.get(region).forEach(sub -> resolved.putAll(region, resolve(sub, graph, resolved))); 98 return resolved.get(region); 99 } 100 101 /** 102 * Returns the leaf regions contained in the given region (if the given region is a 103 * leaf region, then the empty set is returned). 104 */ getLeafRegionsOf(String region)105 public ImmutableSet<String> getLeafRegionsOf(String region) { 106 return macroToLeafRegions.get(region); 107 } 108 109 /** Returns all leaf regions. */ getLeafRegions()110 public ImmutableSet<String> getLeafRegions() { 111 return macroToLeafRegions.get(WORLD); 112 } 113 114 /** Returns all macro regions. */ getMacroRegions()115 public ImmutableSet<String> getMacroRegions() { 116 return macroToLeafRegions.keySet(); 117 } 118 } 119