1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu.localedistance; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkNotNull; 7 import static com.google.common.base.Preconditions.checkState; 8 import static java.util.Arrays.asList; 9 import static org.unicode.cldr.api.CldrData.PathOrder.DTD; 10 import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL; 11 12 import java.io.IOException; 13 import java.util.ArrayList; 14 import java.util.Arrays; 15 import java.util.Collection; 16 import java.util.LinkedHashSet; 17 import java.util.List; 18 import java.util.Map; 19 import java.util.Optional; 20 import java.util.Set; 21 import java.util.logging.Logger; 22 import java.util.stream.Collectors; 23 import java.util.stream.Stream; 24 25 import org.unicode.cldr.api.AttributeKey; 26 import org.unicode.cldr.api.CldrData; 27 import org.unicode.cldr.api.CldrDataSupplier; 28 import org.unicode.cldr.api.CldrPath; 29 import org.unicode.cldr.api.CldrValue; 30 import org.unicode.cldr.api.PathMatcher; 31 import org.unicode.icu.tool.cldrtoicu.DebugWriter; 32 import org.unicode.icu.tool.cldrtoicu.IcuData; 33 import org.unicode.icu.tool.cldrtoicu.RbPath; 34 import org.unicode.icu.tool.cldrtoicu.RbValue; 35 36 import com.google.common.annotations.VisibleForTesting; 37 import com.google.common.base.Splitter; 38 import com.google.common.collect.ImmutableList; 39 import com.google.common.collect.ImmutableSet; 40 import com.google.common.collect.Iterables; 41 import com.google.common.primitives.Bytes; 42 import com.ibm.icu.impl.locale.LSR; 43 import com.ibm.icu.impl.locale.LocaleDistance; 44 import com.ibm.icu.impl.locale.XLikelySubtags; 45 import com.ibm.icu.util.ULocale; 46 47 /** 48 * Mapper for generating locale distance tables from CLDR language data. 49 * 50 * <p>Note that this is an atypical mapper which does a lot more processing than other 51 * ICU mapper classes and relies on several auxilliary classes (which is why it's in a 52 * different package). Conceptually it's still a "mapper" though, just not a simple one. 53 * 54 * <p>This mapper was converted from the LocaleDistanceBuilder code in the ICU4J project. 55 */ 56 public final class LocaleDistanceMapper { 57 private static final Logger logger = Logger.getLogger(LocaleDistanceMapper.class.getName()); 58 59 // All the language matching data comes from the "written_new" language data in 60 // "common/supplemental/languageInfo.xml". 61 private static final PathMatcher WRITTEN_LANGUAGE_PREFIX = 62 PathMatcher.of("//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]"); 63 64 // Definitions of region containment variables used when expressing match distances. E.g.: 65 // <matchVariable id="$maghreb" value="MA+DZ+TN+LY+MR+EH"/> 66 private static final PathMatcher VARIABLE_PATH = 67 WRITTEN_LANGUAGE_PREFIX.withSuffix("matchVariable[@id=*]"); 68 private static final AttributeKey VARIABLE_ID = AttributeKey.keyOf("matchVariable", "id"); 69 private static final AttributeKey VARIABLE_VALUE = AttributeKey.keyOf("matchVariable", "value"); 70 71 // Language distance data, including wildcards and variable references (possibly negated). E.g.: 72 // <languageMatch desired="ja_Latn" supported="ja_Jpan" distance="5" oneway="true"/> 73 // <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/> 74 // <languageMatch desired="en_*_$!enUS" supported="en_*_GB" distance="3"/> 75 private static final PathMatcher LANGUAGE_MATCH_PATH = 76 WRITTEN_LANGUAGE_PREFIX.withSuffix("languageMatch[@desired=*][@supported=*]"); 77 private static final AttributeKey MATCH_DESIRED = 78 AttributeKey.keyOf("languageMatch", "desired"); 79 private static final AttributeKey MATCH_SUPPORTED = 80 AttributeKey.keyOf("languageMatch", "supported"); 81 private static final AttributeKey MATCH_DISTANCE = 82 AttributeKey.keyOf("languageMatch", "distance"); 83 // Optional, assume false if not present. 84 private static final AttributeKey MATCH_ONEWAY = 85 AttributeKey.keyOf("languageMatch", "oneway"); 86 87 // Singleton element containing the list of special case "paradigm" locales, which should 88 // always be preferred if there is a tie. E.g.: 89 // <paradigmLocales locales="en en_GB es es_419 pt_BR pt_PT"/> 90 // 91 // Since there are no distinguishing attributes for this path, there can only be one 92 // instance which we can just lookup directly. 93 private static final CldrPath PARADIGM_LOCALES_PATH = CldrPath.parseDistinguishingPath( 94 "//supplementalData/languageMatching/languageMatches[@type=\"written_new\"]/paradigmLocales"); 95 private static final AttributeKey PARADIGM_LOCALES = 96 AttributeKey.keyOf("paradigmLocales", "locales"); 97 98 // NOTE: You must omit empty strings, since otherwise " foo " becomes ("", "foo", ""). 99 private static final Splitter LIST_SPLITTER = 100 Splitter.on(' ').trimResults().omitEmptyStrings(); 101 102 // Output resource bundle paths, split into two basic groups for likely locale mappings 103 // and match data. 104 private static final RbPath LIKELY_LANGUAGES = RbPath.of("likely", "languageAliases"); 105 private static final RbPath LIKELY_REGIONS = RbPath.of("likely", "regionAliases"); 106 private static final RbPath LIKELY_TRIE = RbPath.of("likely", "trie:bin"); 107 private static final RbPath LIKELY_LSRS = RbPath.of("likely", "lsrs"); 108 109 private static final RbPath MATCH_TRIE = RbPath.of("match", "trie:bin"); 110 private static final RbPath MATCH_REGION_TO_PARTITIONS = RbPath.of("match", "regionToPartitions:bin"); 111 private static final RbPath MATCH_PARTITIONS = RbPath.of("match", "partitions"); 112 private static final RbPath MATCH_PARADIGMS = RbPath.of("match", "paradigms"); 113 private static final RbPath MATCH_DISTANCES = RbPath.of("match", "distances:intvector"); 114 115 // To split locale specifications (e.g. "ja_Latn" or "en_*_$!enUS"). 116 private static final Splitter UNDERSCORE = Splitter.on('_'); 117 118 /** 119 * Processes data from the given supplier to generate locale matcher ICU data. 120 * 121 * @param src the CLDR data supplier to process. 122 * @return the IcuData instance to be written to a file. 123 */ process(CldrDataSupplier src)124 public static IcuData process(CldrDataSupplier src) { 125 return process(src.getDataForType(SUPPLEMENTAL)); 126 } 127 128 @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier. process(CldrData data)129 static IcuData process(CldrData data) { 130 IcuData icuData = new IcuData("langInfo", false); 131 132 XLikelySubtags.Data likelyData = LikelySubtagsBuilder.build(data); 133 icuData.add(LIKELY_LANGUAGES, ofMapEntries(likelyData.languageAliases)); 134 icuData.add(LIKELY_REGIONS, ofMapEntries(likelyData.regionAliases)); 135 icuData.add(LIKELY_TRIE, ofBytes(likelyData.trie)); 136 icuData.add(LIKELY_LSRS, ofLsrs(asList(likelyData.lsrs))); 137 138 LocaleDistance.Data distanceData = buildDistanceData(data); 139 icuData.add(MATCH_TRIE, ofBytes(distanceData.trie)); 140 icuData.add(MATCH_REGION_TO_PARTITIONS, ofBytes(distanceData.regionToPartitionsIndex)); 141 icuData.add(MATCH_PARTITIONS, RbValue.of(distanceData.partitionArrays)); 142 icuData.add(MATCH_PARADIGMS, ofLsrs(distanceData.paradigmLSRs)); 143 icuData.add(MATCH_DISTANCES, RbValue.of(Arrays.stream(distanceData.distances).mapToObj(Integer::toString))); 144 return icuData; 145 } 146 147 /** 148 * A simple holder for language, script and region which allows for wildcards (i.e. "*") 149 * and variables to represent partitions of regions (e.g. "$enUS"). Minimal additional 150 * validation is done on incoming fields as data is assumed to be correct. 151 */ 152 private static final class LsrSpec { 153 /** 154 * Parse a raw specification string (e.g. "en", "ja_Latn", "*_*_*", "ar_*_$maghreb" 155 * or "en_*_GB") into a structured spec. Note that if the specification string 156 * contains a "bare" region (e.g. "en_*_GB") then it is registered as a variable in 157 * the given RegionMapper builder, so the returned {@code LsrSpec} will be 158 * {@code "en_*_$GB"}. 159 */ parse(String rawSpec, PartitionInfo.Builder rmb)160 public static LsrSpec parse(String rawSpec, PartitionInfo.Builder rmb) { 161 List<String> parts = UNDERSCORE.splitToList(rawSpec); 162 checkArgument(parts.size() <= 3, "invalid raw LSR specification: %s", rawSpec); 163 String language = parts.get(0); 164 Optional<String> script = parts.size() > 1 ? Optional.of(parts.get(1)) : Optional.empty(); 165 // While parsing the region part, ensure any "bare" region subtags are converted 166 // to variables (e.g. "GB" -> "$GB") and registered with the parition map. 167 Optional<String> region = 168 parts.size() > 2 ? Optional.of(rmb.ensureVariable(parts.get(2))) : Optional.empty(); 169 return new LsrSpec(language, script, region); 170 } 171 172 // A language subtag (e.g. "en") or "*". 173 private final String language; 174 // If present, a script subtag (e.g. "Latn") or "*". 175 private final Optional<String> script; 176 // If present, a registered variable with '$' prefix (e.g. "$foo" or "$GB") or "*". 177 private final Optional<String> regionVariable; 178 LsrSpec(String language, Optional<String> script, Optional<String> regionVariable)179 private LsrSpec(String language, Optional<String> script, Optional<String> regionVariable) { 180 this.language = language; 181 this.script = script; 182 this.regionVariable = regionVariable; 183 // Implementation shortcuts assume: 184 // - If the language subtags are '*', the other-level subtags must also be '*' (if present). 185 // If there are rules that do not fit these constraints, we need to revise the implementation. 186 if (isAny(language)) { 187 script.ifPresent( 188 s -> checkArgument(isAny(s), "expected wildcard script, got: %s", script)); 189 regionVariable.ifPresent( 190 r -> checkArgument(isAny(r), "expected wildcard region, got: %s", regionVariable)); 191 } 192 } 193 getLanguage()194 public String getLanguage() { 195 return language; 196 } 197 getScript()198 public String getScript() { 199 return script.orElseThrow(() -> new IllegalArgumentException("no script available: " + this)); 200 } 201 getRegionVariable()202 public String getRegionVariable() { 203 return regionVariable.orElseThrow(() -> new IllegalArgumentException("no region available: " + this)); 204 } 205 size()206 public int size() { 207 return regionVariable.isPresent() ? 3 : script.isPresent() ? 2 : 1; 208 } 209 210 @Override toString()211 public String toString() { 212 return language + script.map(s -> "_" + s).orElse("") + regionVariable.map(r -> "_" + r).orElse(""); 213 } 214 } 215 216 /** 217 * Represents a {@code <languageMatch>} rule derived from supplemental data, such as: 218 * <pre>{@code 219 * <languageMatch desired="zh_Hans" supported="zh_Hant" distance="15" oneway="true"/> 220 * }</pre> 221 * or: 222 * <pre>{@code 223 * <languageMatch desired="ar_*_$maghreb" supported="ar_*_$maghreb" distance="4"/> 224 * }</pre> 225 * 226 * <p>The job of a {@code Rule} is to provide a mechanism for capturing the data in 227 * {@code <languageMatch>} elements and subsequently adding that information to a 228 * {@link DistanceTable.Builder} in a structured way. 229 */ 230 private static final class LanguageMatchRule { 231 private final LsrSpec desired; 232 private final LsrSpec supported; 233 private final int distance; 234 private final boolean oneway; 235 LanguageMatchRule(LsrSpec desired, LsrSpec supported, int distance, boolean oneway)236 public LanguageMatchRule(LsrSpec desired, LsrSpec supported, int distance, boolean oneway) { 237 this.desired = checkNotNull(desired); 238 this.supported = checkNotNull(supported); 239 this.distance = distance; 240 this.oneway = oneway; 241 // Implementation shortcuts assume: 242 // - At any level, either both or neither spec subtags are *. 243 // If there are rules that do not fit these constraints, we need to revise the implementation. 244 checkArgument(desired.size() == supported.size(), 245 "mismatched rule specifications in: %s, %s", desired, supported); 246 checkArgument(isAny(desired.language) == isAny(supported.language), 247 "wildcard mismatch for languages in: %s, %s", desired, supported); 248 checkArgument(isAny(desired.script) == isAny(supported.script), 249 "wildcard mismatch for scripts in: %s, %s", desired, supported); 250 checkArgument(isAny(desired.regionVariable) == isAny(supported.regionVariable), 251 "wildcard mismatch for languages in: %s, %s", desired, supported); 252 } 253 size()254 int size() { 255 return desired.size(); 256 } 257 isDefaultRule()258 boolean isDefaultRule() { 259 // We already know that in LsrSpec, if the language is "*" then all subtags are too. 260 return isAny(desired.language); 261 } 262 263 /** 264 * Adds this rule to the given distance table, using the given partition map to 265 * resolve any region variables present in the desired or supported specs. 266 */ addTo(DistanceTable.Builder distanceTable, PartitionInfo partitions)267 void addTo(DistanceTable.Builder distanceTable, PartitionInfo partitions) { 268 // Note that rather than using the rule's "size" to mediate the different 269 // cases, we could have had 3 distinct sub-types of a common rule API (e.g. 270 // "LanguageRule", "ScriptRule" and "RegionRule"), each with a different 271 // addTo() callback. However this would have been quite a lot more code 272 // for not much real gain. 273 switch (size()) { 274 case 1: // Language only. 275 distanceTable.addDistance(distance, oneway, 276 desired.getLanguage(), supported.getLanguage()); 277 break; 278 279 case 2: // Language and script present. 280 distanceTable.addDistance(distance, oneway, 281 desired.getLanguage(), supported.getLanguage(), 282 desired.getScript(), supported.getScript()); 283 break; 284 285 case 3: // Language, script and region variable present. 286 // Add the rule distance for every combination of desired/supported 287 // partition IDs for the region variables. This is important for 288 // variables like "$americas" which overlap with multiple paritions. 289 // 290 // Note that in this case (because region variables map to sets of 291 // partition IDs) we can get situations where "shouldReverse" is true, 292 // but the desired/supported pairs being passed in are identical (e.g. 293 // different region variables map to distinct partition groups which 294 // share some common elements). 295 // 296 // This is fine, providing that the distance table is going to ignore 297 // identical mappings (which it does). Alternatively we could just 298 // re-calculate "shouldReverse" inside this loop to account for partition 299 // IDs rather than region variables. 300 ImmutableSet<String> desiredPartitionIds = 301 partitions.getPartitionIds(desired.getRegionVariable()); 302 ImmutableSet<String> supportedPartitionIds = 303 partitions.getPartitionIds(supported.getRegionVariable()); 304 for (String desiredPartitionId : desiredPartitionIds) { 305 for (String supportedPartitionId : supportedPartitionIds) { 306 distanceTable.addDistance(distance, oneway, 307 desired.getLanguage(), supported.getLanguage(), 308 desired.getScript(), supported.getScript(), 309 desiredPartitionId, supportedPartitionId); 310 } 311 } 312 break; 313 314 default: 315 throw new IllegalStateException("invalid size for LsrSpec: " + this); 316 } 317 } 318 319 @Override toString()320 public String toString() { 321 return String.format( 322 "Rule{ desired=%s, supported=%s, distance=%d, oneway=%b }", 323 desired, supported, distance, oneway); 324 } 325 } 326 buildDistanceData(CldrData supplementalData)327 private static LocaleDistance.Data buildDistanceData(CldrData supplementalData) { 328 // Resolve any explicitly declared region variables into the partition map. 329 // Territory containment information is used to recursively resolve region 330 // variables (e.g. "$enUS") into a collection of non-macro regions. 331 PartitionInfo.Builder partitionBuilder = 332 PartitionInfo.builder(TerritoryContainment.getContainment(supplementalData)); 333 supplementalData.accept(DTD, v -> { 334 CldrPath path = v.getPath(); 335 if (VARIABLE_PATH.matches(path)) { 336 partitionBuilder.addVariableExpression(v.get(VARIABLE_ID), v.get(VARIABLE_VALUE)); 337 } 338 }); 339 340 // Parse the rules from <languageMatch> elements. Note that the <languageMatch> 341 // element is marked as "ORDERED" in the DTD, which means the elements always 342 // appear in the same order is in the CLDR XML file (even when using DTD order). 343 // 344 // This is one of the relatively rare situations in which using DTD order will 345 // not isolate the ICU data from reordering of the CLDR data. In particular this 346 // matters when specifying language matcher preferences (such as "en_*_GB" vs 347 // "en_*_!enUS"). 348 // 349 // We could almost process the rules while reading them from the source data, but 350 // rules may contain region codes rather than variables, and we need to create a 351 // variable for each such region code before the RegionMapper is built, and 352 // before processing the rules (this happens when the LsrSpec is parsed). 353 List<LanguageMatchRule> rules = new ArrayList<>(); 354 supplementalData.accept(DTD, v -> { 355 CldrPath path = v.getPath(); 356 if (LANGUAGE_MATCH_PATH.matches(path)) { 357 int distance = Integer.parseInt(v.get(MATCH_DISTANCE)); 358 // Lenient against there being no "oneway" attribute. 359 boolean oneway = "true".equalsIgnoreCase(v.get(MATCH_ONEWAY)); 360 LsrSpec desired = LsrSpec.parse(v.get(MATCH_DESIRED), partitionBuilder); 361 LsrSpec supported = LsrSpec.parse(v.get(MATCH_SUPPORTED), partitionBuilder); 362 LanguageMatchRule rule = new LanguageMatchRule(desired, supported, distance, oneway); 363 logger.fine(() -> String.format("rule: %s", rule)); 364 rules.add(rule); 365 } 366 }); 367 // Check that the rules are in the expected order. Rule order is important in ensuring 368 // data correctness and incorrect order may violate business logic assumptions later. 369 // TODO: Consider what other ordering/sanity checks make sense here. 370 for (int n = 0, prevSize = 1; n < rules.size(); n++) { 371 LanguageMatchRule rule = rules.get(n); 372 checkArgument(rule.size() >= prevSize, "<languageMatch> elements out of order at: %s", rule); 373 checkArgument(rule.size() == prevSize || (n > 0 && rules.get(n - 1).isDefaultRule()), 374 "missing default rule before: %s", rule); 375 prevSize = rule.size(); 376 } 377 checkState(rules.stream().distinct().count() == rules.size(), "duplicated rule in: %s", rules); 378 379 // Build region partition data after all the variables have been accounted for 380 // (including the implicit variables found while processing LsrSpecs). 381 PartitionInfo partitions = partitionBuilder.build(); 382 383 // Add all the rules (in order) to the distance table. 384 DistanceTable.Builder distanceTableBuilder = DistanceTable.builder(); 385 rules.forEach(r -> r.addTo(distanceTableBuilder, partitions)); 386 DistanceTable distanceTable = distanceTableBuilder.build(); 387 388 // Note: Using LocaleDistance.Data as a fairly "dumb" container for the return values 389 // requires us to do slightly awkward things, like passing mutable arrays and LSR 390 // instances around, but the advantage it has is that this data structure is also what's 391 // used in client code, so if the likely subtags data changes, it will be a forcing 392 // function to change this code. 393 return new LocaleDistance.Data( 394 distanceTable.getTrie().toByteArray(), 395 partitions.getPartitionLookupArray(), 396 partitions.getPartitionStrings(), 397 getParadigmLsrs(supplementalData), 398 distanceTable.getDefaultDistances()); 399 } 400 getParadigmLsrs(CldrData supplementalData)401 private static Set<LSR> getParadigmLsrs(CldrData supplementalData) { 402 // LinkedHashSet for stable order; otherwise a unit test is flaky. 403 CldrValue cldrValue = supplementalData.get(PARADIGM_LOCALES_PATH); 404 checkState(cldrValue != null, 405 "<paradigmLocales> element was missing: %s", PARADIGM_LOCALES_PATH); 406 String localesList = cldrValue.get(PARADIGM_LOCALES); 407 checkState(localesList != null, 408 "<paradigmLocales> 'locales' attribute was missing: %s", cldrValue); 409 410 Set<LSR> paradigmLSRs = new LinkedHashSet<>(); 411 for (String paradigm : LIST_SPLITTER.split(localesList)) { 412 LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(new ULocale(paradigm)); 413 // Clear the LSR flags to make the data equality test in LocaleDistanceTest happy. 414 paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS)); 415 } 416 checkArgument(paradigmLSRs.size() % 2 == 0, "unpaired paradigm locales: %s", paradigmLSRs); 417 return paradigmLSRs; 418 } 419 420 // Returns an RbValue serialized from a map as a sequence of alternating (key, value) 421 // pairs (formatted as one pair per line in the IcuData file). 422 // 423 // E.g. 424 // foo{ 425 // key1, value1, 426 // ... 427 // keyN, valueN, 428 // } ofMapEntries(Map<String, String> map)429 private static RbValue ofMapEntries(Map<String, String> map) { 430 return RbValue.of( 431 map.entrySet().stream() 432 .flatMap(e -> Stream.of(e.getKey(), e.getValue())) 433 .collect(Collectors.toList())) 434 .elementsPerLine(2); 435 } 436 437 // Returns an RbValue serialized from a sequence of LSR instance as a sequence of repeating 438 // (language, region, script) tuples (formatted as one tuple per line in the IcuData file). 439 // 440 // E.g. 441 // foo{ 442 // lang1, script1, region1, 443 // ... 444 // langN, scriptN, regionN, 445 // } ofLsrs(Collection<LSR> lsrs)446 private static RbValue ofLsrs(Collection<LSR> lsrs) { 447 return RbValue.of( 448 lsrs.stream() 449 .flatMap(lsr -> Stream.of(lsr.language, lsr.script, lsr.region)) 450 .collect(Collectors.toList())) 451 .elementsPerLine(3); 452 } 453 454 // Returns an RbValue serialized from a byte array, as a concatenated sequence of rows of 455 // hex values. This is intended only for RbPaths using the ":bin" suffix. 456 // 457 // E.g. 458 // foo{ 459 // 0123456789abcdef0123456789abcdef 460 // ... 461 // 1c0de4c0ffee 462 // } 463 // 464 // Note that typically no indentation is used when writting this binary "blob". ofBytes(byte[] data)465 private static RbValue ofBytes(byte[] data) { 466 ImmutableList.Builder<String> hexValues = ImmutableList.builder(); 467 List<Byte> bytes = Bytes.asList(data); 468 for (List<Byte> line : Iterables.partition(bytes, 16)) { 469 hexValues.add(line.stream().map(b -> String.format("%02x", b)).collect(Collectors.joining())); 470 } 471 return RbValue.of(hexValues.build()); 472 } 473 474 // Returns if the subtag is the '*' wildcard. This is not to be confused with the 475 // "ANY" character used in DistanceTable. isAny(String subtag)476 private static boolean isAny(String subtag) { 477 return subtag.equals("*"); 478 } 479 480 // Returns if the subtag exists and is the '*' wildcard. isAny(Optional<String> subtag)481 private static boolean isAny(Optional<String> subtag) { 482 return subtag.map(LocaleDistanceMapper::isAny).orElse(false); 483 } 484 485 // Main method for running this mapper directly with logging enabled. 486 // CLDR_DIR is picked up from system properties or envirnment variables. 487 // Arguments: <output-file> [<log-level>] main(String[] args)488 public static void main(String[] args) throws IOException { 489 DebugWriter.writeForDebugging(args, LocaleDistanceMapper::process); 490 } 491 } 492