package org.unicode.cldr.icu;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.icu.RegexManager.CldrArray;
import org.unicode.cldr.icu.RegexManager.PathValueInfo;
import org.unicode.cldr.icu.RegexManager.RegexResult;
import org.unicode.cldr.test.DisplayAndInputProcessor.NumericType;
import org.unicode.cldr.tool.FilterFactory;
import org.unicode.cldr.util.Builder;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexLookup;
import org.unicode.cldr.util.RegexLookup.Finder;
import org.unicode.cldr.util.SupplementalDataInfo;
//import org.unicode.cldr.util.SupplementalDataInfo.MeasurementType;

import com.ibm.icu.util.Output;

/**
 * A mapper that converts locale data from CLDR to the ICU data structure.
 *
 * @author jchye
 */
public class LocaleMapper extends Mapper {
    /**
     * Map for converting enums to their integer values.
     */
    private static final Map<String, String> enumMap = Builder.with(new HashMap<String, String>())
        .put("titlecase-firstword", "1")
        .put("no-change", "0")
        .freeze();

    /** Matches a draft attribute so that it can be stripped from full xpaths. */
    private static final Pattern DRAFT_PATTERN = PatternCache.get("\\[@draft=\"\\w+\"]");
    /** Matches territory display-name xpaths; group 1 is the territory code. */
    private static final Pattern TERRITORY_XPATH = PatternCache.get(
        "//ldml/localeDisplayNames/territories/territory\\[@type=\"(\\w+)\"]");
    /** Matches the DateTimePatterns resource bundle path; group 1 is the calendar type. */
    private static final Pattern RB_DATETIMEPATTERN = PatternCache.get(
        "/calendar/(\\w++)/DateTimePatterns");

    private final SupplementalDataInfo supplementalDataInfo;
    // We may use different factories for resolved or unresolved CLDRFiles depending
    // on whether filtering is required.
    private final Factory unresolvedFactory;
    private final Factory resolvedFactory;
    private final Factory specialFactory;
    private final RegexManager manager;
    private String debugXPath;

    // Lazily initialised by getDeprecatedTerritories().
    private Set<String> deprecatedTerritories;

    /**
     * Special hack comparator, so that RB strings come out in the right order.
     * This is only important for the order of items in arrays.
     */
    private static final Comparator<String> comparator = new Comparator<String>() {
        private final Pattern CURRENCY_FORMAT = PatternCache.get(
            "//ldml/numbers/currencies/currency\\[@type=\"\\w++\"]/(.++)");
        private final Pattern DATE_OR_TIME_FORMAT = PatternCache.get(
            "//ldml/dates/calendars/calendar\\[@type=\"\\w++\"]/(date|time)Formats/.*");
        private final Pattern MONTH_PATTERN = PatternCache
            .get(
                "//ldml/dates/calendars/calendar\\[@type=\"\\w++\"]/months/monthContext\\[@type=\"[\\w\\-]++\"]/monthWidth\\[@type=\"\\w++\"]/month\\[@type=\"\\d++\"](\\[@yeartype=\"leap\"])?");
        private final Pattern CONTEXT_TRANSFORM = PatternCache.get(
            "//ldml/contextTransforms/contextTransformUsage\\[@type=\"([^\"]++)\"]/contextTransform\\[@type=\"([^\"]++)\"]");

        private final String[] CURRENCY_ORDER = { "symbol", "displayName",
            "pattern[@type=\"standard\"]", "decimal", "group" };

        /**
         * Reverse the ordering of the following:
         * //ldml/numbers/currencies/currency[@type="([^"]*)"]/displayName ; curr ; /Currencies/$1
         * //ldml/numbers/currencies/currency[@type="([^"]*)"]/symbol ; curr ; /Currencies/$1
         * and the following (time/date)
         * //ldml/dates/calendars/calendar[@type="([^"]*)"]/(dateFormats|dateTimeFormats|timeFormats)/(?:[^/\[]*)[@type=
         * "([^"]*)"]/(?:[^/\[]*)[@type="([^"]*)"]/.* ; locales ; /calendar/$1/DateTimePatterns
         *
         * Any pair of paths not decided by one of the special cases is ordered
         * by the standard ldml comparator.
         */
        @Override
        public int compare(String arg0, String arg1) {
            Matcher[] matchers = new Matcher[2];
            if (RegexManager.matches(CURRENCY_FORMAT, arg0, arg1, matchers)) {
                // Use ldml ordering except that symbol should be first.
                int index0 = getIndexOf(CURRENCY_ORDER, matchers[0].group(1));
                int index1 = getIndexOf(CURRENCY_ORDER, matchers[1].group(1));
                return Integer.compare(index0, index1);
            } else if (RegexManager.matches(DATE_OR_TIME_FORMAT, arg0, arg1, matchers)) {
                // Reverse the natural order of "date" and "time".
                int compareValue = matchers[0].group(1).compareTo(matchers[1].group(1));
                if (compareValue != 0) return -compareValue;
            } else if (RegexManager.matches(CONTEXT_TRANSFORM, arg0, arg1, matchers)) {
                // Sort uiListOrMenu before stand-alone.
                if (matchers[0].group(1).equals(matchers[1].group(1))) {
                    return -matchers[0].group(2).compareTo(matchers[1].group(2));
                }
            } else if (RegexManager.matches(MONTH_PATTERN, arg0, arg1, matchers)) {
                // Sort leap year types after normal month types.
                // The optional group is null when the yeartype attribute is absent.
                // Compare presence rather than String references: two leap paths
                // yield distinct String objects, and treating them as different
                // made compare(a, b) and compare(b, a) both return 1.
                boolean isLeap0 = matchers[0].group(1) != null;
                boolean isLeap1 = matchers[1].group(1) != null;
                if (isLeap0 != isLeap1) {
                    return isLeap0 ? 1 : -1;
                }
            }

            return CLDRFile.getComparator(DtdType.ldml).compare(arg0, arg1);
        }
    };

    /**
     * Looks for a string in an array
     *
     * @param order
     *            the array to be searched
     * @param key
     *            the string to be searched for
     * @return the index of the string if found, -1 if not found
     */
    private static int getIndexOf(String[] order, String key) {
        for (int i = 0; i < order.length; i++) {
            if (order[i].equals(key)) return i;
        }
        return -1;
    }

    /**
     * LocaleMapper constructor.
     *
     * @param factory
     *            the factory containing the CLDR data to be converted
     * @param specialFactory
     *            a factory containing any additional CLDR data
     * @param supplementalDataInfo
     *            SupplementalDataInfo object
     * @param useAltValues
     *            true if alt path filtering should be performed
     * @param organization
     *            the organization to filter the data by
     *            (null if coverage filtering is not needed)
     */
    public LocaleMapper(Factory factory, Factory specialFactory,
        SupplementalDataInfo supplementalDataInfo, boolean useAltValues,
        String organization) {
        manager = new RegexManager("ldml2icu_locale.txt");
        // If filtering is required, filter all unresolved CLDRFiles for use in
        // fillFromCldr(). We don't filter the resolved CLDRFiles by organization
        // coverage level because
        // some rbPaths (e.g. /calendar/x/DateTimePatterns) have a fixed number
        // of values that must always be present regardless of filtering.
        if (useAltValues || organization != null) {
            unresolvedFactory = FilterFactory.load(factory, organization, useAltValues);
            resolvedFactory = FilterFactory.load(factory, null, useAltValues);
        } else {
            unresolvedFactory = factory;
            resolvedFactory = factory;
        }
        this.specialFactory = specialFactory;
        this.supplementalDataInfo = supplementalDataInfo;
    }

    /**
     * @return the set of locales available for processing by this mapper
     */
    @Override
    public Set<String> getAvailable() {
        return unresolvedFactory.getAvailable();
    }

    /**
     * @param filename
     * @return true if a special XML file with the specified filename is available.
     */
    private boolean hasSpecialFile(String filename) {
        return specialFactory != null && specialFactory.getAvailable().contains(filename);
    }

    /**
     * @return the set of deprecated territories to be ignored. Remove when no longer
     *         present in CLDR data.
     */
    private Set<String> getDeprecatedTerritories() {
        if (deprecatedTerritories == null) {
            // NOTE(review): the builder is handed the alias map's own key set, so the
            // remove() calls may write through to the supplemental data - confirm
            // against Builder.with() and getLocaleAliasInfo().
            deprecatedTerritories = Builder.with(
                supplementalDataInfo.getLocaleAliasInfo().get("territory").keySet())
                .remove("062").remove("172").remove("200").remove("830")
                .remove("AN").remove("CS").remove("QU").get();
        }
        return deprecatedTerritories;
    }

    /**
     * Fills an IcuData object using the CLDR data for the specified locale.
     *
     * @param locale
     * @return the filled IcuData object
     */
    @Override
    public IcuData[] fillFromCldr(String locale) {
        Set<String> ignoredTerritories = getDeprecatedTerritories();
        CLDRFile resolvedCldr = resolvedFactory.make(locale, true);
        RegexLookup<RegexResult> pathConverter = manager.getPathConverter(resolvedCldr);

        // First pass through the unresolved CLDRFile to get all icu paths.
        CLDRFile cldr = unresolvedFactory.make(locale, false);
        Map<String, CldrArray> pathValueMap = new HashMap<String, CldrArray>();
        Set<String> validRbPaths = new HashSet<String>();
        for (String xpath : cldr) {
            // Territory hacks to be removed once CLDR data is fixed.
            Matcher matcher = TERRITORY_XPATH.matcher(xpath);
            if (matcher.matches()) {
                String country = matcher.group(1);
                if (ignoredTerritories.contains(country)) {
                    continue;
                }
            }

            // Add rb paths.
            Output<Finder> matcherFound = new Output<Finder>();
            Output<String[]> firstInfo = new Output<>();
            RegexResult regexResult = matchXPath(pathConverter, cldr, xpath, matcherFound, firstInfo);
            if (regexResult == null) continue;
            // String[] arguments = matcherFound.value.getInfo();
            String[] arguments = firstInfo.value;
            for (PathValueInfo info : regexResult) {
                String rbPath = info.processRbPath(arguments);
                validRbPaths.add(rbPath);
                // The immediate parent of every path should also exist.
                validRbPaths.add(rbPath.substring(0, rbPath.lastIndexOf('/')));
            }
        }

        // Get all values from the resolved CLDRFile.
        for (String xpath : resolvedCldr) {
            // Since the unresolved CLDRFile may have been modified, use it
            // to add values instead of the resolved CLDRFile if possible.
            CLDRFile fileToUse = cldr.getStringValue(xpath) == null ? resolvedCldr : cldr;
            addMatchesForPath(xpath, fileToUse, validRbPaths, pathConverter, pathValueMap);
        }

        // Add fallback paths if necessary.
        manager.addFallbackValues(resolvedCldr, pathValueMap);

        // Add special values to file.
        boolean hasSpecial = hasSpecialFile(locale);
        if (hasSpecial) {
            CLDRFile specialCldrFile = specialFactory.make(locale, false);
            for (String xpath : specialCldrFile) {
                if (resolvedCldr.isHere(xpath)) continue;
                addMatchesForPath(xpath, specialCldrFile, null, pathConverter, pathValueMap);
            }
        }

        for (String rbPath : pathValueMap.keySet()) {
            // HACK: DateTimePatterns needs a duplicate of the medium
            // dateTimeFormat (formerly indicated using dateTimeFormats/default).
            // This hack can be removed when ICU no longer requires it.
            Matcher matcher = RB_DATETIMEPATTERN.matcher(rbPath);
            if (matcher.matches()) {
                String calendar = matcher.group(1);
                CldrArray valueList = RegexManager.getCldrArray(rbPath, pathValueMap);
                // Create a dummy xpath to sort the value in front of the other date time formats.
                String basePath = "//ldml/dates/calendars/calendar[@type=\"" + calendar + "\"]/dateTimeFormats";
                String mediumFormatPath = basePath
                    + "/dateTimeFormatLength[@type=\"medium\"]/dateTimeFormat[@type=\"standard\"]/pattern[@type=\"standard\"]";
                valueList.add(basePath,
                    getStringValue(resolvedCldr, mediumFormatPath),
                    null);
            }
        }

        // HACK: Fill missing narrow era values with their abbreviated versions.
        CldrArray narrowEras = pathValueMap.get("/calendar/japanese/eras/narrow");
        CldrArray abbreviatedEras = pathValueMap.get("/calendar/japanese/eras/abbreviated");
        if (narrowEras != null && abbreviatedEras != null) {
            narrowEras.addAll(abbreviatedEras);
        }

        IcuData icuData = new IcuData("common/main/" + locale + ".xml", locale, true, enumMap);
        if (hasSpecial) {
            icuData.setFileComment("ICU <specials> source: <path>/common/main/" + locale + ".xml");
        }
        fillIcuData(pathValueMap, comparator, icuData);

        // More hacks
        hackAddExtras(resolvedCldr, locale, icuData);
        return new IcuData[] { icuData };
    }

    /**
     * Copies every collected value list into the IcuData object, sorting each
     * list with the given comparator first.
     *
     * @param pathValueMap
     *            the map of rbPaths to their collected values
     * @param comparator
     *            the comparator used to order the values of each rbPath
     * @param icuData
     *            the IcuData object to be filled
     */
    private void fillIcuData(Map<String, CldrArray> pathValueMap,
        Comparator<String> comparator, IcuData icuData) {
        // Convert values to final data structure.
        for (Map.Entry<String, CldrArray> entry : pathValueMap.entrySet()) {
            icuData.addAll(entry.getKey(), entry.getValue().sortValues(comparator));
        }
    }

    /**
     * @param xpath
     *            the distinguishing xpath to look up
     * @param cldrFile
     *            the CLDR file to get the full xpath from
     * @return the full xpath with any draft attributes removed, or the
     *         xpath itself if the file has no full xpath for it
     */
    public static String getFullXPath(String xpath, CLDRFile cldrFile) {
        String fullPath = cldrFile.getFullXPath(xpath);
        return fullPath == null ? xpath : DRAFT_PATTERN.matcher(fullPath).replaceAll("");
    }

    /**
     * @param lookup
     *            the lookup used to convert the xpath
     * @param cldr
     *            the CLDR file that the path belongs to
     * @param path
     *            the xpath to be converted
     * @param matcherFound
     *            output parameter passed through to the lookup
     * @param firstInfo
     *            if not null, receives the match arguments whenever the
     *            lookup produced any
     * @return the result of converting an xpath into an ICU-style path
     */
    private RegexResult matchXPath(RegexLookup<RegexResult> lookup,
        CLDRFile cldr, String path,
        Output<Finder> matcherFound, Output<String[]> firstInfo) {
        String fullPath = getFullXPath(path, cldr);
        List<String> debugResults = isDebugXPath(fullPath) ? new ArrayList<String>() : null;
        Output<String[]> info = new Output<>();
        RegexResult result = lookup.get(fullPath, null, info, matcherFound, debugResults);
        if (debugResults != null) {
            if (result == null) {
                RegexManager.printLookupResults(fullPath, debugResults);
            } else {
                System.out.println(fullPath + " successfully matched");
            }
        }
        if (firstInfo != null && info.value != null) {
            firstInfo.value = info.value;
        }
        return result;
    }

    /**
     * Attempts to match an xpath and adds the results of a successful match to
     * the specified map
     *
     * @param xpath
     *            the xpath to be matched
     * @param cldrFile
     *            the CLDR file to get locale data from
     * @param validRbPaths
     *            the set of valid rbPaths that the result must belong
     *            to, null if such a requirement does not exist
     * @param pathConverter
     *            the lookup used to convert the xpath
     * @param pathValueMap
     *            the map that the results will be added to
     */
    private void addMatchesForPath(String xpath, CLDRFile cldrFile,
        Set<String> validRbPaths, RegexLookup<RegexResult> pathConverter,
        Map<String, CldrArray> pathValueMap) {
        Output<Finder> matcher = new Output<Finder>();
        Output<String[]> firstInfo = new Output<>();
        RegexResult regexResult = matchXPath(pathConverter,
            cldrFile, xpath, matcher, firstInfo);
        if (regexResult == null) return;
        // String[] arguments = matcher.value.getInfo();
        String[] arguments = firstInfo.value;
        String cldrValue = getStringValue(cldrFile, xpath);
        for (PathValueInfo info : regexResult) {
            String rbPath = info.processRbPath(arguments);
            // Don't add additional paths at this stage.
            if (validRbPaths != null && !validRbPaths.contains(rbPath)) continue;
            CldrArray valueList = RegexManager.getCldrArray(rbPath, pathValueMap);
            List<String> values = info.processValues(arguments, cldrValue);
            String baseXPath = info.processXPath(arguments, xpath);
            String groupKey = info.processGroupKey(arguments);
            valueList.put(baseXPath, values, groupKey);
        }
    }

    /**
     * @param cldrFile
     * @param xpath
     * @return the value of the specified xpath (fallback or otherwise), or
     *         null if the file returns no value for it
     */
    private String getStringValue(CLDRFile cldrFile, String xpath) {
        String value = cldrFile.getStringValue(xpath);
        // HACK: DAIP doesn't currently make spaces in currency formats non-breaking.
        // Remove this when fixed.
        // The null check avoids a NullPointerException for a currency path without a value.
        if (value != null && NumericType.getNumericType(xpath) == NumericType.CURRENCY) {
            value = value.replace(' ', '\u00A0');
        }
        return value;
    }

    /**
     * Adds all mappings that couldn't be represented in the ldml2icu_locale.txt file.
     *
     * @param cldrResolved
     * @param locale
     * @param icuData
     *            the IcuData object that the extra values are added to
     */
    private void hackAddExtras(CLDRFile cldrResolved, String locale, IcuData icuData) {
        // Specify parent of non-language locales.
        String parent = supplementalDataInfo.getExplicitParentLocale(locale);
        if (parent != null) {
            icuData.add("/%%Parent", parent);
        }

        // <version number="$Revision: 5806 $"/>
        String version = cldrResolved.getFullXPath("//ldml/identity/version");
        icuData.add("/Version", MapperUtils.formatVersion(version));

        // PaperSize:intvector{ 279, 216, } - now in supplemental
        // MeasurementSystem:int{1} - now in supplemental

        // Default calendar.
        String localeID = cldrResolved.getLocaleID();
        String calendar = getCalendarIfDifferent(localeID);
        if (calendar != null) {
            icuData.add("/calendar/default", calendar);
        }
    }

    /**
     * Returns the default calendar to be used for a locale. If the default
     * calendar for the parent locale is the same, null is returned.
     */
    private String getCalendarIfDifferent(String localeID) {
        String calendar = getCalendar(localeID);
        if (calendar == null) return null;
        // Walk up the parent chain until an ancestor with a calendar is found.
        String parent = LocaleIDParser.getParent(localeID);
        String parentCalendar = null;
        while (parentCalendar == null && parent != null) {
            parentCalendar = getCalendar(parent);
            parent = LocaleIDParser.getParent(parent);
        }
        return calendar.equals(parentCalendar) ? null : calendar;
    }

    /**
     * Returns the default calendar to be used for a locale, if any.
     *
     * @throws RuntimeException
     *             if the locale has no region and no likely subtags are
     *             found for its language
     */
    private String getCalendar(String localeID) {
        LanguageTagParser parser = new LanguageTagParser().set(localeID);
        String region = localeID.equals("root") ? "001" : parser.getRegion();
        if (region.equals("")) {
            localeID = supplementalDataInfo.getLikelySubtags().get(parser.getLanguage());
            if (localeID == null) {
                throw new RuntimeException("Likely subtag not found for " + parser.getLanguage());
            }
            parser.set(localeID);
            region = parser.getRegion();
            // NOTE(review): the check above treats a missing region as "", but this
            // one tests for null - confirm which value getRegion() actually returns.
            if (region == null) region = "001";
        }
        List<String> calendars = supplementalDataInfo.getCalendars(region);
        // An empty list is treated like a missing one instead of throwing.
        return calendars == null || calendars.isEmpty() ? null : calendars.get(0);
    }

    //private String getMeasurementToDisplay(String localeID, MeasurementType measurementType) {...} // deleted

    /**
     * @param localeID
     * @param measurementType
     *            the type of measurement required
     * @return the measurement of the specified locale
     */
    // private String getMeasurement(String localeID, MeasurementType measurementType) {
    // String region = localeID.equals("root") ? "001" : new LanguageTagParser().set(localeID).getRegion();
    // Map<MeasurementType, Map<String, String>> regionMeasurementData = supplementalDataInfo
    // .getTerritoryMeasurementData();
    // Map<String, String> typeMap = regionMeasurementData.get(measurementType);
    // return typeMap.get(region);
    // } //not used

    /**
     * Sets xpath to monitor for debugging purposes.
     * @param debugXPath
     */
    public void setDebugXPath(String debugXPath) {
        this.debugXPath = debugXPath;
    }

    /**
     * @param xpath
     * @return true if the xpath is to be debugged, i.e. a debug xpath has
     *         been set and the xpath starts with it
     */
    boolean isDebugXPath(String xpath) {
        return debugXPath != null && xpath.startsWith(debugXPath);
    }

    /**
     * Builds the GENRB makefile from the given aliases and this mapper's sources.
     *
     * @param aliases
     *            the aliases to be added to the makefile as synthetic aliases
     * @return the generated makefile
     */
    @Override
    public Makefile generateMakefile(Collection<String> aliases) {
        Makefile makefile = new Makefile("GENRB");
        makefile.addSyntheticAlias(aliases);
        makefile.addAliasSource();
        makefile.addSource(sources);
        return makefile;
    }
}