package org.unicode.cldr.tool;

import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.ArrayComparator;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.Level;
import org.unicode.cldr.util.Organization;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.XPathParts;

/**
 * Command-line tool that assigns a priority string (for example "G2") to language, script,
 * region, currency and time-zone codes, and writes a report named {@code G2xG2.txt} into {@link
 * CLDRPaths#GEN_DIRECTORY}.
 *
 * <p>The report lists every prioritized code, then, for each source locale, every target item
 * whose name in that locale is identical to the name found in root (counted as "missing"), and
 * finally the missing/total counts for each (source priority, item priority) pair.
 *
 * <p>NOTE(review): {@link #showLocales(int)} currently always returns {@code true}, so {@link
 * #main(String[])} returns before generating the report. See the note inside that method.
 */
public class GenerateG2xG2 {
    /** English data; assigned by {@code main} and {@code showLocales}, read when printing names. */
    static CLDRFile english;

    /** Root data; assigned by {@code main}, used by {@code checkItems} as the comparison baseline. */
    static CLDRFile root;

    /**
     * Entry point. Collects the source locales and target codes from the locale types that {@link
     * StandardCodes#getLocaleTypes()} lists for {@link Organization#ibm}, assigns priorities, and
     * writes the report.
     *
     * @param args ignored
     * @throws Exception if loading data or opening the output file fails
     */
    public static void main(String[] args) throws Exception {
        if (showLocales(-1)) {
            return;
        }
        // showCollator();

        // A locale/code is included when its priority string sorts at or before these thresholds.
        String sourceLanguage = "G5";
        String targetLanguage = "G5";
        Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
        english = cldrFactory.make("en", true);
        root = cldrFactory.make("root", true);
        StandardCodes sc = StandardCodes.make();
        Map<Organization, Map<String, Level>> type_code_value = sc.getLocaleTypes();
        Set<String> sourceSet = new TreeSet<>();
        Set<String> targetLanguageSet = new TreeSet<>();
        targetLanguageSet.add("no");
        addPriority("G2", "nn");
        addPriority("G2", "no");
        targetLanguageSet.add("nn");
        Set<String> targetScriptSet = new TreeSet<>();
        Set<String> targetRegionSet = new TreeSet<>();
        Set<String> targetTZSet = new TreeSet<>();
        Set<String> targetCurrencySet = new TreeSet<>();

        for (Map.Entry<Organization, Map<String, Level>> orgEntry : type_code_value.entrySet()) {
            if (!orgEntry.getKey().equals(Organization.ibm)) {
                continue;
            }
            for (Map.Entry<String, Level> localeEntry : orgEntry.getValue().entrySet()) {
                String locale = localeEntry.getKey();
                if (locale.equals("no")) {
                    continue;
                }
                String priority = localeEntry.getValue().toString();
                ULocale ulocale = new ULocale(locale);
                String language = ulocale.getLanguage();
                String script = ulocale.getScript();
                String territory = ulocale.getCountry();
                if (sourceLanguage.compareTo(priority) >= 0) {
                    // "no" is replaced by "nn"; the replacement also applies to the target
                    // language recorded below.
                    if (language.equals("no")) {
                        language = "nn";
                    }
                    String sourceLocale = new ULocale(language, script).toString();
                    sourceSet.add(sourceLocale);
                    addPriority(priority, sourceLocale);
                }
                if (targetLanguage.compareTo(priority) >= 0) {
                    targetLanguageSet.add(language);
                    targetScriptSet.add(script);
                    targetRegionSet.add(territory);
                    addPriority(priority, language);
                    addPriority(priority, script);
                    addPriority("G4", territory); // will normally be overridden
                }
            }
        }

        // set the priorities for territories
        Map<String, List<String>> worldBankInfo = sc.getWorldBankInfo();
        Set<String> euCodes =
                new HashSet<>(
                        Arrays.asList(
                                "AT", "BE", "CY", "CZ", "DK", "EE", "FI", "FR", "DE", "GR",
                                "HU", "IT", "LV", "LT", "LU", "MT", "NL", "PL", "PT", "SI",
                                "ES", "SE", "GB"));
        for (Map.Entry<String, List<String>> entry : worldBankInfo.entrySet()) {
            String countryCode = entry.getKey();
            if (priorityMap.get(countryCode) == null) {
                continue; // only use ones we already have: defaults G4
            }
            // The value at index 1 is parsed as a number and compared against the thresholds
            // below (named "gdp" in the original code).
            double gdp = Double.parseDouble(entry.getValue().get(1));
            if (gdp >= 1E+13) {
                addPriority("G0", countryCode);
            } else if (gdp >= 1E+12) {
                addPriority("G1", countryCode);
            } else if (gdp >= 1E+11) {
                addPriority("G2", countryCode);
            } else if (euCodes.contains(countryCode)) {
                addPriority("G3", countryCode);
            }
            // else if (gdp >= 1E+10) addPriority("G4", countryCode);
        }

        // fill in the currencies, and TZs for the countries that have multiple zones
        Map<String, Set<String>> c2z = sc.getCountryToZoneSet();
        SupplementalDataInfo supplementalDataInfo = SupplementalDataInfo.getInstance();
        Set<String> mainTimeZones = supplementalDataInfo.getCanonicalTimeZones();
        for (String country : targetRegionSet) {
            // Locales without a region put "" into targetRegionSet; it has no priority (addPriority
            // ignores empty codes) and no currency/zone data, so skip it instead of failing.
            if (country.isEmpty()) {
                continue;
            }
            String priority = priorityMap.get(country);
            List<String> currencies = getCurrency(country);
            if (currencies != null) { // getCurrency returns null for regions without entries
                for (String currency : currencies) {
                    targetCurrencySet.add(currency);
                    addPriority(priority, currency);
                }
            }
            Set<String> zones = c2z.get(country);
            if (zones == null || zones.size() == 1) {
                continue;
            }
            for (String tzid : zones) {
                if (!mainTimeZones.contains(tzid)) {
                    continue;
                }
                targetTZSet.add(tzid);
                addPriority(priority, tzid);
            }
        }

        // try-with-resources so the report file is closed even if a later step throws.
        try (PrintWriter pw =
                FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, "G2xG2.txt")) {
            // show priorities
            writePriorities(pw);

            // print out missing translations.
            for (String sourceLocale : sourceSet) {
                System.out.print(sourceLocale + ", ");
                CLDRFile sourceData = cldrFactory.make(sourceLocale, true);
                pw.println();
                String title = sourceLocale;
                checkItems(pw, title, sourceData, CLDRFile.LANGUAGE_NAME, targetLanguageSet);
                checkItems(pw, title, sourceData, CLDRFile.SCRIPT_NAME, targetScriptSet);
                checkItems(pw, title, sourceData, CLDRFile.TERRITORY_NAME, targetRegionSet);
                checkItems(pw, title, sourceData, CLDRFile.CURRENCY_NAME, targetCurrencySet);
                // only check timezones if exemplar characters don't include a-z
                String v = sourceData.getStringValue("//ldml/characters/exemplarCharacters");
                UnicodeSet exemplars = new UnicodeSet(v);
                if (exemplars.contains('a', 'z')) {
                    continue;
                }
                checkItems(pw, title, sourceData, CLDRFile.TZ_EXEMPLAR, targetTZSet);
            }

            writeSizes(pw);
        }
        System.out.println();
        System.out.println("Done");
    }

    /**
     * Writes one line per entry of {@link #priorityMap}, sorted by priority, then type, then code,
     * with a blank line before each new priority value. Flushes the writer when done.
     *
     * @param pw destination for the listing
     */
    private static void writePriorities(PrintWriter pw) {
        Comparator<String> comp = new UTF16.StringComparator();
        @SuppressWarnings("unchecked")
        Set<String[]> prioritySet =
                new TreeSet<String[]>(new ArrayComparator(new Comparator[] {comp, comp, comp}));
        for (Map.Entry<String, String> entry : priorityMap.entrySet()) {
            String priority = entry.getValue();
            if (priority == null) {
                continue;
            }
            String code = entry.getKey();
            int type = getType(code);
            // if (type != CLDRFile.TERRITORY_NAME) continue;
            prioritySet.add(new String[] {priority, type + "", code});
        }
        String lastPriority = "";
        for (String[] items : prioritySet) {
            if (!lastPriority.equals(items[0])) {
                lastPriority = items[0];
                pw.println();
            }
            String typeName = getTypeName(items[2]);
            pw.println(
                    lastPriority
                            + "\t"
                            + typeName
                            + "\t"
                            + items[2]
                            + "\t("
                            + getItemName(english, items[2])
                            + ")");
        }
        pw.flush();
    }

    /**
     * Writes the "Sizes - incremental" section: for every key of {@link #totalMap}, the missing
     * count, the total count and their ratio as a percentage. The figures are per key, not
     * accumulated across keys.
     *
     * @param pw destination for the section
     */
    private static void writeSizes(PrintWriter pw) {
        pw.println();
        pw.println("Sizes - incremental");
        pw.println();
        NumberFormat percent = NumberFormat.getPercentInstance();
        percent.setMinimumFractionDigits(1);
        for (Map.Entry<String, Totals> entry : totalMap.entrySet()) {
            // The key is the source priority followed by the item priority (see checkItems);
            // the split at index 2 assumes a two-character source priority such as "G2".
            String key = entry.getKey();
            Totals t = entry.getValue();
            pw.println(
                    key.substring(0, 2)
                            + "\t"
                            + key.substring(2)
                            + "\t"
                            + t.missingCount
                            + "\t"
                            + t.totalCount
                            + "\t"
                            + percent.format(t.missingCount / (double) t.totalCount));
        }
    }

    /**
     * Diagnostic helper that prints lists of locales or codes to standard output.
     *
     * <p>NOTE(review): the first block below prints the English name of {@code haw-<territory>}
     * for every territory code and then returns unconditionally, so none of the {@code choice}
     * branches run and {@code main} always stops after calling this method. This looks like a
     * debugging leftover; confirm before removing it, since doing so changes what the tool does.
     *
     * @param choice {@code -1} prints currency codes with their English names; {@code 0} prints
     *     the locales reported by {@link BreakIterator#getAvailableULocales()}; any other value
     *     prints up to 200 randomly generated language tags ({@code 2} additionally allows script
     *     subtags)
     * @return always {@code true}
     * @throws Exception if loading data fails
     */
    private static boolean showLocales(int choice) throws Exception {
        ULocale desiredDisplayLocale = ULocale.ENGLISH;
        Set<String> testSet = new TreeSet<>();
        StandardCodes sc = StandardCodes.make();
        {
            Set<String> countries = sc.getGoodAvailableCodes("territory");
            Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
            english = cldrFactory.make("en", true);
            for (String territory : countries) {
                // Skip codes whose first character sorts before 'A' (e.g. numeric codes).
                if (territory.charAt(0) < 'A') {
                    continue;
                }
                String locale = "haw-" + territory;
                System.out.print(locale + ": " + english.getName(locale) + ", ");
            }
            // Written as if (true) so the compiler still accepts the code that follows.
            if (true) {
                return true;
            }
        }

        if (choice == -1) {
            testSet.addAll(sc.getGoodAvailableCodes("currency"));
            Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
            english = cldrFactory.make("en", false);
            for (String currency : testSet) {
                System.out.println(
                        currency + "\t" + english.getName(CLDRFile.CURRENCY_NAME, currency));
            }
            return true;
        } else if (choice == 0) { // get available
            ULocale[] list = BreakIterator.getAvailableULocales();
            for (ULocale available : list) {
                testSet.add(available.toString());
            }
        } else {
            boolean use3066bis = choice == 2;
            // produce random list of RFC3066 language tags
            Set<String> legacy = sc.getAvailableCodes("legacy");
            List<String> language_subtags = new ArrayList<>(sc.getGoodAvailableCodes("language"));
            List<String> script_subtags = new ArrayList<>(sc.getGoodAvailableCodes("script"));
            List<String> region_subtags = new ArrayList<>(sc.getGoodAvailableCodes("territory"));
            for (String possibility : legacy) {
                System.out.println(possibility);
                if (new ULocale(possibility).getScript().length() != 0) {
                    System.out.println("\tAdding");
                    testSet.add(possibility);
                }
            }
            if (!use3066bis) {
                // Drop region codes that sort before "A" (e.g. numeric codes).
                for (Iterator<String> it = region_subtags.iterator(); it.hasNext(); ) {
                    String possibility = it.next();
                    if (possibility.compareTo("A") < 0) {
                        it.remove();
                    }
                }
            }
            Random rand = new Random();
            for (int i = 0; i < 200; ++i) {
                String result = language_subtags.get(rand.nextInt(language_subtags.size()));
                if (use3066bis && rand.nextDouble() > 0.5) {
                    result += "-" + script_subtags.get(rand.nextInt(script_subtags.size()));
                }
                if (rand.nextDouble() > 0.1) {
                    result += "-" + region_subtags.get(rand.nextInt(region_subtags.size()));
                }
                testSet.add(result);
            }
        }
        for (String tag : testSet) {
            ULocale language = new ULocale(tag);
            System.out.println(language + " \t" + language.getDisplayName(desiredDisplayLocale));
        }
        return true;
    }

    /**
     * Diagnostic helper (its only call in {@code main} is commented out): prints a sorted sample
     * using the "zh" collator, then again using a collator built from the same rules with extra
     * tailoring appended.
     *
     * @throws Exception if the tailored rules cannot be parsed
     */
    private static void showCollator() throws Exception {
        RuleBasedCollator col = (RuleBasedCollator) Collator.getInstance(new ULocale("zh"));
        showExample(col);
        String rules = col.getRules(false);
        // System.out.println(com.ibm.icu.impl.Utility.escape(rules));
        rules += "& \u93CA < A <<< a & \u7C3F < B <<< b";
        RuleBasedCollator col2 = new RuleBasedCollator(rules);
        showExample(col2);
    }

    /**
     * Sorts a fixed set of sample strings with the given collator and prints the escaped result.
     *
     * @param col collator used as the ordering of the sample set
     */
    private static void showExample(RuleBasedCollator col) {
        String samples = "a A b B \u5416 \u93CA \u516b \u7C3F";
        Set<String> s = new TreeSet<>(col);
        s.addAll(Arrays.asList(samples.split(" ")));
        System.out.println(com.ibm.icu.impl.Utility.escape(s.toString()));
    }

    /** Maps a code (locale, language, script, region, currency or zone id) to its priority. */
    static Map<String, String> priorityMap = new TreeMap<>();

    /**
     * Records {@code priority} for {@code code} unless the code already has a priority that sorts
     * at or before it, and prints the code and the requested priority to standard output. Empty
     * codes are ignored without printing.
     *
     * @param priority priority string; lower-sorting strings win
     * @param code the code to prioritize
     */
    static void addPriority(String priority, String code) {
        if (code.length() == 0) {
            return;
        }
        String oldPriority = priorityMap.get(code);
        if (oldPriority == null || priority.compareTo(oldPriority) < 0) {
            priorityMap.put(code, priority);
        }
        // Printed on every call, including when the stored priority was not changed.
        System.out.println(code + ": " + priority);
    }

    /** Counters for one (source priority, item priority) bucket. */
    static class Totals {
        int totalCount;
        int missingCount;
    }

    /** Buckets keyed by source-locale priority concatenated with item priority. */
    static Map<String, Totals> totalMap = new TreeMap<>();

    /**
     * Checks every non-empty item of {@code targetItemSet} against {@code sourceData}. An item
     * counts as missing when its name in {@code sourceData} equals its name in {@link #root}; each
     * missing item is written to {@code pw}, and {@link #totalMap} is updated for every item.
     *
     * @param pw destination for the missing-item lines
     * @param sourceLocale locale id of {@code sourceData}; also used to look up its priority
     * @param sourceData data of the locale being checked
     * @param type one of the {@link CLDRFile} name-type constants used in this class
     * @param targetItemSet codes to check
     */
    static void checkItems(
            PrintWriter pw,
            String sourceLocale,
            CLDRFile sourceData,
            int type,
            Set<String> targetItemSet) {
        for (String item : targetItemSet) {
            if (item.length() == 0) {
                continue;
            }
            String key = priorityMap.get(sourceLocale) + "" + priorityMap.get(item);
            Totals t = totalMap.get(key);
            if (t == null) {
                t = new Totals();
                totalMap.put(key, t);
            }
            t.totalCount++;
            String translation = getItemName(sourceData, type, item);
            String rootName = getItemName(root, type, item);
            if (rootName.equals(translation)) {
                t.missingCount++;
                pw.println(
                        priorityMap.get(sourceLocale)
                                + "\t"
                                + sourceLocale
                                + "\t("
                                + english.getName(sourceLocale)
                                + ": "
                                + sourceData.getName(sourceLocale)
                                + ")"
                                + "\t"
                                + priorityMap.get(item)
                                + "\t"
                                + item
                                + "\t("
                                + getItemName(english, type, item)
                                + ")");
            }
        }
    }

    /**
     * Returns the name of {@code item} in {@code data}, inferring the item's type from its shape
     * via {@link #getType(String)}.
     */
    private static String getItemName(CLDRFile data, String item) {
        return getItemName(data, getType(item), item);
    }

    /**
     * Guesses the type of a code from its shape: contains '/' is a zone; length 4 is a script;
     * first character at or before '9' is a region; first character before 'a' is a currency when
     * the length is 3 and a region otherwise; anything else is a language.
     *
     * @param item a non-empty code
     * @return the matching {@link CLDRFile} name-type constant
     */
    private static int getType(String item) {
        int type = CLDRFile.LANGUAGE_NAME;
        if (item.indexOf('/') >= 0) {
            type = CLDRFile.TZ_EXEMPLAR; // America/Los_Angeles
        } else if (item.length() == 4) {
            type = CLDRFile.SCRIPT_NAME; // Hant
        } else if (item.charAt(0) <= '9') {
            type = CLDRFile.TERRITORY_NAME; // 001
        } else if (item.charAt(0) < 'a') {
            if (item.length() == 3) {
                type = CLDRFile.CURRENCY_NAME;
            } else {
                type = CLDRFile.TERRITORY_NAME; // US or USD
            }
        }
        return type;
    }

    /**
     * Returns the short label printed in the report for the type of {@code item}, or "?" if the
     * type is not one of the five handled here.
     */
    private static String getTypeName(String item) {
        switch (getType(item)) {
            case CLDRFile.LANGUAGE_NAME:
                return "Lang";
            case CLDRFile.TZ_EXEMPLAR:
                return "Zone";
            case CLDRFile.SCRIPT_NAME:
                return "Script";
            case CLDRFile.TERRITORY_NAME:
                return "Region";
            case CLDRFile.CURRENCY_NAME:
                return "Curr.";
        }
        return "?";
    }

    /**
     * Looks up the name of {@code item} in {@code data}. Zones are read from the exemplar-city
     * path; all other types go through {@code CLDRFile.getName}.
     *
     * @return the name found, or {@code item} itself when the lookup yields {@code null}
     */
    private static String getItemName(CLDRFile data, int type, String item) {
        String result;
        if (type == CLDRFile.LANGUAGE_NAME) {
            result = data.getName(item);
        } else if (type != CLDRFile.TZ_EXEMPLAR) {
            result = data.getName(type, item);
        } else {
            String prefix = "//ldml/dates/timeZoneNames/zone[@type=\"" + item + "\"]/exemplarCity";
            result = data.getStringValue(prefix);
        }
        return result == null ? item : result;
    }

    /** Lazily built map from region code to its currency codes; see {@link #getCurrency}. */
    static Map<String, List<String>> territory_currency = null;

    /**
     * Returns the currency codes listed for {@code territory} in the supplemental currency data.
     * The map is built on first use; entries that carry a {@code to} attribute are skipped
     * (presumably currencies that are no longer in use -- confirm against the data).
     *
     * @param territory region code
     * @return the list of currency codes, or {@code null} if the region has no entries
     */
    private static List<String> getCurrency(String territory) {
        if (territory_currency == null) {
            territory_currency = new TreeMap<>();
            Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
            CLDRFile supp = cldrFactory.make(CLDRFile.SUPPLEMENTAL_NAME, false);
            for (String path : supp) {
                // <region iso3166="AR">
                // <currency iso4217="ARS" from="1992-01-01"/>
                if (path.indexOf("/currencyData") < 0 || path.indexOf("/region") < 0) {
                    continue;
                }
                XPathParts parts = XPathParts.getFrozenInstance(supp.getFullXPath(path));
                // Second-to-last element carries the region, the last one the currency.
                Map<String, String> attributes = parts.getAttributes(parts.size() - 2);
                String iso3166 = attributes.get("iso3166");
                attributes = parts.getAttributes(parts.size() - 1);
                String iso4217 = attributes.get("iso4217");
                String to = attributes.get("to");
                if (to != null) {
                    continue;
                }
                List<String> info = territory_currency.get(iso3166);
                if (info == null) {
                    info = new ArrayList<>();
                    territory_currency.put(iso3166, info);
                }
                info.add(iso4217);
            }
        }
        return territory_currency.get(territory);
    }
}