1 /* 2 * Created on May 19, 2005 3 * Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others. 4 * For terms of use, see http://www.unicode.org/terms_of_use.html 5 */ 6 package org.unicode.cldr.tool; 7 8 import java.io.BufferedReader; 9 import java.io.IOException; 10 import java.io.PrintWriter; 11 import java.util.Comparator; 12 import java.util.HashMap; 13 import java.util.HashSet; 14 import java.util.Iterator; 15 import java.util.Map; 16 import java.util.Set; 17 import java.util.TreeMap; 18 import java.util.TreeSet; 19 20 import org.unicode.cldr.draft.FileUtilities; 21 import org.unicode.cldr.util.ArrayComparator; 22 import org.unicode.cldr.util.CLDRFile; 23 import org.unicode.cldr.util.CldrUtility; 24 import org.unicode.cldr.util.Factory; 25 import org.unicode.cldr.util.LanguageTagParser; 26 import org.unicode.cldr.util.Log; 27 import org.unicode.cldr.util.StandardCodes; 28 import org.unicode.cldr.util.TransliteratorUtilities; 29 30 import com.ibm.icu.lang.UCharacter; 31 import com.ibm.icu.text.Collator; 32 import com.ibm.icu.text.Transliterator; 33 import com.ibm.icu.text.UnicodeSet; 34 import com.ibm.icu.util.ICUUncheckedIOException; 35 import com.ibm.icu.util.ULocale; 36 37 /** 38 * @throws IOException 39 * 40 */ 41 class GenerateStatistics { 42 static final boolean HACK = true; 43 static CLDRFile english; 44 static Factory factory; 45 static LanguageTagParser ltp = new LanguageTagParser(); 46 static Collator col = Collator.getInstance(ULocale.ENGLISH); 47 static boolean notitlecase = true; 48 generateSize(String sourceDir, String logDir, String match, boolean transliterate)49 public static void generateSize(String sourceDir, String logDir, String match, boolean transliterate) 50 throws IOException { 51 factory = Factory.make(sourceDir, match); 52 ToolUtilities.registerExtraTransliterators(); 53 54 PrintWriter logHtml = FileUtilities.openUTF8Writer(logDir, "test_generation_log.html"); 55 //String dir = logDir + "main" + File.separator; 56 // DraftChecker dc = new DraftChecker(dir); 57 english = factory.make("en", true); 58 Set<String> languages = new TreeSet<>(col), countries = new TreeSet<>(col), draftLanguages = new TreeSet<>( 59 col), draftCountries = new TreeSet<>(col); 60 Set<Object> nativeLanguages = new TreeSet<>(), nativeCountries = new TreeSet<>(), draftNativeLanguages = new TreeSet<>(), 61 draftNativeCountries = new TreeSet<>(); 62 int localeCount = 0; 63 int draftLocaleCount = 0; 64 65 Set<String> contents = removeSingleLanguagesWhereWeHaveScripts(factory.getAvailable()); 66 67 for (Iterator<String> it = contents.iterator(); it.hasNext();) { 68 String localeID = it.next(); 69 if (CLDRFile.isSupplementalName(localeID)) continue; 70 if (localeID.equals("root")) 71 continue; // skip root 72 System.out.println("Collecting info for:\t" + localeID.replace("_", "\t")); 73 boolean draft = false; // dc.isDraft(localeName); 74 if (draft) { 75 draftLocaleCount++; 76 addCounts(localeID, true, draftLanguages, 77 draftCountries, draftNativeLanguages, 78 draftNativeCountries); 79 } else { 80 localeCount++; 81 addCounts(localeID, false, languages, 82 countries, nativeLanguages, nativeCountries); 83 } 84 if (false) 85 Log.logln(draft + ", " + localeCount + ", " 86 + languages.size() + ", " + countries.size() + ", " 87 + draftLocaleCount + ", " + draftLanguages.size() 88 + ", " + draftCountries.size()); 89 } 90 draftLanguages.removeAll(languages); 91 for (Iterator<Object> it = nativeLanguages.iterator(); it.hasNext();) { 92 draftNativeLanguages.remove(it.next()); 93 } 94 logHtml.println("<html><head>"); 95 logHtml 96 .println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>"); 97 logHtml.println("</head><body>"); 98 logHtml.println("<p><b>Locales (" + localeCount + "):</b>"); 99 logHtml.println("<p><b>Languages (" + languages.size() + "):</b>"); 100 logHtml.println(showSet(nativeLanguages, transliterate, true)); 101 logHtml.println("<p><b>Territories (" + countries.size() + "):</b>"); 102 logHtml.println(showSet(nativeCountries, transliterate, false)); 103 logHtml.println("<p><b>Draft locales (" + draftLocaleCount + "):</b>"); 104 logHtml.println("<p><b>Draft languages (" + draftLanguages.size() 105 + "):</b>"); 106 logHtml.println(showSet(draftNativeLanguages, transliterate, true)); 107 logHtml.println("<p><b>Draft countries (" + draftCountries.size() 108 + "):</b>"); 109 logHtml.println(showSet(draftNativeCountries, transliterate, false)); 110 logHtml.println(CldrUtility.ANALYTICS); 111 logHtml.println("</body></html>"); 112 logHtml.close(); 113 } 114 115 /** 116 * 117 */ removeSingleLanguagesWhereWeHaveScripts(Set<String> contents)118 private static Set<String> removeSingleLanguagesWhereWeHaveScripts(Set<String> contents) { 119 StandardCodes sc = StandardCodes.make(); 120 contents = new TreeSet<>(contents); // make writable 121 if (false && HACK) { 122 contents.add("bs_Latn"); 123 contents.add("bs_Cyrl"); 124 contents.add("bs_Latn_BA"); 125 contents.add("bs_Cyrl_BA"); 126 } 127 // find the languages with scripts 128 Set<String> toRemove = new HashSet<>(); 129 if (HACK) toRemove.add("sh"); 130 131 for (Iterator<String> it = contents.iterator(); it.hasNext();) { 132 String localeID = it.next(); 133 if (CLDRFile.isSupplementalName(localeID)) { 134 continue; 135 } 136 // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script 137 String lang = ltp.set(localeID).getLanguage(); 138 String territory = ltp.set(localeID).getRegion(); 139 if (!sc.getGoodAvailableCodes("language").contains(lang)) { 140 System.out.println("Odd language, removing: " + localeID); 141 it.remove(); 142 continue; 143 } 144 if (territory.length() != 0 && !sc.getGoodAvailableCodes("territory").contains(territory)) { 145 System.out.println("Odd territory, removing: " + localeID); 146 it.remove(); 147 continue; 148 } 149 String langscript = ltp.set(localeID).getLanguageScript(); 150 if (!lang.equals(langscript)) toRemove.add(lang); 151 } 152 153 for (Iterator<String> it = contents.iterator(); it.hasNext();) { 154 String localeID = it.next(); 155 if (CLDRFile.isSupplementalName(localeID)) { 156 continue; 157 } 158 // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script 159 String lang = ltp.set(localeID).getLanguage(); 160 if (!toRemove.contains(lang)) continue; 161 String langscript = ltp.set(localeID).getLanguageScript(); 162 if (lang.equals(langscript)) it.remove(); 163 } 164 return contents; 165 } 166 167 static final UnicodeSet NON_LATIN = new UnicodeSet("[^[:latin:][:common:][:inherited:]]"); 168 169 /** 170 * @param nativeCountries 171 * @param transliterate 172 * TODO 173 * @param isLanguage 174 * TODO 175 */ 176 @SuppressWarnings({ "unchecked", "rawtypes" }) showSet(Set nativeCountries, boolean transliterate, boolean isLanguage)177 private static String showSet(Set nativeCountries, boolean transliterate, 178 boolean isLanguage) { 179 UnicodeSet BIDI_R = new UnicodeSet( 180 "[[:Bidi_Class=R:][:Bidi_Class=AL:]]"); 181 StringBuffer result = new StringBuffer(); 182 Map sb = new TreeMap(LanguageList.col); 183 // collect multiples by English name 184 for (Iterator it = nativeCountries.iterator(); it.hasNext();) { 185 LanguageList llist = (LanguageList) it.next(); 186 Set s = (Set) sb.get(llist.getEnglishName()); 187 if (s == null) 188 sb.put(llist.getEnglishName(), s = new TreeSet()); 189 s.add(llist); 190 } 191 192 Set<String> titleSet = new TreeSet<>(col); 193 Set<String> qualifierSet = new TreeSet<>(col); 194 195 for (Iterator<String> it = sb.keySet().iterator(); it.hasNext();) { 196 String englishName = it.next(); 197 Set s = (Set) sb.get(englishName); 198 if (result.length() != 0) { 199 result.append("; "); 200 } 201 String code = ""; 202 boolean needQualifier = s.size() != 1; 203 titleSet.clear(); 204 qualifierSet.clear(); 205 206 for (Iterator<LanguageList> it2 = s.iterator(); it2.hasNext();) { 207 LanguageList llist = it2.next(); 208 String localName = llist.getLocalName(); 209 String locale = llist.getLocale(); 210 211 // see if we need qualifier 212 String lang = locale, country = ""; 213 if (locale.length() > 3 214 && locale.charAt(locale.length() - 3) == '_') { 215 lang = locale.substring(0, locale.length() - 3); 216 country = locale.substring(locale.length() - 2); 217 } 218 219 // fix 220 if (BIDI_R.containsSome(localName)) 221 localName = '\u200E' + localName + '\u200E'; 222 223 // qualifiers += lang; 224 225 if (isLanguage) { 226 code = lang; 227 } else { 228 code = country; 229 } 230 231 if (!localName.equalsIgnoreCase(englishName)) { 232 needQualifier = true; 233 qualifierSet.add(localName); 234 235 if (transliterate && NON_LATIN.containsSome(localName) 236 && !lang.equals("ja")) { 237 String transName = localName; 238 try { 239 transName = fixedTitleCase("en", 240 toLatin.transliterate(localName)); 241 } catch (RuntimeException e) { 242 System.out.println("\t" + e.getMessage()); 243 } 244 if (NON_LATIN.containsSome(transName)) { 245 Log.logln("Can't transliterate " + localName 246 + ": " + transName); 247 } else { 248 titleSet.add(transName); 249 } 250 } 251 } 252 } 253 String title = code + (titleSet.isEmpty() ? "" : ": " + titleSet.toString()); 254 String before = "", after = ""; 255 if (title.length() != 0) { 256 before = "<span title=\'" 257 + TransliteratorUtilities.toHTML.transliterate(title) + "'>"; 258 after = "</span>"; 259 } 260 String qualifiers = qualifierSet.toString(); 261 if (!needQualifier || qualifierSet.isEmpty()) 262 qualifiers = ""; 263 else 264 qualifiers = " " + qualifiers; // qualifiers = " (" + qualifiers + ")"; 265 266 // fix 267 if (englishName.endsWith(", China")) { 268 englishName = englishName.substring(0, englishName.length() 269 - ", China".length()) 270 + " China"; 271 } 272 273 result.append(before) 274 .append( 275 TransliteratorUtilities.toHTML.transliterate(englishName 276 + qualifiers)) 277 .append(after); 278 } 279 return result.toString(); 280 } 281 282 /** 283 * @param localeID 284 * @param isDraft 285 * TODO 286 * @param draftLanguages 287 * @param draftCountries 288 * @param draftNativeLanguages 289 * @param draftNativeCountries 290 */ addCounts(String localeID, boolean isDraft, Set<String> draftLanguages, Set<String> draftCountries, Set<Object> draftNativeLanguages, Set<Object> draftNativeCountries)291 private static void addCounts(String localeID, boolean isDraft, Set<String> draftLanguages, Set<String> draftCountries, 292 Set<Object> draftNativeLanguages, Set<Object> draftNativeCountries) { 293 // ULocale uloc = new ULocale(localeName); 294 ltp.set(localeID); 295 String lang = ltp.getLanguage(); 296 String langScript = ltp.getLanguageScript(); 297 String country = ltp.getRegion(); 298 299 // dump aliases 300 // if ((country.equals("TW") || country.equals("HK") || country.equals("MO")) && lang.equals("zh")) return; 301 // if (lang.equals("zh_Hans") || lang.equals("sr_Cyrl") || lang.equals("sh")) return; 302 303 String nativeName, englishName; 304 draftLanguages.add(lang); 305 nativeName = getFixedLanguageName(localeID, langScript); 306 englishName = english.getName(langScript); 307 if (!lang.equals("en") && nativeName.equals(englishName)) { 308 Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + lang 309 + " equals English: " + nativeName); 310 } 311 312 draftNativeLanguages.add(new LanguageList(langScript, englishName, fixedTitleCase("en", nativeName))); 313 314 if (!country.equals("")) { 315 draftCountries.add(country); 316 nativeName = getFixedDisplayCountry(localeID, country); 317 englishName = getFixedDisplayCountry("en", country); 318 if (!lang.equals("en") && nativeName.equals(englishName)) { 319 Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + country 320 + " equals English: " + nativeName); 321 } 322 draftNativeCountries.add(new LanguageList(localeID, englishName, fixedTitleCase("en", nativeName))); 323 } 324 } 325 326 private static class LanguageList implements Comparable<Object> { 327 Object[] contents; 328 static Collator col = Collator.getInstance(ULocale.ENGLISH); 329 static Comparator<Object[]> comp = new ArrayComparator(new Collator[] { col, col, null }); 330 LanguageList(String locale, String englishName, String localName)331 LanguageList(String locale, String englishName, String localName) { 332 contents = new Object[] { englishName, locale, localName }; 333 } 334 335 @Override compareTo(Object o)336 public int compareTo(Object o) { 337 return comp.compare(contents, ((LanguageList) o).contents); 338 } 339 getLocale()340 String getLocale() { 341 return (String) contents[1]; 342 } 343 getEnglishName()344 String getEnglishName() { 345 return (String) contents[0]; 346 } 347 getLocalName()348 String getLocalName() { 349 return (String) contents[2]; 350 } 351 } 352 fixedTitleCase(String localeID, String in)353 static String fixedTitleCase(String localeID, String in) { 354 if (notitlecase) return in; 355 String result = UCharacter.toTitleCase(new ULocale(localeID), in, null); 356 if (HACK) { 357 result = result.replace("U.s.", "U.S."); 358 result = result.replace("S.a.r.", "S.A.R."); 359 } 360 return result; 361 } 362 363 /* 364 * static void addMapSet(Map m, Object key, Object value, Comparator com) { 365 * Set valueSet = (Set) m.get(key); 366 * if (valueSet == null) { 367 * valueSet = new TreeSet(com); 368 * m.put(key, valueSet); 369 * } 370 * valueSet.add(value); 371 * } 372 */ 373 374 /** 375 * 376 */ getFixedLanguageName(String localeID, String lang)377 private static String getFixedLanguageName(String localeID, String lang) { 378 if (HACK) { 379 if (localeID.equals("bs") || localeID.startsWith("bs_")) { 380 if (lang.equals("bs") || lang.startsWith("bs_")) return "Bosanski"; 381 } 382 } 383 CLDRFile cldr = factory.make(localeID, true); 384 return cldr.getName(lang); 385 } 386 387 /** 388 * @param uloc 389 * @return 390 */ getFixedDisplayCountry(String localeID, String country)391 private static String getFixedDisplayCountry(String localeID, String country) { 392 if (HACK) { 393 if (localeID.equals("bs") || localeID.startsWith("bs_")) { 394 if (country.equals("BA")) 395 return "\u0411\u043E\u0441\u043D\u0430 \u0438 \u0425\u0435\u0440\u0446\u0435\u0433\u043E\u0432\u0438\u043D\u0430"; 396 } 397 } 398 CLDRFile cldr = factory.make(localeID, true); 399 String name = cldr.getName("territory", country); 400 if (false && HACK) { 401 Object trial = fixCountryNames.get(name); 402 if (trial != null) { 403 return (String) trial; 404 } 405 } 406 return name; 407 } 408 409 static Map<String, String> fixCountryNames = new HashMap<>(); 410 static { 411 fixCountryNames.put("\u0408\u0443\u0433\u043E\u0441\u043B\u0430\u0432\u0438\u0458\u0430", 412 "\u0421\u0440\u0431\u0438\u0458\u0430 \u0438 \u0426\u0440\u043D\u0430 \u0413\u043E\u0440\u0430"); 413 fixCountryNames.put("Jugoslavija", "Srbija i Crna Gora"); 414 fixCountryNames.put("Yugoslavia", "Serbia and Montenegro"); 415 } 416 public static final Transliterator toLatin = Transliterator.getInstance("any-latin"); 417 418 public static class DraftChecker { 419 String dir; 420 Map<String, Object> cache = new HashMap<>(); 421 Object TRUE = new Object(); 422 Object FALSE = new Object(); 423 DraftChecker(String dir)424 public DraftChecker(String dir) { 425 this.dir = dir; 426 } 427 isDraft(String localeName)428 public boolean isDraft(String localeName) { 429 Object check = cache.get(localeName); 430 if (check != null) { 431 return check == TRUE; 432 } 433 BufferedReader pw = null; 434 //boolean result = true; 435 try { 436 pw = FileUtilities.openUTF8Reader(dir, localeName + ".xml"); 437 while (true) { 438 String line = pw.readLine(); 439 if (line == null) { 440 throw new IllegalArgumentException("Internal Error: should never get here."); 441 } 442 if (line.indexOf("<ldml") >= 0) { 443 if (line.indexOf("draft") >= 0) { 444 check = TRUE; 445 } else { 446 check = FALSE; 447 } 448 break; 449 } 450 } 451 pw.close(); 452 } catch (IOException e) { 453 throw new ICUUncheckedIOException("Failure on " + localeName + ": " + dir + localeName + ".xml", e); 454 } 455 cache.put(localeName, check); 456 return check == TRUE; 457 } 458 } 459 460 }