• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Created on May 19, 2005
3  * Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
4  * For terms of use, see http://www.unicode.org/terms_of_use.html
5  */
6 package org.unicode.cldr.tool;
7 
8 import java.io.BufferedReader;
9 import java.io.IOException;
10 import java.io.PrintWriter;
11 import java.util.Comparator;
12 import java.util.HashMap;
13 import java.util.HashSet;
14 import java.util.Iterator;
15 import java.util.Map;
16 import java.util.Set;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
19 
20 import org.unicode.cldr.draft.FileUtilities;
21 import org.unicode.cldr.util.ArrayComparator;
22 import org.unicode.cldr.util.CLDRFile;
23 import org.unicode.cldr.util.CldrUtility;
24 import org.unicode.cldr.util.Factory;
25 import org.unicode.cldr.util.LanguageTagParser;
26 import org.unicode.cldr.util.Log;
27 import org.unicode.cldr.util.StandardCodes;
28 import org.unicode.cldr.util.TransliteratorUtilities;
29 
30 import com.ibm.icu.lang.UCharacter;
31 import com.ibm.icu.text.Collator;
32 import com.ibm.icu.text.Transliterator;
33 import com.ibm.icu.text.UnicodeSet;
34 import com.ibm.icu.util.ICUUncheckedIOException;
35 import com.ibm.icu.util.ULocale;
36 
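/**
 * Tool that collects the locales available in a CLDR source directory and writes an HTML
 * summary of their languages and territories (English and native names) to a log file.
 */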
41 class GenerateStatistics {
42     static final boolean HACK = true;
43     static CLDRFile english;
44     static Factory factory;
45     static LanguageTagParser ltp = new LanguageTagParser();
46     static Collator col = Collator.getInstance(ULocale.ENGLISH);
47     static boolean notitlecase = true;
48 
generateSize(String sourceDir, String logDir, String match, boolean transliterate)49     public static void generateSize(String sourceDir, String logDir, String match, boolean transliterate)
50         throws IOException {
51         factory = Factory.make(sourceDir, match);
52         ToolUtilities.registerExtraTransliterators();
53 
54         PrintWriter logHtml = FileUtilities.openUTF8Writer(logDir, "test_generation_log.html");
55         //String dir = logDir + "main" + File.separator;
56         // DraftChecker dc = new DraftChecker(dir);
57         english = factory.make("en", true);
58         Set<String> languages = new TreeSet<>(col), countries = new TreeSet<>(col), draftLanguages = new TreeSet<>(
59             col), draftCountries = new TreeSet<>(col);
60         Set<Object> nativeLanguages = new TreeSet<>(), nativeCountries = new TreeSet<>(), draftNativeLanguages = new TreeSet<>(),
61             draftNativeCountries = new TreeSet<>();
62         int localeCount = 0;
63         int draftLocaleCount = 0;
64 
65         Set<String> contents = removeSingleLanguagesWhereWeHaveScripts(factory.getAvailable());
66 
67         for (Iterator<String> it = contents.iterator(); it.hasNext();) {
68             String localeID = it.next();
69             if (CLDRFile.isSupplementalName(localeID)) continue;
70             if (localeID.equals("root"))
71                 continue; // skip root
72             System.out.println("Collecting info for:\t" + localeID.replace("_", "\t"));
73             boolean draft = false; // dc.isDraft(localeName);
74             if (draft) {
75                 draftLocaleCount++;
76                 addCounts(localeID, true, draftLanguages,
77                     draftCountries, draftNativeLanguages,
78                     draftNativeCountries);
79             } else {
80                 localeCount++;
81                 addCounts(localeID, false, languages,
82                     countries, nativeLanguages, nativeCountries);
83             }
84             if (false)
85                 Log.logln(draft + ", " + localeCount + ", "
86                     + languages.size() + ", " + countries.size() + ", "
87                     + draftLocaleCount + ", " + draftLanguages.size()
88                     + ", " + draftCountries.size());
89         }
90         draftLanguages.removeAll(languages);
91         for (Iterator<Object> it = nativeLanguages.iterator(); it.hasNext();) {
92             draftNativeLanguages.remove(it.next());
93         }
94         logHtml.println("<html><head>");
95         logHtml
96             .println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
97         logHtml.println("</head><body>");
98         logHtml.println("<p><b>Locales (" + localeCount + "):</b>");
99         logHtml.println("<p><b>Languages (" + languages.size() + "):</b>");
100         logHtml.println(showSet(nativeLanguages, transliterate, true));
101         logHtml.println("<p><b>Territories (" + countries.size() + "):</b>");
102         logHtml.println(showSet(nativeCountries, transliterate, false));
103         logHtml.println("<p><b>Draft locales (" + draftLocaleCount + "):</b>");
104         logHtml.println("<p><b>Draft languages (" + draftLanguages.size()
105             + "):</b>");
106         logHtml.println(showSet(draftNativeLanguages, transliterate, true));
107         logHtml.println("<p><b>Draft countries (" + draftCountries.size()
108             + "):</b>");
109         logHtml.println(showSet(draftNativeCountries, transliterate, false));
110         logHtml.println(CldrUtility.ANALYTICS);
111         logHtml.println("</body></html>");
112         logHtml.close();
113     }
114 
115     /**
116      *
117      */
removeSingleLanguagesWhereWeHaveScripts(Set<String> contents)118     private static Set<String> removeSingleLanguagesWhereWeHaveScripts(Set<String> contents) {
119         StandardCodes sc = StandardCodes.make();
120         contents = new TreeSet<>(contents); // make writable
121         if (false && HACK) {
122             contents.add("bs_Latn");
123             contents.add("bs_Cyrl");
124             contents.add("bs_Latn_BA");
125             contents.add("bs_Cyrl_BA");
126         }
127         // find the languages with scripts
128         Set<String> toRemove = new HashSet<>();
129         if (HACK) toRemove.add("sh");
130 
131         for (Iterator<String> it = contents.iterator(); it.hasNext();) {
132             String localeID = it.next();
133             if (CLDRFile.isSupplementalName(localeID)) {
134                 continue;
135             }
136             // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script
137             String lang = ltp.set(localeID).getLanguage();
138             String territory = ltp.set(localeID).getRegion();
139             if (!sc.getGoodAvailableCodes("language").contains(lang)) {
140                 System.out.println("Odd language, removing: " + localeID);
141                 it.remove();
142                 continue;
143             }
144             if (territory.length() != 0 && !sc.getGoodAvailableCodes("territory").contains(territory)) {
145                 System.out.println("Odd territory, removing: " + localeID);
146                 it.remove();
147                 continue;
148             }
149             String langscript = ltp.set(localeID).getLanguageScript();
150             if (!lang.equals(langscript)) toRemove.add(lang);
151         }
152 
153         for (Iterator<String> it = contents.iterator(); it.hasNext();) {
154             String localeID = it.next();
155             if (CLDRFile.isSupplementalName(localeID)) {
156                 continue;
157             }
158             // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script
159             String lang = ltp.set(localeID).getLanguage();
160             if (!toRemove.contains(lang)) continue;
161             String langscript = ltp.set(localeID).getLanguageScript();
162             if (lang.equals(langscript)) it.remove();
163         }
164         return contents;
165     }
166 
167     static final UnicodeSet NON_LATIN = new UnicodeSet("[^[:latin:][:common:][:inherited:]]");
168 
169     /**
170      * @param nativeCountries
171      * @param transliterate
172      *            TODO
173      * @param isLanguage
174      *            TODO
175      */
176     @SuppressWarnings({ "unchecked", "rawtypes" })
showSet(Set nativeCountries, boolean transliterate, boolean isLanguage)177     private static String showSet(Set nativeCountries, boolean transliterate,
178         boolean isLanguage) {
179         UnicodeSet BIDI_R = new UnicodeSet(
180             "[[:Bidi_Class=R:][:Bidi_Class=AL:]]");
181         StringBuffer result = new StringBuffer();
182         Map sb = new TreeMap(LanguageList.col);
183         // collect multiples by English name
184         for (Iterator it = nativeCountries.iterator(); it.hasNext();) {
185             LanguageList llist = (LanguageList) it.next();
186             Set s = (Set) sb.get(llist.getEnglishName());
187             if (s == null)
188                 sb.put(llist.getEnglishName(), s = new TreeSet());
189             s.add(llist);
190         }
191 
192         Set<String> titleSet = new TreeSet<>(col);
193         Set<String> qualifierSet = new TreeSet<>(col);
194 
195         for (Iterator<String> it = sb.keySet().iterator(); it.hasNext();) {
196             String englishName = it.next();
197             Set s = (Set) sb.get(englishName);
198             if (result.length() != 0) {
199                 result.append("; ");
200             }
201             String code = "";
202             boolean needQualifier = s.size() != 1;
203             titleSet.clear();
204             qualifierSet.clear();
205 
206             for (Iterator<LanguageList> it2 = s.iterator(); it2.hasNext();) {
207                 LanguageList llist = it2.next();
208                 String localName = llist.getLocalName();
209                 String locale = llist.getLocale();
210 
211                 // see if we need qualifier
212                 String lang = locale, country = "";
213                 if (locale.length() > 3
214                     && locale.charAt(locale.length() - 3) == '_') {
215                     lang = locale.substring(0, locale.length() - 3);
216                     country = locale.substring(locale.length() - 2);
217                 }
218 
219                 // fix
220                 if (BIDI_R.containsSome(localName))
221                     localName = '\u200E' + localName + '\u200E';
222 
223                 // qualifiers += lang;
224 
225                 if (isLanguage) {
226                     code = lang;
227                 } else {
228                     code = country;
229                 }
230 
231                 if (!localName.equalsIgnoreCase(englishName)) {
232                     needQualifier = true;
233                     qualifierSet.add(localName);
234 
235                     if (transliterate && NON_LATIN.containsSome(localName)
236                         && !lang.equals("ja")) {
237                         String transName = localName;
238                         try {
239                             transName = fixedTitleCase("en",
240                                 toLatin.transliterate(localName));
241                         } catch (RuntimeException e) {
242                             System.out.println("\t" + e.getMessage());
243                         }
244                         if (NON_LATIN.containsSome(transName)) {
245                             Log.logln("Can't transliterate " + localName
246                                 + ": " + transName);
247                         } else {
248                             titleSet.add(transName);
249                         }
250                     }
251                 }
252             }
253             String title = code + (titleSet.isEmpty() ? "" : ": " + titleSet.toString());
254             String before = "", after = "";
255             if (title.length() != 0) {
256                 before = "<span title=\'"
257                     + TransliteratorUtilities.toHTML.transliterate(title) + "'>";
258                 after = "</span>";
259             }
260             String qualifiers = qualifierSet.toString();
261             if (!needQualifier || qualifierSet.isEmpty())
262                 qualifiers = "";
263             else
264                 qualifiers = " " + qualifiers; // qualifiers = " (" + qualifiers + ")";
265 
266             // fix
267             if (englishName.endsWith(", China")) {
268                 englishName = englishName.substring(0, englishName.length()
269                     - ", China".length())
270                     + " China";
271             }
272 
273             result.append(before)
274                 .append(
275                     TransliteratorUtilities.toHTML.transliterate(englishName
276                         + qualifiers))
277                 .append(after);
278         }
279         return result.toString();
280     }
281 
282     /**
283      * @param localeID
284      * @param isDraft
285      *            TODO
286      * @param draftLanguages
287      * @param draftCountries
288      * @param draftNativeLanguages
289      * @param draftNativeCountries
290      */
addCounts(String localeID, boolean isDraft, Set<String> draftLanguages, Set<String> draftCountries, Set<Object> draftNativeLanguages, Set<Object> draftNativeCountries)291     private static void addCounts(String localeID, boolean isDraft, Set<String> draftLanguages, Set<String> draftCountries,
292         Set<Object> draftNativeLanguages, Set<Object> draftNativeCountries) {
293         // ULocale uloc = new ULocale(localeName);
294         ltp.set(localeID);
295         String lang = ltp.getLanguage();
296         String langScript = ltp.getLanguageScript();
297         String country = ltp.getRegion();
298 
299         // dump aliases
300         // if ((country.equals("TW") || country.equals("HK") || country.equals("MO")) && lang.equals("zh")) return;
301         // if (lang.equals("zh_Hans") || lang.equals("sr_Cyrl") || lang.equals("sh")) return;
302 
303         String nativeName, englishName;
304         draftLanguages.add(lang);
305         nativeName = getFixedLanguageName(localeID, langScript);
306         englishName = english.getName(langScript);
307         if (!lang.equals("en") && nativeName.equals(englishName)) {
308             Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + lang
309                 + " equals English: " + nativeName);
310         }
311 
312         draftNativeLanguages.add(new LanguageList(langScript, englishName, fixedTitleCase("en", nativeName)));
313 
314         if (!country.equals("")) {
315             draftCountries.add(country);
316             nativeName = getFixedDisplayCountry(localeID, country);
317             englishName = getFixedDisplayCountry("en", country);
318             if (!lang.equals("en") && nativeName.equals(englishName)) {
319                 Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + country
320                     + " equals English: " + nativeName);
321             }
322             draftNativeCountries.add(new LanguageList(localeID, englishName, fixedTitleCase("en", nativeName)));
323         }
324     }
325 
326     private static class LanguageList implements Comparable<Object> {
327         Object[] contents;
328         static Collator col = Collator.getInstance(ULocale.ENGLISH);
329         static Comparator<Object[]> comp = new ArrayComparator(new Collator[] { col, col, null });
330 
LanguageList(String locale, String englishName, String localName)331         LanguageList(String locale, String englishName, String localName) {
332             contents = new Object[] { englishName, locale, localName };
333         }
334 
335         @Override
compareTo(Object o)336         public int compareTo(Object o) {
337             return comp.compare(contents, ((LanguageList) o).contents);
338         }
339 
getLocale()340         String getLocale() {
341             return (String) contents[1];
342         }
343 
getEnglishName()344         String getEnglishName() {
345             return (String) contents[0];
346         }
347 
getLocalName()348         String getLocalName() {
349             return (String) contents[2];
350         }
351     }
352 
fixedTitleCase(String localeID, String in)353     static String fixedTitleCase(String localeID, String in) {
354         if (notitlecase) return in;
355         String result = UCharacter.toTitleCase(new ULocale(localeID), in, null);
356         if (HACK) {
357             result = result.replace("U.s.", "U.S.");
358             result = result.replace("S.a.r.", "S.A.R.");
359         }
360         return result;
361     }
362 
363     /*
364      * static void addMapSet(Map m, Object key, Object value, Comparator com) {
365      * Set valueSet = (Set) m.get(key);
366      * if (valueSet == null) {
367      * valueSet = new TreeSet(com);
368      * m.put(key, valueSet);
369      * }
370      * valueSet.add(value);
371      * }
372      */
373 
374     /**
375      *
376      */
getFixedLanguageName(String localeID, String lang)377     private static String getFixedLanguageName(String localeID, String lang) {
378         if (HACK) {
379             if (localeID.equals("bs") || localeID.startsWith("bs_")) {
380                 if (lang.equals("bs") || lang.startsWith("bs_")) return "Bosanski";
381             }
382         }
383         CLDRFile cldr = factory.make(localeID, true);
384         return cldr.getName(lang);
385     }
386 
387     /**
388      * @param uloc
389      * @return
390      */
getFixedDisplayCountry(String localeID, String country)391     private static String getFixedDisplayCountry(String localeID, String country) {
392         if (HACK) {
393             if (localeID.equals("bs") || localeID.startsWith("bs_")) {
394                 if (country.equals("BA"))
395                     return "\u0411\u043E\u0441\u043D\u0430 \u0438 \u0425\u0435\u0440\u0446\u0435\u0433\u043E\u0432\u0438\u043D\u0430";
396             }
397         }
398         CLDRFile cldr = factory.make(localeID, true);
399         String name = cldr.getName("territory", country);
400         if (false && HACK) {
401             Object trial = fixCountryNames.get(name);
402             if (trial != null) {
403                 return (String) trial;
404             }
405         }
406         return name;
407     }
408 
409     static Map<String, String> fixCountryNames = new HashMap<>();
410     static {
411         fixCountryNames.put("\u0408\u0443\u0433\u043E\u0441\u043B\u0430\u0432\u0438\u0458\u0430",
412             "\u0421\u0440\u0431\u0438\u0458\u0430 \u0438 \u0426\u0440\u043D\u0430 \u0413\u043E\u0440\u0430");
413         fixCountryNames.put("Jugoslavija", "Srbija i Crna Gora");
414         fixCountryNames.put("Yugoslavia", "Serbia and Montenegro");
415     }
416     public static final Transliterator toLatin = Transliterator.getInstance("any-latin");
417 
418     public static class DraftChecker {
419         String dir;
420         Map<String, Object> cache = new HashMap<>();
421         Object TRUE = new Object();
422         Object FALSE = new Object();
423 
DraftChecker(String dir)424         public DraftChecker(String dir) {
425             this.dir = dir;
426         }
427 
isDraft(String localeName)428         public boolean isDraft(String localeName) {
429             Object check = cache.get(localeName);
430             if (check != null) {
431                 return check == TRUE;
432             }
433             BufferedReader pw = null;
434             //boolean result = true;
435             try {
436                 pw = FileUtilities.openUTF8Reader(dir, localeName + ".xml");
437                 while (true) {
438                     String line = pw.readLine();
439                     if (line == null) {
440                         throw new IllegalArgumentException("Internal Error: should never get here.");
441                     }
442                     if (line.indexOf("<ldml") >= 0) {
443                         if (line.indexOf("draft") >= 0) {
444                             check = TRUE;
445                         } else {
446                             check = FALSE;
447                         }
448                         break;
449                     }
450                 }
451                 pw.close();
452             } catch (IOException e) {
453                 throw new ICUUncheckedIOException("Failure on " + localeName + ": " + dir + localeName + ".xml", e);
454             }
455             cache.put(localeName, check);
456             return check == TRUE;
457         }
458     }
459 
460 }