• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Created on May 19, 2005
3  * Copyright (C) 2004-2005, Unicode, Inc., International Business Machines Corporation, and others.
4  * For terms of use, see http://www.unicode.org/terms_of_use.html
5  */
6 package org.unicode.cldr.tool;
7 
8 import java.io.BufferedReader;
9 import java.io.IOException;
10 import java.io.PrintWriter;
11 import java.util.Comparator;
12 import java.util.HashMap;
13 import java.util.HashSet;
14 import java.util.Iterator;
15 import java.util.Map;
16 import java.util.Set;
17 import java.util.TreeMap;
18 import java.util.TreeSet;
19 
20 import org.unicode.cldr.draft.FileUtilities;
21 import org.unicode.cldr.util.ArrayComparator;
22 import org.unicode.cldr.util.CLDRFile;
23 import org.unicode.cldr.util.CldrUtility;
24 import org.unicode.cldr.util.Factory;
25 import org.unicode.cldr.util.LanguageTagParser;
26 import org.unicode.cldr.util.Log;
27 import org.unicode.cldr.util.StandardCodes;
28 import org.unicode.cldr.util.TransliteratorUtilities;
29 
30 import com.ibm.icu.lang.UCharacter;
31 import com.ibm.icu.text.Collator;
32 import com.ibm.icu.text.Transliterator;
33 import com.ibm.icu.text.UnicodeSet;
34 import com.ibm.icu.util.ICUUncheckedIOException;
35 import com.ibm.icu.util.ULocale;
36 
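/**
 * Collects the languages and territories covered by the available CLDR locales
 * and writes an HTML summary of them (see {@code generateSize}).
 */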
41 class GenerateStatistics {
42     static final boolean HACK = true;
43     static CLDRFile english;
44     static Factory factory;
45     static LanguageTagParser ltp = new LanguageTagParser();
46     static Collator col = Collator.getInstance(ULocale.ENGLISH);
47     static boolean notitlecase = true;
48 
generateSize(String sourceDir, String logDir, String match, boolean transliterate)49     public static void generateSize(String sourceDir, String logDir, String match, boolean transliterate)
50         throws IOException {
51         factory = Factory.make(sourceDir, match);
52         ToolUtilities.registerExtraTransliterators();
53 
54         PrintWriter logHtml = FileUtilities.openUTF8Writer(logDir, "test_generation_log.html");
55         //String dir = logDir + "main" + File.separator;
56         // DraftChecker dc = new DraftChecker(dir);
57         english = factory.make("en", true);
58         Set<String> languages = new TreeSet<String>(col), countries = new TreeSet<String>(col), draftLanguages = new TreeSet<String>(
59             col), draftCountries = new TreeSet<String>(col);
60         Set<Object> nativeLanguages = new TreeSet<Object>(), nativeCountries = new TreeSet<Object>(), draftNativeLanguages = new TreeSet<Object>(),
61             draftNativeCountries = new TreeSet<Object>();
62         int localeCount = 0;
63         int draftLocaleCount = 0;
64 
65         Set<String> contents = removeSingleLanguagesWhereWeHaveScripts(factory.getAvailable());
66 
67         for (Iterator<String> it = contents.iterator(); it.hasNext();) {
68             String localeID = it.next();
69             if (CLDRFile.isSupplementalName(localeID)) continue;
70             if (localeID.equals("root"))
71                 continue; // skip root
72             System.out.println("Collecting info for:\t" + localeID.replace("_", "\t"));
73             boolean draft = false; // dc.isDraft(localeName);
74             if (draft) {
75                 draftLocaleCount++;
76                 addCounts(localeID, true, draftLanguages,
77                     draftCountries, draftNativeLanguages,
78                     draftNativeCountries);
79             } else {
80                 localeCount++;
81                 addCounts(localeID, false, languages,
82                     countries, nativeLanguages, nativeCountries);
83             }
84             if (false)
85                 Log.logln(draft + ", " + localeCount + ", "
86                     + languages.size() + ", " + countries.size() + ", "
87                     + draftLocaleCount + ", " + draftLanguages.size()
88                     + ", " + draftCountries.size());
89         }
90         draftLanguages.removeAll(languages);
91         for (Iterator<Object> it = nativeLanguages.iterator(); it.hasNext();) {
92             draftNativeLanguages.remove(it.next());
93         }
94         logHtml.println("<html><head>");
95         logHtml
96             .println("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
97         logHtml.println("</head><body>");
98         logHtml.println("<p><b>Locales (" + localeCount + "):</b>");
99         logHtml.println("<p><b>Languages (" + languages.size() + "):</b>");
100         logHtml.println(showSet(nativeLanguages, transliterate, true));
101         logHtml.println("<p><b>Territories (" + countries.size() + "):</b>");
102         logHtml.println(showSet(nativeCountries, transliterate, false));
103         logHtml.println("<p><b>Draft locales (" + draftLocaleCount + "):</b>");
104         logHtml.println("<p><b>Draft languages (" + draftLanguages.size()
105             + "):</b>");
106         logHtml.println(showSet(draftNativeLanguages, transliterate, true));
107         logHtml.println("<p><b>Draft countries (" + draftCountries.size()
108             + "):</b>");
109         logHtml.println(showSet(draftNativeCountries, transliterate, false));
110         logHtml.println(CldrUtility.ANALYTICS);
111         logHtml.println("</body></html>");
112         logHtml.close();
113     }
114 
115     /**
116      *
117      */
removeSingleLanguagesWhereWeHaveScripts(Set<String> contents)118     private static Set<String> removeSingleLanguagesWhereWeHaveScripts(Set<String> contents) {
119         StandardCodes sc = StandardCodes.make();
120         contents = new TreeSet<String>(contents); // make writable
121         if (false && HACK) {
122             contents.add("bs_Latn");
123             contents.add("bs_Cyrl");
124             contents.add("bs_Latn_BA");
125             contents.add("bs_Cyrl_BA");
126         }
127         // find the languages with scripts
128         Set<String> toRemove = new HashSet<String>();
129         if (HACK) toRemove.add("sh");
130 
131         for (Iterator<String> it = contents.iterator(); it.hasNext();) {
132             String localeID = it.next();
133             if (CLDRFile.isSupplementalName(localeID)) {
134                 continue;
135             }
136             // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script
137             String lang = ltp.set(localeID).getLanguage();
138             String territory = ltp.set(localeID).getRegion();
139             if (!sc.getGoodAvailableCodes("language").contains(lang)) {
140                 System.out.println("Odd language, removing: " + localeID);
141                 it.remove();
142                 continue;
143             }
144             if (territory.length() != 0 && !sc.getGoodAvailableCodes("territory").contains(territory)) {
145                 System.out.println("Odd territory, removing: " + localeID);
146                 it.remove();
147                 continue;
148             }
149             String langscript = ltp.set(localeID).getLanguageScript();
150             if (!lang.equals(langscript)) toRemove.add(lang);
151         }
152 
153         for (Iterator<String> it = contents.iterator(); it.hasNext();) {
154             String localeID = it.next();
155             if (CLDRFile.isSupplementalName(localeID)) {
156                 continue;
157             }
158             // if there is a lang_script, then remove everything starting with lang that doesn't have "a" script
159             String lang = ltp.set(localeID).getLanguage();
160             if (!toRemove.contains(lang)) continue;
161             String langscript = ltp.set(localeID).getLanguageScript();
162             if (lang.equals(langscript)) it.remove();
163         }
164         return contents;
165     }
166 
167     static final UnicodeSet NON_LATIN = new UnicodeSet("[^[:latin:][:common:][:inherited:]]");
168 
169     /**
170      * @param nativeCountries
171      * @param transliterate
172      *            TODO
173      * @param isLanguage
174      *            TODO
175      */
176     @SuppressWarnings({ "unchecked", "rawtypes" })
showSet(Set nativeCountries, boolean transliterate, boolean isLanguage)177     private static String showSet(Set nativeCountries, boolean transliterate,
178         boolean isLanguage) {
179         UnicodeSet BIDI_R = new UnicodeSet(
180             "[[:Bidi_Class=R:][:Bidi_Class=AL:]]");
181         StringBuffer result = new StringBuffer();
182         Map sb = new TreeMap(LanguageList.col);
183         // collect multiples by English name
184         for (Iterator it = nativeCountries.iterator(); it.hasNext();) {
185             LanguageList llist = (LanguageList) it.next();
186             Set s = (Set) sb.get(llist.getEnglishName());
187             if (s == null)
188                 sb.put(llist.getEnglishName(), s = new TreeSet());
189             s.add(llist);
190         }
191 
192         Set<String> titleSet = new TreeSet<String>(col);
193         Set<String> qualifierSet = new TreeSet<String>(col);
194 
195         for (Iterator<String> it = sb.keySet().iterator(); it.hasNext();) {
196             String englishName = it.next();
197             Set s = (Set) sb.get(englishName);
198             if (result.length() != 0) {
199                 result.append("; ");
200             }
201             String code = "";
202             boolean needQualifier = s.size() != 1;
203             titleSet.clear();
204             qualifierSet.clear();
205 
206             for (Iterator<LanguageList> it2 = s.iterator(); it2.hasNext();) {
207                 LanguageList llist = it2.next();
208                 String localName = llist.getLocalName();
209                 String locale = llist.getLocale();
210 
211                 // see if we need qualifier
212                 String lang = locale, country = "";
213                 if (locale.length() > 3
214                     && locale.charAt(locale.length() - 3) == '_') {
215                     lang = locale.substring(0, locale.length() - 3);
216                     country = locale.substring(locale.length() - 2);
217                 }
218 
219                 // fix
220                 if (BIDI_R.containsSome(localName))
221                     localName = '\u200E' + localName + '\u200E';
222 
223                 // qualifiers += lang;
224 
225                 if (isLanguage) {
226                     code = lang;
227                 } else {
228                     code = country;
229                 }
230 
231                 if (!localName.equalsIgnoreCase(englishName)) {
232                     needQualifier = true;
233                     qualifierSet.add(localName);
234 
235                     if (transliterate && NON_LATIN.containsSome(localName)
236                         && !lang.equals("ja")) {
237                         String transName = localName;
238                         try {
239                             transName = fixedTitleCase("en",
240                                 toLatin.transliterate(localName));
241                         } catch (RuntimeException e) {
242                             System.out.println("\t" + e.getMessage());
243                         }
244                         if (NON_LATIN.containsSome(transName)) {
245                             Log.logln("Can't transliterate " + localName
246                                 + ": " + transName);
247                         } else {
248                             titleSet.add(transName);
249                         }
250                     }
251                 }
252             }
253             String title = code + (titleSet.isEmpty() ? "" : ": " + titleSet.toString());
254             String before = "", after = "";
255             if (title.length() != 0) {
256                 before = "<span title=\'"
257                     + TransliteratorUtilities.toHTML.transliterate(title) + "'>";
258                 after = "</span>";
259             }
260             String qualifiers = qualifierSet.toString();
261             if (!needQualifier || qualifierSet.isEmpty())
262                 qualifiers = "";
263             else
264                 qualifiers = " " + qualifiers; // qualifiers = " (" + qualifiers + ")";
265 
266             // fix
267             if (englishName.endsWith(", China")) {
268                 englishName = englishName.substring(0, englishName.length()
269                     - ", China".length())
270                     + " China";
271             }
272 
273             result.append(before)
274                 .append(
275                     TransliteratorUtilities.toHTML.transliterate(englishName
276                         + qualifiers))
277                 .append(after);
278         }
279         return result.toString();
280     }
281 
282     /**
283      * @param localeID
284      * @param isDraft
285      *            TODO
286      * @param draftLanguages
287      * @param draftCountries
288      * @param draftNativeLanguages
289      * @param draftNativeCountries
290      */
addCounts(String localeID, boolean isDraft, Set<String> draftLanguages, Set<String> draftCountries, Set<Object> draftNativeLanguages, Set<Object> draftNativeCountries)291     private static void addCounts(String localeID, boolean isDraft, Set<String> draftLanguages, Set<String> draftCountries,
292         Set<Object> draftNativeLanguages, Set<Object> draftNativeCountries) {
293         // ULocale uloc = new ULocale(localeName);
294         ltp.set(localeID);
295         String lang = ltp.getLanguage();
296         String langScript = ltp.getLanguageScript();
297         String country = ltp.getRegion();
298 
299         // dump aliases
300         // if ((country.equals("TW") || country.equals("HK") || country.equals("MO")) && lang.equals("zh")) return;
301         // if (lang.equals("zh_Hans") || lang.equals("sr_Cyrl") || lang.equals("sh")) return;
302 
303         String nativeName, englishName;
304         draftLanguages.add(lang);
305         nativeName = getFixedLanguageName(localeID, langScript);
306         englishName = english.getName(langScript);
307         if (!lang.equals("en") && nativeName.equals(englishName)) {
308             Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + lang
309                 + " equals English: " + nativeName);
310         }
311 
312         draftNativeLanguages.add(new LanguageList(langScript, englishName, fixedTitleCase("en", nativeName)));
313 
314         if (!country.equals("")) {
315             draftCountries.add(country);
316             nativeName = getFixedDisplayCountry(localeID, country);
317             englishName = getFixedDisplayCountry("en", country);
318             if (!lang.equals("en") && nativeName.equals(englishName)) {
319                 Log.logln((isDraft ? "D" : "") + "\tWarning: in " + localeID + ", display name for " + country
320                     + " equals English: " + nativeName);
321             }
322             draftNativeCountries.add(new LanguageList(localeID, englishName, fixedTitleCase("en", nativeName)));
323         }
324     }
325 
326     private static class LanguageList implements Comparable<Object> {
327         Object[] contents;
328         static Collator col = Collator.getInstance(ULocale.ENGLISH);
329         static Comparator<Object[]> comp = new ArrayComparator(new Collator[] { col, col, null });
330 
LanguageList(String locale, String englishName, String localName)331         LanguageList(String locale, String englishName, String localName) {
332             contents = new Object[] { englishName, locale, localName };
333         }
334 
compareTo(Object o)335         public int compareTo(Object o) {
336             return comp.compare(contents, ((LanguageList) o).contents);
337         }
338 
getLocale()339         String getLocale() {
340             return (String) contents[1];
341         }
342 
getEnglishName()343         String getEnglishName() {
344             return (String) contents[0];
345         }
346 
getLocalName()347         String getLocalName() {
348             return (String) contents[2];
349         }
350     }
351 
fixedTitleCase(String localeID, String in)352     static String fixedTitleCase(String localeID, String in) {
353         if (notitlecase) return in;
354         String result = UCharacter.toTitleCase(new ULocale(localeID), in, null);
355         if (HACK) {
356             result = GenerateCldrTests.replace(result, "U.s.", "U.S.");
357             result = GenerateCldrTests.replace(result, "S.a.r.", "S.A.R.");
358         }
359         return result;
360     }
361 
362     /*
363      * static void addMapSet(Map m, Object key, Object value, Comparator com) {
364      * Set valueSet = (Set) m.get(key);
365      * if (valueSet == null) {
366      * valueSet = new TreeSet(com);
367      * m.put(key, valueSet);
368      * }
369      * valueSet.add(value);
370      * }
371      */
372 
373     /**
374      *
375      */
getFixedLanguageName(String localeID, String lang)376     private static String getFixedLanguageName(String localeID, String lang) {
377         if (HACK) {
378             if (localeID.equals("bs") || localeID.startsWith("bs_")) {
379                 if (lang.equals("bs") || lang.startsWith("bs_")) return "Bosanski";
380             }
381         }
382         CLDRFile cldr = factory.make(localeID, true);
383         return cldr.getName(lang);
384     }
385 
386     /**
387      * @param uloc
388      * @return
389      */
getFixedDisplayCountry(String localeID, String country)390     private static String getFixedDisplayCountry(String localeID, String country) {
391         if (HACK) {
392             if (localeID.equals("bs") || localeID.startsWith("bs_")) {
393                 if (country.equals("BA"))
394                     return "\u0411\u043E\u0441\u043D\u0430 \u0438 \u0425\u0435\u0440\u0446\u0435\u0433\u043E\u0432\u0438\u043D\u0430";
395             }
396         }
397         CLDRFile cldr = factory.make(localeID, true);
398         String name = cldr.getName("territory", country);
399         if (false && HACK) {
400             Object trial = fixCountryNames.get(name);
401             if (trial != null) {
402                 return (String) trial;
403             }
404         }
405         return name;
406     }
407 
408     static Map<String, String> fixCountryNames = new HashMap<String, String>();
409     static {
410         fixCountryNames.put("\u0408\u0443\u0433\u043E\u0441\u043B\u0430\u0432\u0438\u0458\u0430",
411             "\u0421\u0440\u0431\u0438\u0458\u0430 \u0438 \u0426\u0440\u043D\u0430 \u0413\u043E\u0440\u0430");
412         fixCountryNames.put("Jugoslavija", "Srbija i Crna Gora");
413         fixCountryNames.put("Yugoslavia", "Serbia and Montenegro");
414     }
415     public static final Transliterator toLatin = Transliterator.getInstance("any-latin");
416 
417     public static class DraftChecker {
418         String dir;
419         Map<String, Object> cache = new HashMap<String, Object>();
420         Object TRUE = new Object();
421         Object FALSE = new Object();
422 
DraftChecker(String dir)423         public DraftChecker(String dir) {
424             this.dir = dir;
425         }
426 
isDraft(String localeName)427         public boolean isDraft(String localeName) {
428             Object check = cache.get(localeName);
429             if (check != null) {
430                 return check == TRUE;
431             }
432             BufferedReader pw = null;
433             //boolean result = true;
434             try {
435                 pw = FileUtilities.openUTF8Reader(dir, localeName + ".xml");
436                 while (true) {
437                     String line = pw.readLine();
438                     if (line == null) {
439                         throw new IllegalArgumentException("Internal Error: should never get here.");
440                     }
441                     if (line.indexOf("<ldml") >= 0) {
442                         if (line.indexOf("draft") >= 0) {
443                             check = TRUE;
444                         } else {
445                             check = FALSE;
446                         }
447                         break;
448                     }
449                 }
450                 pw.close();
451             } catch (IOException e) {
452                 throw new ICUUncheckedIOException("Failure on " + localeName + ": " + dir + localeName + ".xml", e);
453             }
454             cache.put(localeName, check);
455             return check == TRUE;
456         }
457     }
458 
459 }