• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.draft;
2 
3 import java.util.Arrays;
4 import java.util.Collections;
5 import java.util.HashMap;
6 import java.util.HashSet;
7 import java.util.LinkedHashSet;
8 import java.util.List;
9 import java.util.Locale;
10 import java.util.Map;
11 import java.util.Map.Entry;
12 import java.util.Set;
13 import java.util.TreeSet;
14 
15 import org.unicode.cldr.tool.CountryCodeConverter;
16 import org.unicode.cldr.util.CldrUtility;
17 import org.unicode.cldr.util.Containment;
18 import org.unicode.cldr.util.SemiFileReader;
19 import org.unicode.cldr.util.StandardCodes;
20 import org.unicode.cldr.util.With;
21 
22 import com.ibm.icu.dev.util.CollectionUtilities;
23 import com.ibm.icu.impl.Relation;
24 import com.ibm.icu.lang.UScript;
25 import com.ibm.icu.text.Transform;
26 import com.ibm.icu.text.UTF16;
27 import com.ibm.icu.util.ICUException;
28 import com.ibm.icu.util.VersionInfo;
29 
30 public class ScriptMetadata {
31     private static final int MAX_RANK = 33;
32     private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv";
33     private static final VersionInfo UNICODE_VERSION = VersionInfo.getInstance(
34         CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "13"));
35 
    // To get the data, go to the Script Metadata spreadsheet.
    // Download As Comma Separated Items into DATA_FILE.
    // Set the last string in the UNICODE_VERSION line above to the right Unicode version (for Unicode beta).
    // Run TestScriptMetadata.
    // Then run GenerateScriptMetadata.
    // See http://cldr.unicode.org/development/updating-codes/updating-script-metadata
42     private enum Column {
43         // must match the spreadsheet header (caseless compare) or have the alternate header as an argument.
44         // doesn't have to be in order
45         WR, AGE, SAMPLE_CODE, ID_USAGE("ID Usage (UAX31)"), RTL("RTL?"), LB_LETTERS("LB letters?"), SHAPING_REQ("Shaping Req?"), IME("IME?"), ORIGIN_COUNTRY(
46             "Origin Country"), DENSITY("~Density"), LANG_CODE, HAS_CASE("Has Case?");
47 
48         int columnNumber = -1;
49         final Set<String> names = new HashSet<String>();
50 
Column(String... alternateNames)51         Column(String... alternateNames) {
52             names.add(this.name());
53             for (String name : alternateNames) {
54                 names.add(name.toUpperCase(Locale.ENGLISH));
55             }
56         }
57 
setColumns(String[] headers)58         static void setColumns(String[] headers) {
59             for (int i = 0; i < headers.length; ++i) {
60                 String header = headers[i].toUpperCase(Locale.ENGLISH);
61                 for (Column v : values()) {
62                     if (v.names.contains(header)) {
63                         v.columnNumber = i;
64                     }
65                 }
66             }
67             for (Column v : values()) {
68                 if (v.columnNumber == -1) {
69                     throw new IllegalArgumentException("Missing field for " + v
70                         + ", may need to add additional column alias");
71                 }
72             }
73         }
74 
getItem(String[] items)75         String getItem(String[] items) {
76             return items[columnNumber];
77         }
78 
getInt(String[] items, int defaultValue)79         int getInt(String[] items, int defaultValue) {
80             final String item = getItem(items);
81             return item.isEmpty() || item.equalsIgnoreCase("n/a") ? defaultValue : Integer.parseInt(item);
82         }
83     }
84 
85     public enum IdUsage {
86         UNKNOWN("Other"), EXCLUSION("Historic"), LIMITED_USE("Limited Use"), ASPIRATIONAL("Aspirational"), RECOMMENDED("Major Use");
87 
88         public final String name;
89 
IdUsage(String name)90         private IdUsage(String name) {
91             this.name = name;
92         }
93     }
94 
95     public enum Trinary {
96         UNKNOWN, NO, YES
97     }
98 
99     public enum Shaping {
100         UNKNOWN, NO, MIN, YES
101     }
102 
103     static StandardCodes SC = StandardCodes.make();
104     // static HashMap<String,String> NAME_TO_REGION_CODE = new HashMap<String,String>();
105     // static HashMap<String,String> NAME_TO_LANGUAGE_CODE = new HashMap<String,String>();
106     static EnumLookup<Shaping> shapingLookup = EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN);
107     static EnumLookup<Trinary> trinaryLookup = EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN);
108     static EnumLookup<IdUsage> idUsageLookup = EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN);
109     static {
110         // addNameToCode("language", NAME_TO_LANGUAGE_CODE);
111         // // NAME_TO_LANGUAGE_CODE.put("", "und");
112         // NAME_TO_LANGUAGE_CODE.put("N/A", "und");
113         // addSynonym(NAME_TO_LANGUAGE_CODE, "Ancient Greek", "Ancient Greek (to 1453)");
114         // //addSynonym(NAME_TO_LANGUAGE_CODE, "Khmer", "Cambodian");
115         // addSynonym(NAME_TO_LANGUAGE_CODE, "Old Irish", "Old Irish (to 900)");
116 
117         // addNameToCode("region", NAME_TO_REGION_CODE);
118         // // NAME_TO_REGION_CODE.put("UNKNOWN", "ZZ");
119         // // NAME_TO_REGION_CODE.put("", "ZZ");
120         // NAME_TO_REGION_CODE.put("N/A", "ZZ");
121         // addSynonym(NAME_TO_REGION_CODE, "Laos", "Lao People's Democratic Republic");
122     }
123 
addNameToCode(String type, Map<String, String> hashMap)124     public static void addNameToCode(String type, Map<String, String> hashMap) {
125         for (String language : SC.getAvailableCodes(type)) {
126             Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language);
127             String name = fullData.get("Description");
128             hashMap.put(name.toUpperCase(Locale.ENGLISH), language);
129         }
130     }
131 
addSynonym(Map<String, String> map, String newTerm, String oldTerm)132     public static void addSynonym(Map<String, String> map, String newTerm, String oldTerm) {
133         String code = map.get(oldTerm.toUpperCase(Locale.ENGLISH));
134         map.put(newTerm.toUpperCase(Locale.ENGLISH), code);
135     }
136 
137     public static final class SkipNewUnicodeException extends ICUException {
138     }
139 
140     public static class Info implements Comparable<Info> {
141         public final int rank;
142         public final VersionInfo age;
143         public final String sampleChar;
144         public final IdUsage idUsage;
145         public final Trinary rtl;
146         public final Trinary lbLetters;
147         public final Trinary hasCase;
148         public final Shaping shapingReq;
149         public final Trinary ime;
150         public final int density;
151         public final String originCountry;
152         public final String likelyLanguage;
153 
Info(String[] items)154         private Info(String[] items) {
155             // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no
156             rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK);
157             age = VersionInfo.getInstance(Column.AGE.getItem(items));
158             if (age.compareTo(UNICODE_VERSION) > 0) {
159                 throw new SkipNewUnicodeException();
160             }
161             // Parse the code point of the sample character, rather than the sample character itself.
162             // The code point is more reliable, especially when the spreadsheet has a bug
163             // for supplementary characters.
164             int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16);
165             sampleChar = UTF16.valueOf(sampleCode);
166             idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items));
167             rtl = trinaryLookup.forString(Column.RTL.getItem(items));
168             lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items));
169             shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items));
170             ime = trinaryLookup.forString(Column.IME.getItem(items));
171             hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items));
172             density = Column.DENSITY.getInt(items, -1);
173 
174             final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items);
175             String country = CountryCodeConverter.getCodeFromName(countryRaw);
176             // NAME_TO_REGION_CODE.get(countryRaw.toUpperCase(Locale.ENGLISH));
177             if (country == null) {
178                 errors.add("Can't map " + countryRaw + " to country/region");
179             }
180             originCountry = country == null ? "ZZ" : country;
181 
182             String langCode = Column.LANG_CODE.getItem(items);
183             if (langCode.equals("n/a")) {
184                 langCode = null;
185             }
186             likelyLanguage = langCode == null ? "und" : langCode;
187         }
188 
Info(Info other, String string, String sampleCharacter)189         public Info(Info other, String string, String sampleCharacter) {
190             rank = other.rank;
191             age = other.age;
192             sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter;
193             idUsage = other.idUsage;
194             rtl = other.rtl;
195             lbLetters = other.lbLetters;
196             hasCase = other.hasCase;
197             shapingReq = other.shapingReq;
198             ime = "IME:YES".equals(string) ? Trinary.YES : other.ime;
199             density = other.density;
200             originCountry = other.originCountry;
201             likelyLanguage = other.likelyLanguage;
202         }
203 
204         // public Trinary parseTrinary(Column title, String[] items) {
205         // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH));
206         // }
fix(String in)207         String fix(String in) {
208             return in.toUpperCase(Locale.ENGLISH).replace("N/A", "UNKNOWN").replace("?", "UNKNOWN")
209                 .replace("RTL", "YES");
210         }
211 
toString()212         public String toString() {
213             return rank
214                 + "\tSample: " + sampleChar
215                 + "\tCountry: " + getName("territory", originCountry) + " (" + originCountry + ")"
216                 + "\tLanguage: " + getName("language", likelyLanguage) + " (" + likelyLanguage + ")"
217                 + "\tId: " + idUsage
218                 + "\tRtl: " + rtl
219                 + "\tLb: " + lbLetters
220                 + "\tShape: " + shapingReq
221                 + "\tIme: " + ime
222                 + "\tCase: " + hasCase
223                 + "\tDensity: " + density;
224         }
225 
getName(String type, String code)226         public Object getName(String type, String code) {
227             List<String> fullData = SC.getFullData(type, code);
228             if (fullData == null) {
229                 return "unavailable";
230             }
231             return fullData.get(0);
232         }
233 
234         @Override
compareTo(Info o)235         public int compareTo(Info o) {
236             // we don't actually care what the comparison value is, as long as it is transitive and consistent with equals.
237             return toString().compareTo(o.toString());
238         }
239     }
240 
241     public static Set<String> errors = new LinkedHashSet<String>();
242     static HashMap<String, Integer> titleToColumn = new HashMap<String, Integer>();
243 
244     private static class MyFileReader extends SemiFileReader {
245         private Map<String, Info> data = new HashMap<String, Info>();
246 
247         @Override
isCodePoint()248         protected boolean isCodePoint() {
249             return false;
250         }
251 
252         @Override
splitLine(String line)253         protected String[] splitLine(String line) {
254             return CldrUtility.splitCommaSeparated(line);
255         };
256 
257         @Override
handleLine(int lineCount, int start, int end, String[] items)258         protected boolean handleLine(int lineCount, int start, int end, String[] items) {
259             if (items[0].startsWith("For help") || items[0].isEmpty()) {
260                 return true; // header lines
261             }
262             if (items[0].equals("WR")) {
263                 Column.setColumns(items);
264                 return true;
265             }
266             Info info;
267             try {
268                 info = new Info(items);
269             } catch (SkipNewUnicodeException e) {
270                 return true;
271             } catch (Exception e) {
272                 errors.add(e.getClass().getName() + "\t" + e.getMessage() + "\t" + Arrays.asList(items));
273                 return true;
274             }
275 
276             String script = items[2];
277             data.put(script, info);
278             Set<String> extras = EXTRAS.get(script);
279             if (extras != null) {
280                 for (String script2 : extras) {
281                     Info info2 = info;
282                     if (script2.equals("Jpan")) {
283                         // HACK
284                         info2 = new Info(info, "IME:YES", null);
285                     } else if (script2.equals("Jamo")) {
286                         info2 = new Info(info, null, "ᄒ");
287                     }
288                     data.put(script2, info2);
289                 }
290             }
291             return true;
292         }
293 
294         @Override
process(Class<?> classLocation, String fileName)295         public MyFileReader process(Class<?> classLocation, String fileName) {
296             super.process(classLocation, fileName);
297             return this;
298         }
299 
getData()300         private Map<String, Info> getData() {
301             if (!errors.isEmpty()) {
302                 throw new RuntimeException(CollectionUtilities.join(errors, "\n\t"));
303             }
304             return Collections.unmodifiableMap(data);
305         }
306     }
307 
308     public enum Groupings {
309         EUROPEAN("150"),
310         MIDDLE_EASTERN("145"),
311         CENTRAL_ASIAN("143"),
312         SOUTH_ASIAN("034"),
313         SOUTHEAST_ASIAN("035"),
314         EAST_ASIAN("030"),
315         AFRICAN("002"),
316         AMERICAN("019"),;
317         public final Set<String> scripts;
318 
Groupings(String... regions)319         private Groupings(String... regions) {
320             scripts = With
321                 .in(getScripts())
322                 .toUnmodifiableCollection(
323                     new ScriptMetadata.RegionFilter(regions), new TreeSet<String>());
324         }
325     }
326 
327     static class RegionFilter implements com.ibm.icu.text.Transform<String, String> {
328         final String[] containingRegion;
329 
RegionFilter(String... containingRegion)330         RegionFilter(String... containingRegion) {
331             this.containingRegion = containingRegion;
332         }
333 
334         @Override
transform(String script)335         public String transform(String script) {
336             String currentRegion = getInfo(script).originCountry;
337             while (true) {
338                 for (String s : containingRegion) {
339                     if (s.equals(currentRegion)) {
340                         return script;
341                     }
342                 }
343                 if (currentRegion.equals("001") || currentRegion.equals("ZZ")) {
344                     return null;
345                 }
346                 currentRegion = Containment.getContainer(currentRegion);
347             }
348         }
349     }
350 
351     static Relation<String, String> EXTRAS = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
352     static {
353         EXTRAS.put("Hani", "Hans");
354         EXTRAS.put("Hani", "Hant");
355         EXTRAS.put("Hani", "Hanb");
356         EXTRAS.put("Hang", "Kore");
357         EXTRAS.put("Hang", "Jamo");
358         EXTRAS.put("Hira", "Jpan");
EXTRAS.freeze()359         EXTRAS.freeze();
360     }
361     static final Map<String, Info> data = new MyFileReader()
362         .process(ScriptMetadata.class, DATA_FILE).getData();
363 
getInfo(String s)364     public static Info getInfo(String s) {
365         Info result = data.get(s);
366         if (result == null) {
367             try {
368                 String name2 = UScript.getShortName(UScript.getCodeFromName(s));
369                 result = data.get(name2);
370             } catch (Exception e) {
371             }
372         }
373         return result;
374     }
375 
getScripts()376     public static Set<String> getScripts() {
377         return data.keySet();
378     }
379 
getInfo(int i)380     public static Info getInfo(int i) {
381         return data.get(UScript.getShortName(i));
382     }
383 
iterable()384     public static Set<Entry<String, Info>> iterable() {
385         return data.entrySet();
386     }
387 
388     /**
389      * Specialized scripts
390      * @return
391      */
getExtras()392     public static Set<String> getExtras() {
393         return EXTRAS.values();
394     }
395 
396     public static Transform<String, String> TO_SHORT_SCRIPT = new Transform<String, String>() {
397         @Override
398         public String transform(String source) {
399             return UScript.getShortName(UScript.getCodeFromName(source));
400         }
401     };
402     public static Transform<String, String> TO_LONG_SCRIPT = new Transform<String, String>() {
403         @Override
404         public String transform(String source) {
405             return UScript.getName(UScript.getCodeFromName(source));
406         }
407     };
408 }
409