• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.draft;
2 
3 import java.util.Arrays;
4 import java.util.Collections;
5 import java.util.HashMap;
6 import java.util.HashSet;
7 import java.util.LinkedHashSet;
8 import java.util.List;
9 import java.util.Locale;
10 import java.util.Map;
11 import java.util.Map.Entry;
12 import java.util.Set;
13 import java.util.TreeSet;
14 
15 import org.unicode.cldr.tool.CountryCodeConverter;
16 import org.unicode.cldr.util.CldrUtility;
17 import org.unicode.cldr.util.Containment;
18 import org.unicode.cldr.util.SemiFileReader;
19 import org.unicode.cldr.util.StandardCodes;
20 import org.unicode.cldr.util.With;
21 
22 import com.google.common.base.Joiner;
23 import com.ibm.icu.impl.Relation;
24 import com.ibm.icu.lang.UScript;
25 import com.ibm.icu.text.Transform;
26 import com.ibm.icu.text.UTF16;
27 import com.ibm.icu.util.ICUException;
28 import com.ibm.icu.util.VersionInfo;
29 
30 public class ScriptMetadata {
31     private static final int MAX_RANK = 33;
32     private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv";
33     private static final VersionInfo UNICODE_VERSION = VersionInfo.getInstance(
34         CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "13"));
35 
// To get the data, go to the Script MetaData spreadsheet.
// Download As Comma Separated Items into DATA_FILE.
// Set the last string in the UNICODE_VERSION line above to the right Unicode Version (for Unicode beta).
// Run TestScriptMetadata.
// Then run GenerateScriptMetadata.
// See http://cldr.unicode.org/development/updating-codes/updating-script-metadata
42     private enum Column {
43         // must match the spreadsheet header (caseless compare) or have the alternate header as an argument.
44         // doesn't have to be in order
45         WR, AGE, SAMPLE_CODE, ID_USAGE("ID Usage (UAX31)"), RTL("RTL?"), LB_LETTERS("LB letters?"), SHAPING_REQ("Shaping Req?"), IME("IME?"), ORIGIN_COUNTRY(
46             "Origin Country"), DENSITY("~Density"), LANG_CODE, HAS_CASE("Has Case?");
47 
48         int columnNumber = -1;
49         final Set<String> names = new HashSet<>();
50 
Column(String... alternateNames)51         Column(String... alternateNames) {
52             names.add(this.name());
53             for (String name : alternateNames) {
54                 names.add(name.toUpperCase(Locale.ENGLISH));
55             }
56         }
57 
setColumns(String[] headers)58         static void setColumns(String[] headers) {
59             for (int i = 0; i < headers.length; ++i) {
60                 String header = headers[i].toUpperCase(Locale.ENGLISH);
61                 for (Column v : values()) {
62                     if (v.names.contains(header)) {
63                         v.columnNumber = i;
64                     }
65                 }
66             }
67             for (Column v : values()) {
68                 if (v.columnNumber == -1) {
69                     throw new IllegalArgumentException("Missing field for " + v
70                         + ", may need to add additional column alias");
71                 }
72             }
73         }
74 
getItem(String[] items)75         String getItem(String[] items) {
76             return items[columnNumber];
77         }
78 
getInt(String[] items, int defaultValue)79         int getInt(String[] items, int defaultValue) {
80             final String item = getItem(items);
81             return item.isEmpty() || item.equalsIgnoreCase("n/a") ? defaultValue : Integer.parseInt(item);
82         }
83     }
84 
/**
 * Identifier-usage category of a script, parsed from the ID_USAGE spreadsheet column.
 * Each constant carries a display label in {@link #name}.
 */
public enum IdUsage {
    UNKNOWN("Other"),
    EXCLUSION("Historic"),
    LIMITED_USE("Limited Use"),
    ASPIRATIONAL("Aspirational"),
    RECOMMENDED("Major Use");

    /** Display label attached to this constant (not the enum constant name). */
    public final String name;

    // Enum constructors are implicitly private.
    IdUsage(String label) {
        name = label;
    }
}
94 
/** Three-valued answer used for the yes/no spreadsheet columns (RTL, LB letters, IME, case). */
public enum Trinary {
    UNKNOWN,
    NO,
    YES
}
98 
/** Value of the "Shaping Req?" spreadsheet column. */
public enum Shaping {
    UNKNOWN,
    NO,
    MIN,
    YES
}
102 
103     static StandardCodes SC = StandardCodes.make();
104     // static HashMap<String,String> NAME_TO_REGION_CODE = new HashMap<String,String>();
105     // static HashMap<String,String> NAME_TO_LANGUAGE_CODE = new HashMap<String,String>();
106     static EnumLookup<Shaping> shapingLookup = EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN);
107     static EnumLookup<Trinary> trinaryLookup = EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN);
108     static EnumLookup<IdUsage> idUsageLookup = EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN);
109     static {
110         // addNameToCode("language", NAME_TO_LANGUAGE_CODE);
111         // // NAME_TO_LANGUAGE_CODE.put("", "und");
112         // NAME_TO_LANGUAGE_CODE.put("N/A", "und");
113         // addSynonym(NAME_TO_LANGUAGE_CODE, "Ancient Greek", "Ancient Greek (to 1453)");
114         // //addSynonym(NAME_TO_LANGUAGE_CODE, "Khmer", "Cambodian");
115         // addSynonym(NAME_TO_LANGUAGE_CODE, "Old Irish", "Old Irish (to 900)");
116 
117         // addNameToCode("region", NAME_TO_REGION_CODE);
118         // // NAME_TO_REGION_CODE.put("UNKNOWN", "ZZ");
119         // // NAME_TO_REGION_CODE.put("", "ZZ");
120         // NAME_TO_REGION_CODE.put("N/A", "ZZ");
121         // addSynonym(NAME_TO_REGION_CODE, "Laos", "Lao People's Democratic Republic");
122     }
123 
addNameToCode(String type, Map<String, String> hashMap)124     public static void addNameToCode(String type, Map<String, String> hashMap) {
125         for (String language : SC.getAvailableCodes(type)) {
126             Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language);
127             String name = fullData.get("Description");
128             hashMap.put(name.toUpperCase(Locale.ENGLISH), language);
129         }
130     }
131 
addSynonym(Map<String, String> map, String newTerm, String oldTerm)132     public static void addSynonym(Map<String, String> map, String newTerm, String oldTerm) {
133         String code = map.get(oldTerm.toUpperCase(Locale.ENGLISH));
134         map.put(newTerm.toUpperCase(Locale.ENGLISH), code);
135     }
136 
137     public static final class SkipNewUnicodeException extends ICUException {
138     }
139 
140     public static class Info implements Comparable<Info> {
141         public final int rank;
142         public final VersionInfo age;
143         public final String sampleChar;
144         public final IdUsage idUsage;
145         public final Trinary rtl;
146         public final Trinary lbLetters;
147         public final Trinary hasCase;
148         public final Shaping shapingReq;
149         public final Trinary ime;
150         public final int density;
151         public final String originCountry;
152         public final String likelyLanguage;
153 
Info(String[] items)154         private Info(String[] items) {
155             // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no
156             rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK);
157             age = VersionInfo.getInstance(Column.AGE.getItem(items));
158             if (age.compareTo(UNICODE_VERSION) > 0) {
159                 throw new SkipNewUnicodeException();
160             }
161             // Parse the code point of the sample character, rather than the sample character itself.
162             // The code point is more reliable, especially when the spreadsheet has a bug
163             // for supplementary characters.
164             int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16);
165             sampleChar = UTF16.valueOf(sampleCode);
166             idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items));
167             rtl = trinaryLookup.forString(Column.RTL.getItem(items));
168             lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items));
169             shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items));
170             ime = trinaryLookup.forString(Column.IME.getItem(items));
171             hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items));
172             density = Column.DENSITY.getInt(items, -1);
173 
174             final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items);
175             String country = CountryCodeConverter.getCodeFromName(countryRaw, false);
176             // NAME_TO_REGION_CODE.get(countryRaw.toUpperCase(Locale.ENGLISH));
177             if (country == null) {
178                 errors.add("Can't map " + countryRaw + " to country/region");
179             }
180             originCountry = country == null ? "ZZ" : country;
181 
182             String langCode = Column.LANG_CODE.getItem(items);
183             if (langCode.equals("n/a")) {
184                 langCode = null;
185             }
186             likelyLanguage = langCode == null ? "und" : langCode;
187         }
188 
Info(Info other, String string, String sampleCharacter)189         public Info(Info other, String string, String sampleCharacter) {
190             rank = other.rank;
191             age = other.age;
192             sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter;
193             idUsage = other.idUsage;
194             rtl = other.rtl;
195             lbLetters = other.lbLetters;
196             hasCase = other.hasCase;
197             shapingReq = other.shapingReq;
198             ime = "IME:YES".equals(string) ? Trinary.YES : other.ime;
199             density = other.density;
200             originCountry = other.originCountry;
201             likelyLanguage = other.likelyLanguage;
202         }
203 
204         // public Trinary parseTrinary(Column title, String[] items) {
205         // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH));
206         // }
fix(String in)207         String fix(String in) {
208             return in.toUpperCase(Locale.ENGLISH).replace("N/A", "UNKNOWN").replace("?", "UNKNOWN")
209                 .replace("RTL", "YES");
210         }
211 
212         @Override
toString()213         public String toString() {
214             return rank
215                 + "\tSample: " + sampleChar
216                 + "\tCountry: " + getName("territory", originCountry) + " (" + originCountry + ")"
217                 + "\tLanguage: " + getName("language", likelyLanguage) + " (" + likelyLanguage + ")"
218                 + "\tId: " + idUsage
219                 + "\tRtl: " + rtl
220                 + "\tLb: " + lbLetters
221                 + "\tShape: " + shapingReq
222                 + "\tIme: " + ime
223                 + "\tCase: " + hasCase
224                 + "\tDensity: " + density;
225         }
226 
getName(String type, String code)227         public Object getName(String type, String code) {
228             List<String> fullData = SC.getFullData(type, code);
229             if (fullData == null) {
230                 return "unavailable";
231             }
232             return fullData.get(0);
233         }
234 
235         @Override
compareTo(Info o)236         public int compareTo(Info o) {
237             // we don't actually care what the comparison value is, as long as it is transitive and consistent with equals.
238             return toString().compareTo(o.toString());
239         }
240     }
241 
242     public static Set<String> errors = new LinkedHashSet<>();
243     static HashMap<String, Integer> titleToColumn = new HashMap<>();
244 
245     private static class MyFileReader extends SemiFileReader {
246         private Map<String, Info> data = new HashMap<>();
247 
248         @Override
isCodePoint()249         protected boolean isCodePoint() {
250             return false;
251         }
252 
253         @Override
splitLine(String line)254         protected String[] splitLine(String line) {
255             return CldrUtility.splitCommaSeparated(line);
256         }
257 
258         @Override
handleLine(int lineCount, int start, int end, String[] items)259         protected boolean handleLine(int lineCount, int start, int end, String[] items) {
260             if (items[0].startsWith("For help") || items[0].isEmpty()) {
261                 return true; // header lines
262             }
263             if (items[0].equals("WR")) {
264                 Column.setColumns(items);
265                 return true;
266             }
267             Info info;
268             try {
269                 info = new Info(items);
270             } catch (SkipNewUnicodeException e) {
271                 return true;
272             } catch (Exception e) {
273                 errors.add(e.getClass().getName() + "\t" + e.getMessage() + "\t" + Arrays.asList(items));
274                 return true;
275             }
276 
277             String script = items[2];
278             data.put(script, info);
279             Set<String> extras = EXTRAS.get(script);
280             if (extras != null) {
281                 for (String script2 : extras) {
282                     Info info2 = info;
283                     if (script2.equals("Jpan")) {
284                         // HACK
285                         info2 = new Info(info, "IME:YES", null);
286                     } else if (script2.equals("Jamo")) {
287                         info2 = new Info(info, null, "ᄒ");
288                     }
289                     data.put(script2, info2);
290                 }
291             }
292             return true;
293         }
294 
295         @Override
process(Class<?> classLocation, String fileName)296         public MyFileReader process(Class<?> classLocation, String fileName) {
297             super.process(classLocation, fileName);
298             return this;
299         }
300 
getData()301         private Map<String, Info> getData() {
302             if (!errors.isEmpty()) {
303                 throw new RuntimeException(Joiner.on("\n\t").join(errors));
304             }
305             return Collections.unmodifiableMap(data);
306         }
307     }
308 
309     public enum Groupings {
310         EUROPEAN("150"),
311         MIDDLE_EASTERN("145"),
312         CENTRAL_ASIAN("143"),
313         SOUTH_ASIAN("034"),
314         SOUTHEAST_ASIAN("035"),
315         EAST_ASIAN("030"),
316         AFRICAN("002"),
317         AMERICAN("019"),;
318         public final Set<String> scripts;
319 
Groupings(String... regions)320         private Groupings(String... regions) {
321             scripts = With
322                 .in(getScripts())
323                 .toUnmodifiableCollection(
324                     new ScriptMetadata.RegionFilter(regions), new TreeSet<String>());
325         }
326     }
327 
328     static class RegionFilter implements com.ibm.icu.text.Transform<String, String> {
329         final String[] containingRegion;
330 
RegionFilter(String... containingRegion)331         RegionFilter(String... containingRegion) {
332             this.containingRegion = containingRegion;
333         }
334 
335         @Override
transform(String script)336         public String transform(String script) {
337             String currentRegion = getInfo(script).originCountry;
338             while (true) {
339                 for (String s : containingRegion) {
340                     if (s.equals(currentRegion)) {
341                         return script;
342                     }
343                 }
344                 if (currentRegion.equals("001") || currentRegion.equals("ZZ")) {
345                     return null;
346                 }
347                 currentRegion = Containment.getContainer(currentRegion);
348             }
349         }
350     }
351 
/**
 * Derived script codes, keyed by the script code whose metadata row they are built from.
 * Frozen once populated.
 */
static Relation<String, String> EXTRAS = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);

/** Script metadata by script code, read once from DATA_FILE when the class is initialized. */
static final Map<String, Info> data;

static {
    for (String derived : new String[] { "Hans", "Hant", "Hanb" }) {
        EXTRAS.put("Hani", derived);
    }
    for (String derived : new String[] { "Kore", "Jamo" }) {
        EXTRAS.put("Hang", derived);
    }
    EXTRAS.put("Hira", "Jpan");
    EXTRAS.freeze();

    // EXTRAS must be complete before the file is read: the reader consults it per row.
    final MyFileReader reader = new MyFileReader();
    reader.process(ScriptMetadata.class, DATA_FILE);
    data = reader.getData();
}
364 
getInfo(String s)365     public static Info getInfo(String s) {
366         Info result = data.get(s);
367         if (result == null) {
368             try {
369                 String name2 = UScript.getShortName(UScript.getCodeFromName(s));
370                 result = data.get(name2);
371             } catch (Exception e) {
372             }
373         }
374         return result;
375     }
376 
getScripts()377     public static Set<String> getScripts() {
378         return data.keySet();
379     }
380 
getInfo(int i)381     public static Info getInfo(int i) {
382         return data.get(UScript.getShortName(i));
383     }
384 
iterable()385     public static Set<Entry<String, Info>> iterable() {
386         return data.entrySet();
387     }
388 
389     /**
390      * Specialized scripts
391      * @return
392      */
getExtras()393     public static Set<String> getExtras() {
394         return EXTRAS.values();
395     }
396 
397     public static Transform<String, String> TO_SHORT_SCRIPT = new Transform<String, String>() {
398         @Override
399         public String transform(String source) {
400             return UScript.getShortName(UScript.getCodeFromName(source));
401         }
402     };
403     public static Transform<String, String> TO_LONG_SCRIPT = new Transform<String, String>() {
404         @Override
405         public String transform(String source) {
406             return UScript.getName(UScript.getCodeFromName(source));
407         }
408     };
409 }
410