• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.draft;
2 
3 import com.google.common.base.Joiner;
4 import com.ibm.icu.impl.Relation;
5 import com.ibm.icu.lang.UScript;
6 import com.ibm.icu.text.Transform;
7 import com.ibm.icu.text.UTF16;
8 import com.ibm.icu.util.ICUException;
9 import com.ibm.icu.util.VersionInfo;
10 import java.util.Arrays;
11 import java.util.Collections;
12 import java.util.HashMap;
13 import java.util.HashSet;
14 import java.util.LinkedHashSet;
15 import java.util.List;
16 import java.util.Locale;
17 import java.util.Map;
18 import java.util.Map.Entry;
19 import java.util.Set;
20 import java.util.TreeSet;
21 import org.unicode.cldr.tool.CountryCodeConverter;
22 import org.unicode.cldr.util.CldrUtility;
23 import org.unicode.cldr.util.Containment;
24 import org.unicode.cldr.util.SemiFileReader;
25 import org.unicode.cldr.util.StandardCodes;
26 import org.unicode.cldr.util.StandardCodes.LstrType;
27 import org.unicode.cldr.util.Validity;
28 import org.unicode.cldr.util.Validity.Status;
29 import org.unicode.cldr.util.With;
30 
31 public class ScriptMetadata {
32     private static final int MAX_RANK = 33;
33     private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv";
34     private static final VersionInfo UNICODE_VERSION =
35             VersionInfo.getInstance(CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "16"));
36 
    // To get the data, go to the Script Metadata spreadsheet.
    // Download As Comma Separated Items into DATA_FILE.
    // Set the last string in the UNICODE_VERSION line above to the right Unicode version (for
    // a Unicode beta).
    // Run TestScriptMetadata.
    // Then run GenerateScriptMetadata.
    // See http://cldr.unicode.org/development/updating-codes/updating-script-metadata
44     private enum Column {
45         // must match the spreadsheet header (caseless compare) or have the alternate header as an
46         // argument.
47         // doesn't have to be in order
48         WR,
49         AGE,
50         SAMPLE_CODE,
51         ID_USAGE("ID Usage (UAX31)"),
52         RTL("RTL?"),
53         LB_LETTERS("LB letters?"),
54         SHAPING_REQ("Shaping Req?"),
55         IME("IME?"),
56         ORIGIN_COUNTRY("Origin Country"),
57         DENSITY("~Density"),
58         LANG_CODE,
59         HAS_CASE("Has Case?");
60 
61         int columnNumber = -1;
62         final Set<String> names = new HashSet<>();
63 
Column(String... alternateNames)64         Column(String... alternateNames) {
65             names.add(this.name());
66             for (String name : alternateNames) {
67                 names.add(name.toUpperCase(Locale.ENGLISH));
68             }
69         }
70 
setColumns(String[] headers)71         static void setColumns(String[] headers) {
72             for (int i = 0; i < headers.length; ++i) {
73                 String header = headers[i].toUpperCase(Locale.ENGLISH);
74                 for (Column v : values()) {
75                     if (v.names.contains(header)) {
76                         v.columnNumber = i;
77                     }
78                 }
79             }
80             for (Column v : values()) {
81                 if (v.columnNumber == -1) {
82                     throw new IllegalArgumentException(
83                             "Missing field for " + v + ", may need to add additional column alias");
84                 }
85             }
86         }
87 
getItem(String[] items)88         String getItem(String[] items) {
89             return items[columnNumber];
90         }
91 
getInt(String[] items, int defaultValue)92         int getInt(String[] items, int defaultValue) {
93             final String item = getItem(items);
94             return item.isEmpty() || item.equalsIgnoreCase("n/a")
95                     ? defaultValue
96                     : Integer.parseInt(item);
97         }
98     }
99 
100     public enum IdUsage {
101         UNKNOWN("Other"),
102         EXCLUSION("Historic"),
103         LIMITED_USE("Limited Use"),
104         ASPIRATIONAL("Aspirational"),
105         RECOMMENDED("Major Use");
106 
107         public final String name;
108 
IdUsage(String name)109         private IdUsage(String name) {
110             this.name = name;
111         }
112     }
113 
114     public enum Trinary {
115         UNKNOWN,
116         NO,
117         YES
118     }
119 
120     public enum Shaping {
121         UNKNOWN,
122         NO,
123         MIN,
124         YES
125     }
126 
127     static StandardCodes SC = StandardCodes.make();
128     static EnumLookup<Shaping> shapingLookup =
129             EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN);
130     static EnumLookup<Trinary> trinaryLookup =
131             EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN);
132     static EnumLookup<IdUsage> idUsageLookup =
133             EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN);
134 
addNameToCode(String type, Map<String, String> hashMap)135     public static void addNameToCode(String type, Map<String, String> hashMap) {
136         for (String language : SC.getAvailableCodes(type)) {
137             Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language);
138             String name = fullData.get("Description");
139             hashMap.put(name.toUpperCase(Locale.ENGLISH), language);
140         }
141     }
142 
143     public static final class SkipNewUnicodeException extends ICUException {}
144 
145     /**
146      * Scripts that either have no known languages as yet (Cpmn) or are used for any language
147      * (Brai).
148      */
149     public static final Set<String> SCRIPTS_WITH_NO_LANGUAGES = Set.of("Brai", "Cpmn");
150 
151     public static class Info implements Comparable<Info> {
152         public final int rank;
153         public final VersionInfo age;
154         public final String sampleChar;
155         public final IdUsage idUsage;
156         public final Trinary rtl;
157         public final Trinary lbLetters;
158         public final Trinary hasCase;
159         public final Shaping shapingReq;
160         public final Trinary ime;
161         public final int density;
162         public final String originCountry;
163         public final String likelyLanguage;
164 
Info(String[] items)165         private Info(String[] items) {
166             // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no
167             rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK);
168             age = VersionInfo.getInstance(Column.AGE.getItem(items));
169             if (age.compareTo(UNICODE_VERSION) > 0) {
170                 throw new SkipNewUnicodeException();
171             }
172             // Parse the code point of the sample character, rather than the sample character
173             // itself.
174             // The code point is more reliable, especially when the spreadsheet has a bug
175             // for supplementary characters.
176             int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16);
177             sampleChar = UTF16.valueOf(sampleCode);
178             idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items));
179             rtl = trinaryLookup.forString(Column.RTL.getItem(items));
180             lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items));
181             shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items));
182             ime = trinaryLookup.forString(Column.IME.getItem(items));
183             hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items));
184             density = Column.DENSITY.getInt(items, -1);
185             String script = items[2];
186 
187             final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items);
188             String country = CountryCodeConverter.getCodeFromName(countryRaw, false);
189             if (country == null) {
190                 // Give context when throwing an error. Because this is run in a static init
191                 // context, the stack trace is typically incorrect when something goes wrong.
192                 errors.add(
193                         "ScriptMetadata.java: Can't map "
194                                 + countryRaw
195                                 + " to country/region. Try updating external/alternate_country_names.txt");
196             }
197             originCountry = country == null ? "ZZ" : country;
198 
199             String langCode = Column.LANG_CODE.getItem(items);
200             if (langCode.equals("n/a")) {
201                 langCode = null;
202             }
203             likelyLanguage = langCode == null ? "und" : langCode;
204 
205             // check for bad countries, bad languages
206 
207             final Status scriptStatus =
208                     Validity.getInstance().getCodeToStatus(LstrType.script).get(script);
209             if (!(scriptStatus == Status.special || scriptStatus == Status.unknown)) {
210                 final Status countryStatus =
211                         Validity.getInstance().getCodeToStatus(LstrType.region).get(originCountry);
212                 if (countryStatus != Status.regular) {
213                     errors.add(
214                             "ScriptMetadata.java: the country ("
215                                     + originCountry
216                                     + ") for "
217                                     + script
218                                     + " is not valid: "
219                                     + countryStatus);
220                 }
221                 final Status languageStatus =
222                         Validity.getInstance()
223                                 .getCodeToStatus(LstrType.language)
224                                 .get(likelyLanguage);
225                 if (languageStatus != Status.regular
226                         // make exception for scripts that has no known languages
227                         && !SCRIPTS_WITH_NO_LANGUAGES.contains(script)) {
228                     errors.add(
229                             "ScriptMetadata.java: the likely language ("
230                                     + likelyLanguage
231                                     + ") for "
232                                     + script
233                                     + " is not valid: "
234                                     + languageStatus);
235                 }
236             }
237         }
238 
Info(Info other, String string, String sampleCharacter)239         public Info(Info other, String string, String sampleCharacter) {
240             rank = other.rank;
241             age = other.age;
242             sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter;
243             idUsage = other.idUsage;
244             rtl = other.rtl;
245             lbLetters = other.lbLetters;
246             hasCase = other.hasCase;
247             shapingReq = other.shapingReq;
248             ime = "IME:YES".equals(string) ? Trinary.YES : other.ime;
249             density = other.density;
250             originCountry = other.originCountry;
251             likelyLanguage = other.likelyLanguage;
252         }
253 
254         // public Trinary parseTrinary(Column title, String[] items) {
255         // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH));
256         // }
fix(String in)257         String fix(String in) {
258             return in.toUpperCase(Locale.ENGLISH)
259                     .replace("N/A", "UNKNOWN")
260                     .replace("?", "UNKNOWN")
261                     .replace("RTL", "YES");
262         }
263 
264         @Override
toString()265         public String toString() {
266             return rank
267                     + "\tSample: "
268                     + sampleChar
269                     + "\tCountry: "
270                     + getName("territory", originCountry)
271                     + " ("
272                     + originCountry
273                     + ")"
274                     + "\tLanguage: "
275                     + getName("language", likelyLanguage)
276                     + " ("
277                     + likelyLanguage
278                     + ")"
279                     + "\tId: "
280                     + idUsage
281                     + "\tRtl: "
282                     + rtl
283                     + "\tLb: "
284                     + lbLetters
285                     + "\tShape: "
286                     + shapingReq
287                     + "\tIme: "
288                     + ime
289                     + "\tCase: "
290                     + hasCase
291                     + "\tDensity: "
292                     + density;
293         }
294 
getName(String type, String code)295         public Object getName(String type, String code) {
296             List<String> fullData = SC.getFullData(type, code);
297             if (fullData == null) {
298                 return "unavailable";
299             }
300             return fullData.get(0);
301         }
302 
303         @Override
compareTo(Info o)304         public int compareTo(Info o) {
305             // we don't actually care what the comparison value is, as long as it is transitive and
306             // consistent with equals.
307             return toString().compareTo(o.toString());
308         }
309     }
310 
311     public static Set<String> errors = new LinkedHashSet<>();
312     static HashMap<String, Integer> titleToColumn = new HashMap<>();
313 
314     private static class MyFileReader extends SemiFileReader {
315         private Map<String, Info> data = new HashMap<>();
316 
317         @Override
isCodePoint()318         protected boolean isCodePoint() {
319             return false;
320         }
321 
322         @Override
splitLine(String line)323         protected String[] splitLine(String line) {
324             return CldrUtility.splitCommaSeparated(line);
325         }
326 
327         @Override
handleLine(int lineCount, int start, int end, String[] items)328         protected boolean handleLine(int lineCount, int start, int end, String[] items) {
329             if (items[0].startsWith("For help") || items[0].isEmpty()) {
330                 return true; // header lines
331             }
332             if (items[0].equals("WR")) {
333                 Column.setColumns(items);
334                 return true;
335             }
336             Info info;
337             try {
338                 info = new Info(items);
339             } catch (SkipNewUnicodeException e) {
340                 return true;
341             } catch (Exception e) {
342                 errors.add(
343                         e.getClass().getName()
344                                 + "\t"
345                                 + e.getMessage()
346                                 + "\t"
347                                 + Arrays.asList(items));
348                 return true;
349             }
350 
351             String script = items[2];
352             data.put(script, info);
353             Set<String> extras = EXTRAS.get(script);
354             if (extras != null) {
355                 for (String script2 : extras) {
356                     Info info2 = info;
357                     if (script2.equals("Jpan")) {
358                         // HACK
359                         info2 = new Info(info, "IME:YES", null);
360                     } else if (script2.equals("Jamo")) {
361                         info2 = new Info(info, null, "ᄒ");
362                     }
363                     data.put(script2, info2);
364                 }
365             }
366             return true;
367         }
368 
369         @Override
process(Class<?> classLocation, String fileName)370         public MyFileReader process(Class<?> classLocation, String fileName) {
371             super.process(classLocation, fileName);
372             return this;
373         }
374 
getData()375         private Map<String, Info> getData() {
376             if (!errors.isEmpty()) {
377                 throw new RuntimeException(Joiner.on("\n\t").join(errors));
378             }
379             return Collections.unmodifiableMap(data);
380         }
381     }
382 
383     public enum Groupings {
384         EUROPEAN("150"),
385         MIDDLE_EASTERN("145"),
386         CENTRAL_ASIAN("143"),
387         SOUTH_ASIAN("034"),
388         SOUTHEAST_ASIAN("035"),
389         EAST_ASIAN("030"),
390         AFRICAN("002"),
391         AMERICAN("019"),
392         ;
393         public final Set<String> scripts;
394 
Groupings(String... regions)395         private Groupings(String... regions) {
396             scripts =
397                     With.in(getScripts())
398                             .toUnmodifiableCollection(
399                                     new ScriptMetadata.RegionFilter(regions),
400                                     new TreeSet<String>());
401         }
402     }
403 
404     static class RegionFilter implements com.ibm.icu.text.Transform<String, String> {
405         final String[] containingRegion;
406 
RegionFilter(String... containingRegion)407         RegionFilter(String... containingRegion) {
408             this.containingRegion = containingRegion;
409         }
410 
411         @Override
transform(String script)412         public String transform(String script) {
413             String currentRegion = getInfo(script).originCountry;
414             while (true) {
415                 for (String s : containingRegion) {
416                     if (s.equals(currentRegion)) {
417                         return script;
418                     }
419                 }
420                 if (currentRegion.equals("001") || currentRegion.equals("ZZ")) {
421                     return null;
422                 }
423                 currentRegion = Containment.getContainer(currentRegion);
424             }
425         }
426     }
427 
428     static Relation<String, String> EXTRAS =
429             Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
430 
431     static {
432         EXTRAS.put("Hani", "Hans");
433         EXTRAS.put("Hani", "Hant");
434         EXTRAS.put("Hani", "Hanb");
435         EXTRAS.put("Hang", "Kore");
436         EXTRAS.put("Hang", "Jamo");
437         EXTRAS.put("Hira", "Jpan");
EXTRAS.freeze()438         EXTRAS.freeze();
439     }
440 
441     static final Map<String, Info> data =
442             new MyFileReader().process(ScriptMetadata.class, DATA_FILE).getData();
443 
getInfo(String s)444     public static Info getInfo(String s) {
445         Info result = data.get(s);
446         if (result == null) {
447             try {
448                 String name2 = UScript.getShortName(UScript.getCodeFromName(s));
449                 result = data.get(name2);
450             } catch (Exception e) {
451             }
452         }
453         return result;
454     }
455 
getScripts()456     public static Set<String> getScripts() {
457         return data.keySet();
458     }
459 
getInfo(int i)460     public static Info getInfo(int i) {
461         return data.get(UScript.getShortName(i));
462     }
463 
iterable()464     public static Set<Entry<String, Info>> iterable() {
465         return data.entrySet();
466     }
467 
468     /**
469      * Specialized scripts
470      *
471      * @return
472      */
getExtras()473     public static Set<String> getExtras() {
474         return EXTRAS.values();
475     }
476 
477     public static Transform<String, String> TO_SHORT_SCRIPT =
478             new Transform<>() {
479                 @Override
480                 public String transform(String source) {
481                     return UScript.getShortName(UScript.getCodeFromName(source));
482                 }
483             };
484     public static Transform<String, String> TO_LONG_SCRIPT =
485             new Transform<>() {
486                 @Override
487                 public String transform(String source) {
488                     return UScript.getName(UScript.getCodeFromName(source));
489                 }
490             };
491 }
492