1 package org.unicode.cldr.draft; 2 3 import java.util.Arrays; 4 import java.util.Collections; 5 import java.util.HashMap; 6 import java.util.HashSet; 7 import java.util.LinkedHashSet; 8 import java.util.List; 9 import java.util.Locale; 10 import java.util.Map; 11 import java.util.Map.Entry; 12 import java.util.Set; 13 import java.util.TreeSet; 14 15 import org.unicode.cldr.tool.CountryCodeConverter; 16 import org.unicode.cldr.util.CldrUtility; 17 import org.unicode.cldr.util.Containment; 18 import org.unicode.cldr.util.SemiFileReader; 19 import org.unicode.cldr.util.StandardCodes; 20 import org.unicode.cldr.util.With; 21 22 import com.ibm.icu.dev.util.CollectionUtilities; 23 import com.ibm.icu.impl.Relation; 24 import com.ibm.icu.lang.UScript; 25 import com.ibm.icu.text.Transform; 26 import com.ibm.icu.text.UTF16; 27 import com.ibm.icu.util.ICUException; 28 import com.ibm.icu.util.VersionInfo; 29 30 public class ScriptMetadata { 31 private static final int MAX_RANK = 33; 32 private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv"; 33 private static final VersionInfo UNICODE_VERSION = VersionInfo.getInstance(CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "11")); 34 35 // To get the data, go do the Script MetaData spreadsheet 36 // Download As Comma Separated Items into DATA_FILE 37 // Set the last string in the UNICODE_VERSION line above to the right Unicode Version (for Unicode beta). 38 // Run TestScriptMetadata. 39 // Then run GenerateScriptMetadata. 40 // See http://cldr.unicode.org/development/updating-codes/updating-script-metadata 41 private enum Column { 42 // must match the spreadsheet header (caseless compare) or have the alternate header as an argument. 43 // doesn't have to be in order 44 WR, AGE, SAMPLE_CODE, ID_USAGE("ID Usage (UAX31)"), RTL("RTL?"), LB_LETTERS("LB letters?"), SHAPING_REQ("Shaping Req?"), IME("IME?"), ORIGIN_COUNTRY( 45 "Origin Country"), DENSITY("~Density"), LANG_CODE, HAS_CASE("Has Case?"); 46 47 int columnNumber = -1; 48 final Set<String> names = new HashSet<String>(); 49 Column(String... alternateNames)50 Column(String... alternateNames) { 51 names.add(this.name()); 52 for (String name : alternateNames) { 53 names.add(name.toUpperCase(Locale.ENGLISH)); 54 } 55 } 56 setColumns(String[] headers)57 static void setColumns(String[] headers) { 58 for (int i = 0; i < headers.length; ++i) { 59 String header = headers[i].toUpperCase(Locale.ENGLISH); 60 for (Column v : values()) { 61 if (v.names.contains(header)) { 62 v.columnNumber = i; 63 } 64 } 65 } 66 for (Column v : values()) { 67 if (v.columnNumber == -1) { 68 throw new IllegalArgumentException("Missing field for " + v 69 + ", may need to add additional column alias"); 70 } 71 } 72 } 73 getItem(String[] items)74 String getItem(String[] items) { 75 return items[columnNumber]; 76 } 77 getInt(String[] items, int defaultValue)78 int getInt(String[] items, int defaultValue) { 79 final String item = getItem(items); 80 return item.isEmpty() || item.equalsIgnoreCase("n/a") ? defaultValue : Integer.parseInt(item); 81 } 82 } 83 84 public enum IdUsage { 85 UNKNOWN("Other"), EXCLUSION("Historic"), LIMITED_USE("Limited Use"), ASPIRATIONAL("Aspirational"), RECOMMENDED("Major Use"); 86 87 public final String name; 88 IdUsage(String name)89 private IdUsage(String name) { 90 this.name = name; 91 } 92 } 93 94 public enum Trinary { 95 UNKNOWN, NO, YES 96 } 97 98 public enum Shaping { 99 UNKNOWN, NO, MIN, YES 100 } 101 102 static StandardCodes SC = StandardCodes.make(); 103 // static HashMap<String,String> NAME_TO_REGION_CODE = new HashMap<String,String>(); 104 // static HashMap<String,String> NAME_TO_LANGUAGE_CODE = new HashMap<String,String>(); 105 static EnumLookup<Shaping> shapingLookup = EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN); 106 static EnumLookup<Trinary> trinaryLookup = EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN); 107 static EnumLookup<IdUsage> idUsageLookup = EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN); 108 static { 109 // addNameToCode("language", NAME_TO_LANGUAGE_CODE); 110 // // NAME_TO_LANGUAGE_CODE.put("", "und"); 111 // NAME_TO_LANGUAGE_CODE.put("N/A", "und"); 112 // addSynonym(NAME_TO_LANGUAGE_CODE, "Ancient Greek", "Ancient Greek (to 1453)"); 113 // //addSynonym(NAME_TO_LANGUAGE_CODE, "Khmer", "Cambodian"); 114 // addSynonym(NAME_TO_LANGUAGE_CODE, "Old Irish", "Old Irish (to 900)"); 115 116 // addNameToCode("region", NAME_TO_REGION_CODE); 117 // // NAME_TO_REGION_CODE.put("UNKNOWN", "ZZ"); 118 // // NAME_TO_REGION_CODE.put("", "ZZ"); 119 // NAME_TO_REGION_CODE.put("N/A", "ZZ"); 120 // addSynonym(NAME_TO_REGION_CODE, "Laos", "Lao People's Democratic Republic"); 121 } 122 addNameToCode(String type, Map<String, String> hashMap)123 public static void addNameToCode(String type, Map<String, String> hashMap) { 124 for (String language : SC.getAvailableCodes(type)) { 125 Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language); 126 String name = fullData.get("Description"); 127 hashMap.put(name.toUpperCase(Locale.ENGLISH), language); 128 } 129 } 130 addSynonym(Map<String, String> map, String newTerm, String oldTerm)131 public static void addSynonym(Map<String, String> map, String newTerm, String oldTerm) { 132 String code = map.get(oldTerm.toUpperCase(Locale.ENGLISH)); 133 map.put(newTerm.toUpperCase(Locale.ENGLISH), code); 134 } 135 136 public static final class SkipNewUnicodeException extends ICUException { 137 } 138 139 public static class Info implements Comparable<Info> { 140 public final int rank; 141 public final VersionInfo age; 142 public final String sampleChar; 143 public final IdUsage idUsage; 144 public final Trinary rtl; 145 public final Trinary lbLetters; 146 public final Trinary hasCase; 147 public final Shaping shapingReq; 148 public final Trinary ime; 149 public final int density; 150 public final String originCountry; 151 public final String likelyLanguage; 152 Info(String[] items)153 private Info(String[] items) { 154 // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no 155 rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK); 156 age = VersionInfo.getInstance(Column.AGE.getItem(items)); 157 if (age.compareTo(UNICODE_VERSION) > 0) { 158 throw new SkipNewUnicodeException(); 159 } 160 // Parse the code point of the sample character, rather than the sample character itself. 161 // The code point is more reliable, especially when the spreadsheet has a bug 162 // for supplementary characters. 163 int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16); 164 sampleChar = UTF16.valueOf(sampleCode); 165 idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items)); 166 rtl = trinaryLookup.forString(Column.RTL.getItem(items)); 167 lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items)); 168 shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items)); 169 ime = trinaryLookup.forString(Column.IME.getItem(items)); 170 hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items)); 171 density = Column.DENSITY.getInt(items, -1); 172 173 final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items); 174 String country = CountryCodeConverter.getCodeFromName(countryRaw); 175 // NAME_TO_REGION_CODE.get(countryRaw.toUpperCase(Locale.ENGLISH)); 176 if (country == null) { 177 errors.add("Can't map " + countryRaw + " to country/region"); 178 } 179 originCountry = country == null ? "ZZ" : country; 180 181 String langCode = Column.LANG_CODE.getItem(items); 182 if (langCode.equals("n/a")) { 183 langCode = null; 184 } 185 likelyLanguage = langCode == null ? "und" : langCode; 186 } 187 Info(Info other, String string, String sampleCharacter)188 public Info(Info other, String string, String sampleCharacter) { 189 rank = other.rank; 190 age = other.age; 191 sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter; 192 idUsage = other.idUsage; 193 rtl = other.rtl; 194 lbLetters = other.lbLetters; 195 hasCase = other.hasCase; 196 shapingReq = other.shapingReq; 197 ime = "IME:YES".equals(string) ? Trinary.YES : other.ime; 198 density = other.density; 199 originCountry = other.originCountry; 200 likelyLanguage = other.likelyLanguage; 201 } 202 203 // public Trinary parseTrinary(Column title, String[] items) { 204 // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH)); 205 // } fix(String in)206 String fix(String in) { 207 return in.toUpperCase(Locale.ENGLISH).replace("N/A", "UNKNOWN").replace("?", "UNKNOWN") 208 .replace("RTL", "YES"); 209 } 210 toString()211 public String toString() { 212 return rank 213 + "\tSample: " + sampleChar 214 + "\tCountry: " + getName("territory", originCountry) + " (" + originCountry + ")" 215 + "\tLanguage: " + getName("language", likelyLanguage) + " (" + likelyLanguage + ")" 216 + "\tId: " + idUsage 217 + "\tRtl: " + rtl 218 + "\tLb: " + lbLetters 219 + "\tShape: " + shapingReq 220 + "\tIme: " + ime 221 + "\tCase: " + hasCase 222 + "\tDensity: " + density; 223 } 224 getName(String type, String code)225 public Object getName(String type, String code) { 226 List<String> fullData = SC.getFullData(type, code); 227 if (fullData == null) { 228 return "unavailable"; 229 } 230 return fullData.get(0); 231 } 232 233 @Override compareTo(Info o)234 public int compareTo(Info o) { 235 // we don't actually care what the comparison value is, as long as it is transitive and consistent with equals. 236 return toString().compareTo(o.toString()); 237 } 238 } 239 240 public static Set<String> errors = new LinkedHashSet<String>(); 241 static HashMap<String, Integer> titleToColumn = new HashMap<String, Integer>(); 242 243 private static class MyFileReader extends SemiFileReader { 244 private Map<String, Info> data = new HashMap<String, Info>(); 245 246 @Override isCodePoint()247 protected boolean isCodePoint() { 248 return false; 249 } 250 251 @Override splitLine(String line)252 protected String[] splitLine(String line) { 253 return CldrUtility.splitCommaSeparated(line); 254 }; 255 256 @Override handleLine(int lineCount, int start, int end, String[] items)257 protected boolean handleLine(int lineCount, int start, int end, String[] items) { 258 if (items[0].startsWith("For help") || items[0].isEmpty()) { 259 return true; // header lines 260 } 261 if (items[0].equals("WR")) { 262 Column.setColumns(items); 263 return true; 264 } 265 Info info; 266 try { 267 info = new Info(items); 268 } catch (SkipNewUnicodeException e) { 269 return true; 270 } catch (Exception e) { 271 errors.add(e.getClass().getName() + "\t" + e.getMessage() + "\t" + Arrays.asList(items)); 272 return true; 273 } 274 275 String script = items[2]; 276 data.put(script, info); 277 Set<String> extras = EXTRAS.get(script); 278 if (extras != null) { 279 for (String script2 : extras) { 280 Info info2 = info; 281 if (script2.equals("Jpan")) { 282 // HACK 283 info2 = new Info(info, "IME:YES", null); 284 } else if (script2.equals("Jamo")) { 285 info2 = new Info(info, null, "ᄒ"); 286 } 287 data.put(script2, info2); 288 } 289 } 290 return true; 291 } 292 293 @Override process(Class<?> classLocation, String fileName)294 public MyFileReader process(Class<?> classLocation, String fileName) { 295 super.process(classLocation, fileName); 296 return this; 297 } 298 getData()299 private Map<String, Info> getData() { 300 if (!errors.isEmpty()) { 301 throw new RuntimeException(CollectionUtilities.join(errors, "\n\t")); 302 } 303 return Collections.unmodifiableMap(data); 304 } 305 } 306 307 public enum Groupings { 308 EUROPEAN("150"), 309 MIDDLE_EASTERN("145"), 310 CENTRAL_ASIAN("143"), 311 SOUTH_ASIAN("034"), 312 SOUTHEAST_ASIAN("035"), 313 EAST_ASIAN("030"), 314 AFRICAN("002"), 315 AMERICAN("019"),; 316 public final Set<String> scripts; 317 Groupings(String... regions)318 private Groupings(String... regions) { 319 scripts = With 320 .in(getScripts()) 321 .toUnmodifiableCollection( 322 new ScriptMetadata.RegionFilter(regions), new TreeSet<String>()); 323 } 324 } 325 326 static class RegionFilter implements com.ibm.icu.text.Transform<String, String> { 327 final String[] containingRegion; 328 RegionFilter(String... containingRegion)329 RegionFilter(String... containingRegion) { 330 this.containingRegion = containingRegion; 331 } 332 333 @Override transform(String script)334 public String transform(String script) { 335 String currentRegion = getInfo(script).originCountry; 336 while (true) { 337 for (String s : containingRegion) { 338 if (s.equals(currentRegion)) { 339 return script; 340 } 341 } 342 if (currentRegion.equals("001") || currentRegion.equals("ZZ")) { 343 return null; 344 } 345 currentRegion = Containment.getContainer(currentRegion); 346 } 347 } 348 } 349 350 static Relation<String, String> EXTRAS = Relation.of(new HashMap<String, Set<String>>(), HashSet.class); 351 static { 352 EXTRAS.put("Hani", "Hans"); 353 EXTRAS.put("Hani", "Hant"); 354 EXTRAS.put("Hani", "Hanb"); 355 EXTRAS.put("Hang", "Kore"); 356 EXTRAS.put("Hang", "Jamo"); 357 EXTRAS.put("Hira", "Jpan"); EXTRAS.freeze()358 EXTRAS.freeze(); 359 } 360 static final Map<String, Info> data = new MyFileReader() 361 .process(ScriptMetadata.class, DATA_FILE).getData(); 362 getInfo(String s)363 public static Info getInfo(String s) { 364 Info result = data.get(s); 365 if (result == null) { 366 try { 367 String name2 = UScript.getShortName(UScript.getCodeFromName(s)); 368 result = data.get(name2); 369 } catch (Exception e) { 370 } 371 } 372 return result; 373 } 374 getScripts()375 public static Set<String> getScripts() { 376 return data.keySet(); 377 } 378 getInfo(int i)379 public static Info getInfo(int i) { 380 return data.get(UScript.getShortName(i)); 381 } 382 iterable()383 public static Set<Entry<String, Info>> iterable() { 384 return data.entrySet(); 385 } 386 387 /** 388 * Specialized scripts 389 * @return 390 */ getExtras()391 public static Set<String> getExtras() { 392 return EXTRAS.values(); 393 } 394 395 public static Transform<String, String> TO_SHORT_SCRIPT = new Transform<String, String>() { 396 @Override 397 public String transform(String source) { 398 return UScript.getShortName(UScript.getCodeFromName(source)); 399 } 400 }; 401 public static Transform<String, String> TO_LONG_SCRIPT = new Transform<String, String>() { 402 @Override 403 public String transform(String source) { 404 return UScript.getName(UScript.getCodeFromName(source)); 405 } 406 }; 407 } 408