1 package org.unicode.cldr.draft; 2 3 import com.google.common.base.Joiner; 4 import com.ibm.icu.impl.Relation; 5 import com.ibm.icu.lang.UScript; 6 import com.ibm.icu.text.Transform; 7 import com.ibm.icu.text.UTF16; 8 import com.ibm.icu.util.ICUException; 9 import com.ibm.icu.util.VersionInfo; 10 import java.util.Arrays; 11 import java.util.Collections; 12 import java.util.HashMap; 13 import java.util.HashSet; 14 import java.util.LinkedHashSet; 15 import java.util.List; 16 import java.util.Locale; 17 import java.util.Map; 18 import java.util.Map.Entry; 19 import java.util.Set; 20 import java.util.TreeSet; 21 import org.unicode.cldr.tool.CountryCodeConverter; 22 import org.unicode.cldr.util.CldrUtility; 23 import org.unicode.cldr.util.Containment; 24 import org.unicode.cldr.util.SemiFileReader; 25 import org.unicode.cldr.util.StandardCodes; 26 import org.unicode.cldr.util.StandardCodes.LstrType; 27 import org.unicode.cldr.util.Validity; 28 import org.unicode.cldr.util.Validity.Status; 29 import org.unicode.cldr.util.With; 30 31 public class ScriptMetadata { 32 private static final int MAX_RANK = 33; 33 private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv"; 34 private static final VersionInfo UNICODE_VERSION = 35 VersionInfo.getInstance(CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "16")); 36 37 // To get the data, go do the Script MetaData spreadsheet 38 // Download As Comma Separated Items into DATA_FILE 39 // Set the last string in the UNICODE_VERSION line above to the right Unicode Version (for 40 // Unicode beta). 41 // Run TestScriptMetadata. 42 // Then run GenerateScriptMetadata. 43 // See http://cldr.unicode.org/development/updating-codes/updating-script-metadata 44 private enum Column { 45 // must match the spreadsheet header (caseless compare) or have the alternate header as an 46 // argument. 47 // doesn't have to be in order 48 WR, 49 AGE, 50 SAMPLE_CODE, 51 ID_USAGE("ID Usage (UAX31)"), 52 RTL("RTL?"), 53 LB_LETTERS("LB letters?"), 54 SHAPING_REQ("Shaping Req?"), 55 IME("IME?"), 56 ORIGIN_COUNTRY("Origin Country"), 57 DENSITY("~Density"), 58 LANG_CODE, 59 HAS_CASE("Has Case?"); 60 61 int columnNumber = -1; 62 final Set<String> names = new HashSet<>(); 63 Column(String... alternateNames)64 Column(String... alternateNames) { 65 names.add(this.name()); 66 for (String name : alternateNames) { 67 names.add(name.toUpperCase(Locale.ENGLISH)); 68 } 69 } 70 setColumns(String[] headers)71 static void setColumns(String[] headers) { 72 for (int i = 0; i < headers.length; ++i) { 73 String header = headers[i].toUpperCase(Locale.ENGLISH); 74 for (Column v : values()) { 75 if (v.names.contains(header)) { 76 v.columnNumber = i; 77 } 78 } 79 } 80 for (Column v : values()) { 81 if (v.columnNumber == -1) { 82 throw new IllegalArgumentException( 83 "Missing field for " + v + ", may need to add additional column alias"); 84 } 85 } 86 } 87 getItem(String[] items)88 String getItem(String[] items) { 89 return items[columnNumber]; 90 } 91 getInt(String[] items, int defaultValue)92 int getInt(String[] items, int defaultValue) { 93 final String item = getItem(items); 94 return item.isEmpty() || item.equalsIgnoreCase("n/a") 95 ? defaultValue 96 : Integer.parseInt(item); 97 } 98 } 99 100 public enum IdUsage { 101 UNKNOWN("Other"), 102 EXCLUSION("Historic"), 103 LIMITED_USE("Limited Use"), 104 ASPIRATIONAL("Aspirational"), 105 RECOMMENDED("Major Use"); 106 107 public final String name; 108 IdUsage(String name)109 private IdUsage(String name) { 110 this.name = name; 111 } 112 } 113 114 public enum Trinary { 115 UNKNOWN, 116 NO, 117 YES 118 } 119 120 public enum Shaping { 121 UNKNOWN, 122 NO, 123 MIN, 124 YES 125 } 126 127 static StandardCodes SC = StandardCodes.make(); 128 static EnumLookup<Shaping> shapingLookup = 129 EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN); 130 static EnumLookup<Trinary> trinaryLookup = 131 EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN); 132 static EnumLookup<IdUsage> idUsageLookup = 133 EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN); 134 addNameToCode(String type, Map<String, String> hashMap)135 public static void addNameToCode(String type, Map<String, String> hashMap) { 136 for (String language : SC.getAvailableCodes(type)) { 137 Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language); 138 String name = fullData.get("Description"); 139 hashMap.put(name.toUpperCase(Locale.ENGLISH), language); 140 } 141 } 142 143 public static final class SkipNewUnicodeException extends ICUException {} 144 145 /** 146 * Scripts that either have no known languages as yet (Cpmn) or are used for any language 147 * (Brai). 148 */ 149 public static final Set<String> SCRIPTS_WITH_NO_LANGUAGES = Set.of("Brai", "Cpmn"); 150 151 public static class Info implements Comparable<Info> { 152 public final int rank; 153 public final VersionInfo age; 154 public final String sampleChar; 155 public final IdUsage idUsage; 156 public final Trinary rtl; 157 public final Trinary lbLetters; 158 public final Trinary hasCase; 159 public final Shaping shapingReq; 160 public final Trinary ime; 161 public final int density; 162 public final String originCountry; 163 public final String likelyLanguage; 164 Info(String[] items)165 private Info(String[] items) { 166 // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no 167 rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK); 168 age = VersionInfo.getInstance(Column.AGE.getItem(items)); 169 if (age.compareTo(UNICODE_VERSION) > 0) { 170 throw new SkipNewUnicodeException(); 171 } 172 // Parse the code point of the sample character, rather than the sample character 173 // itself. 174 // The code point is more reliable, especially when the spreadsheet has a bug 175 // for supplementary characters. 176 int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16); 177 sampleChar = UTF16.valueOf(sampleCode); 178 idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items)); 179 rtl = trinaryLookup.forString(Column.RTL.getItem(items)); 180 lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items)); 181 shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items)); 182 ime = trinaryLookup.forString(Column.IME.getItem(items)); 183 hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items)); 184 density = Column.DENSITY.getInt(items, -1); 185 String script = items[2]; 186 187 final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items); 188 String country = CountryCodeConverter.getCodeFromName(countryRaw, false); 189 if (country == null) { 190 // Give context when throwing an error. Because this is run in a static init 191 // context, the stack trace is typically incorrect when something goes wrong. 192 errors.add( 193 "ScriptMetadata.java: Can't map " 194 + countryRaw 195 + " to country/region. Try updating external/alternate_country_names.txt"); 196 } 197 originCountry = country == null ? "ZZ" : country; 198 199 String langCode = Column.LANG_CODE.getItem(items); 200 if (langCode.equals("n/a")) { 201 langCode = null; 202 } 203 likelyLanguage = langCode == null ? "und" : langCode; 204 205 // check for bad countries, bad languages 206 207 final Status scriptStatus = 208 Validity.getInstance().getCodeToStatus(LstrType.script).get(script); 209 if (!(scriptStatus == Status.special || scriptStatus == Status.unknown)) { 210 final Status countryStatus = 211 Validity.getInstance().getCodeToStatus(LstrType.region).get(originCountry); 212 if (countryStatus != Status.regular) { 213 errors.add( 214 "ScriptMetadata.java: the country (" 215 + originCountry 216 + ") for " 217 + script 218 + " is not valid: " 219 + countryStatus); 220 } 221 final Status languageStatus = 222 Validity.getInstance() 223 .getCodeToStatus(LstrType.language) 224 .get(likelyLanguage); 225 if (languageStatus != Status.regular 226 // make exception for scripts that has no known languages 227 && !SCRIPTS_WITH_NO_LANGUAGES.contains(script)) { 228 errors.add( 229 "ScriptMetadata.java: the likely language (" 230 + likelyLanguage 231 + ") for " 232 + script 233 + " is not valid: " 234 + languageStatus); 235 } 236 } 237 } 238 Info(Info other, String string, String sampleCharacter)239 public Info(Info other, String string, String sampleCharacter) { 240 rank = other.rank; 241 age = other.age; 242 sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter; 243 idUsage = other.idUsage; 244 rtl = other.rtl; 245 lbLetters = other.lbLetters; 246 hasCase = other.hasCase; 247 shapingReq = other.shapingReq; 248 ime = "IME:YES".equals(string) ? Trinary.YES : other.ime; 249 density = other.density; 250 originCountry = other.originCountry; 251 likelyLanguage = other.likelyLanguage; 252 } 253 254 // public Trinary parseTrinary(Column title, String[] items) { 255 // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH)); 256 // } fix(String in)257 String fix(String in) { 258 return in.toUpperCase(Locale.ENGLISH) 259 .replace("N/A", "UNKNOWN") 260 .replace("?", "UNKNOWN") 261 .replace("RTL", "YES"); 262 } 263 264 @Override toString()265 public String toString() { 266 return rank 267 + "\tSample: " 268 + sampleChar 269 + "\tCountry: " 270 + getName("territory", originCountry) 271 + " (" 272 + originCountry 273 + ")" 274 + "\tLanguage: " 275 + getName("language", likelyLanguage) 276 + " (" 277 + likelyLanguage 278 + ")" 279 + "\tId: " 280 + idUsage 281 + "\tRtl: " 282 + rtl 283 + "\tLb: " 284 + lbLetters 285 + "\tShape: " 286 + shapingReq 287 + "\tIme: " 288 + ime 289 + "\tCase: " 290 + hasCase 291 + "\tDensity: " 292 + density; 293 } 294 getName(String type, String code)295 public Object getName(String type, String code) { 296 List<String> fullData = SC.getFullData(type, code); 297 if (fullData == null) { 298 return "unavailable"; 299 } 300 return fullData.get(0); 301 } 302 303 @Override compareTo(Info o)304 public int compareTo(Info o) { 305 // we don't actually care what the comparison value is, as long as it is transitive and 306 // consistent with equals. 307 return toString().compareTo(o.toString()); 308 } 309 } 310 311 public static Set<String> errors = new LinkedHashSet<>(); 312 static HashMap<String, Integer> titleToColumn = new HashMap<>(); 313 314 private static class MyFileReader extends SemiFileReader { 315 private Map<String, Info> data = new HashMap<>(); 316 317 @Override isCodePoint()318 protected boolean isCodePoint() { 319 return false; 320 } 321 322 @Override splitLine(String line)323 protected String[] splitLine(String line) { 324 return CldrUtility.splitCommaSeparated(line); 325 } 326 327 @Override handleLine(int lineCount, int start, int end, String[] items)328 protected boolean handleLine(int lineCount, int start, int end, String[] items) { 329 if (items[0].startsWith("For help") || items[0].isEmpty()) { 330 return true; // header lines 331 } 332 if (items[0].equals("WR")) { 333 Column.setColumns(items); 334 return true; 335 } 336 Info info; 337 try { 338 info = new Info(items); 339 } catch (SkipNewUnicodeException e) { 340 return true; 341 } catch (Exception e) { 342 errors.add( 343 e.getClass().getName() 344 + "\t" 345 + e.getMessage() 346 + "\t" 347 + Arrays.asList(items)); 348 return true; 349 } 350 351 String script = items[2]; 352 data.put(script, info); 353 Set<String> extras = EXTRAS.get(script); 354 if (extras != null) { 355 for (String script2 : extras) { 356 Info info2 = info; 357 if (script2.equals("Jpan")) { 358 // HACK 359 info2 = new Info(info, "IME:YES", null); 360 } else if (script2.equals("Jamo")) { 361 info2 = new Info(info, null, "ᄒ"); 362 } 363 data.put(script2, info2); 364 } 365 } 366 return true; 367 } 368 369 @Override process(Class<?> classLocation, String fileName)370 public MyFileReader process(Class<?> classLocation, String fileName) { 371 super.process(classLocation, fileName); 372 return this; 373 } 374 getData()375 private Map<String, Info> getData() { 376 if (!errors.isEmpty()) { 377 throw new RuntimeException(Joiner.on("\n\t").join(errors)); 378 } 379 return Collections.unmodifiableMap(data); 380 } 381 } 382 383 public enum Groupings { 384 EUROPEAN("150"), 385 MIDDLE_EASTERN("145"), 386 CENTRAL_ASIAN("143"), 387 SOUTH_ASIAN("034"), 388 SOUTHEAST_ASIAN("035"), 389 EAST_ASIAN("030"), 390 AFRICAN("002"), 391 AMERICAN("019"), 392 ; 393 public final Set<String> scripts; 394 Groupings(String... regions)395 private Groupings(String... regions) { 396 scripts = 397 With.in(getScripts()) 398 .toUnmodifiableCollection( 399 new ScriptMetadata.RegionFilter(regions), 400 new TreeSet<String>()); 401 } 402 } 403 404 static class RegionFilter implements com.ibm.icu.text.Transform<String, String> { 405 final String[] containingRegion; 406 RegionFilter(String... containingRegion)407 RegionFilter(String... containingRegion) { 408 this.containingRegion = containingRegion; 409 } 410 411 @Override transform(String script)412 public String transform(String script) { 413 String currentRegion = getInfo(script).originCountry; 414 while (true) { 415 for (String s : containingRegion) { 416 if (s.equals(currentRegion)) { 417 return script; 418 } 419 } 420 if (currentRegion.equals("001") || currentRegion.equals("ZZ")) { 421 return null; 422 } 423 currentRegion = Containment.getContainer(currentRegion); 424 } 425 } 426 } 427 428 static Relation<String, String> EXTRAS = 429 Relation.of(new HashMap<String, Set<String>>(), HashSet.class); 430 431 static { 432 EXTRAS.put("Hani", "Hans"); 433 EXTRAS.put("Hani", "Hant"); 434 EXTRAS.put("Hani", "Hanb"); 435 EXTRAS.put("Hang", "Kore"); 436 EXTRAS.put("Hang", "Jamo"); 437 EXTRAS.put("Hira", "Jpan"); EXTRAS.freeze()438 EXTRAS.freeze(); 439 } 440 441 static final Map<String, Info> data = 442 new MyFileReader().process(ScriptMetadata.class, DATA_FILE).getData(); 443 getInfo(String s)444 public static Info getInfo(String s) { 445 Info result = data.get(s); 446 if (result == null) { 447 try { 448 String name2 = UScript.getShortName(UScript.getCodeFromName(s)); 449 result = data.get(name2); 450 } catch (Exception e) { 451 } 452 } 453 return result; 454 } 455 getScripts()456 public static Set<String> getScripts() { 457 return data.keySet(); 458 } 459 getInfo(int i)460 public static Info getInfo(int i) { 461 return data.get(UScript.getShortName(i)); 462 } 463 iterable()464 public static Set<Entry<String, Info>> iterable() { 465 return data.entrySet(); 466 } 467 468 /** 469 * Specialized scripts 470 * 471 * @return 472 */ getExtras()473 public static Set<String> getExtras() { 474 return EXTRAS.values(); 475 } 476 477 public static Transform<String, String> TO_SHORT_SCRIPT = 478 new Transform<>() { 479 @Override 480 public String transform(String source) { 481 return UScript.getShortName(UScript.getCodeFromName(source)); 482 } 483 }; 484 public static Transform<String, String> TO_LONG_SCRIPT = 485 new Transform<>() { 486 @Override 487 public String transform(String source) { 488 return UScript.getName(UScript.getCodeFromName(source)); 489 } 490 }; 491 } 492