1 package org.unicode.cldr.draft; 2 3 import java.util.Arrays; 4 import java.util.Collections; 5 import java.util.HashMap; 6 import java.util.HashSet; 7 import java.util.LinkedHashSet; 8 import java.util.List; 9 import java.util.Locale; 10 import java.util.Map; 11 import java.util.Map.Entry; 12 import java.util.Set; 13 import java.util.TreeSet; 14 15 import org.unicode.cldr.tool.CountryCodeConverter; 16 import org.unicode.cldr.util.CldrUtility; 17 import org.unicode.cldr.util.Containment; 18 import org.unicode.cldr.util.SemiFileReader; 19 import org.unicode.cldr.util.StandardCodes; 20 import org.unicode.cldr.util.With; 21 22 import com.ibm.icu.dev.util.CollectionUtilities; 23 import com.ibm.icu.impl.Relation; 24 import com.ibm.icu.lang.UScript; 25 import com.ibm.icu.text.Transform; 26 import com.ibm.icu.text.UTF16; 27 import com.ibm.icu.util.ICUException; 28 import com.ibm.icu.util.VersionInfo; 29 30 public class ScriptMetadata { 31 private static final int MAX_RANK = 33; 32 private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv"; 33 private static final VersionInfo UNICODE_VERSION = VersionInfo.getInstance( 34 CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "13")); 35 36 // To get the data, go do the Script MetaData spreadsheet 37 // Download As Comma Separated Items into DATA_FILE 38 // Set the last string in the UNICODE_VERSION line above to the right Unicode Version (for Unicode beta). 39 // Run TestScriptMetadata. 40 // Then run GenerateScriptMetadata. 41 // See http://cldr.unicode.org/development/updating-codes/updating-script-metadata 42 private enum Column { 43 // must match the spreadsheet header (caseless compare) or have the alternate header as an argument. 44 // doesn't have to be in order 45 WR, AGE, SAMPLE_CODE, ID_USAGE("ID Usage (UAX31)"), RTL("RTL?"), LB_LETTERS("LB letters?"), SHAPING_REQ("Shaping Req?"), IME("IME?"), ORIGIN_COUNTRY( 46 "Origin Country"), DENSITY("~Density"), LANG_CODE, HAS_CASE("Has Case?"); 47 48 int columnNumber = -1; 49 final Set<String> names = new HashSet<String>(); 50 Column(String... alternateNames)51 Column(String... alternateNames) { 52 names.add(this.name()); 53 for (String name : alternateNames) { 54 names.add(name.toUpperCase(Locale.ENGLISH)); 55 } 56 } 57 setColumns(String[] headers)58 static void setColumns(String[] headers) { 59 for (int i = 0; i < headers.length; ++i) { 60 String header = headers[i].toUpperCase(Locale.ENGLISH); 61 for (Column v : values()) { 62 if (v.names.contains(header)) { 63 v.columnNumber = i; 64 } 65 } 66 } 67 for (Column v : values()) { 68 if (v.columnNumber == -1) { 69 throw new IllegalArgumentException("Missing field for " + v 70 + ", may need to add additional column alias"); 71 } 72 } 73 } 74 getItem(String[] items)75 String getItem(String[] items) { 76 return items[columnNumber]; 77 } 78 getInt(String[] items, int defaultValue)79 int getInt(String[] items, int defaultValue) { 80 final String item = getItem(items); 81 return item.isEmpty() || item.equalsIgnoreCase("n/a") ? defaultValue : Integer.parseInt(item); 82 } 83 } 84 85 public enum IdUsage { 86 UNKNOWN("Other"), EXCLUSION("Historic"), LIMITED_USE("Limited Use"), ASPIRATIONAL("Aspirational"), RECOMMENDED("Major Use"); 87 88 public final String name; 89 IdUsage(String name)90 private IdUsage(String name) { 91 this.name = name; 92 } 93 } 94 95 public enum Trinary { 96 UNKNOWN, NO, YES 97 } 98 99 public enum Shaping { 100 UNKNOWN, NO, MIN, YES 101 } 102 103 static StandardCodes SC = StandardCodes.make(); 104 // static HashMap<String,String> NAME_TO_REGION_CODE = new HashMap<String,String>(); 105 // static HashMap<String,String> NAME_TO_LANGUAGE_CODE = new HashMap<String,String>(); 106 static EnumLookup<Shaping> shapingLookup = EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN); 107 static EnumLookup<Trinary> trinaryLookup = EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN); 108 static EnumLookup<IdUsage> idUsageLookup = EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN); 109 static { 110 // addNameToCode("language", NAME_TO_LANGUAGE_CODE); 111 // // NAME_TO_LANGUAGE_CODE.put("", "und"); 112 // NAME_TO_LANGUAGE_CODE.put("N/A", "und"); 113 // addSynonym(NAME_TO_LANGUAGE_CODE, "Ancient Greek", "Ancient Greek (to 1453)"); 114 // //addSynonym(NAME_TO_LANGUAGE_CODE, "Khmer", "Cambodian"); 115 // addSynonym(NAME_TO_LANGUAGE_CODE, "Old Irish", "Old Irish (to 900)"); 116 117 // addNameToCode("region", NAME_TO_REGION_CODE); 118 // // NAME_TO_REGION_CODE.put("UNKNOWN", "ZZ"); 119 // // NAME_TO_REGION_CODE.put("", "ZZ"); 120 // NAME_TO_REGION_CODE.put("N/A", "ZZ"); 121 // addSynonym(NAME_TO_REGION_CODE, "Laos", "Lao People's Democratic Republic"); 122 } 123 addNameToCode(String type, Map<String, String> hashMap)124 public static void addNameToCode(String type, Map<String, String> hashMap) { 125 for (String language : SC.getAvailableCodes(type)) { 126 Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language); 127 String name = fullData.get("Description"); 128 hashMap.put(name.toUpperCase(Locale.ENGLISH), language); 129 } 130 } 131 addSynonym(Map<String, String> map, String newTerm, String oldTerm)132 public static void addSynonym(Map<String, String> map, String newTerm, String oldTerm) { 133 String code = map.get(oldTerm.toUpperCase(Locale.ENGLISH)); 134 map.put(newTerm.toUpperCase(Locale.ENGLISH), code); 135 } 136 137 public static final class SkipNewUnicodeException extends ICUException { 138 } 139 140 public static class Info implements Comparable<Info> { 141 public final int rank; 142 public final VersionInfo age; 143 public final String sampleChar; 144 public final IdUsage idUsage; 145 public final Trinary rtl; 146 public final Trinary lbLetters; 147 public final Trinary hasCase; 148 public final Shaping shapingReq; 149 public final Trinary ime; 150 public final int density; 151 public final String originCountry; 152 public final String likelyLanguage; 153 Info(String[] items)154 private Info(String[] items) { 155 // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no 156 rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK); 157 age = VersionInfo.getInstance(Column.AGE.getItem(items)); 158 if (age.compareTo(UNICODE_VERSION) > 0) { 159 throw new SkipNewUnicodeException(); 160 } 161 // Parse the code point of the sample character, rather than the sample character itself. 162 // The code point is more reliable, especially when the spreadsheet has a bug 163 // for supplementary characters. 164 int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16); 165 sampleChar = UTF16.valueOf(sampleCode); 166 idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items)); 167 rtl = trinaryLookup.forString(Column.RTL.getItem(items)); 168 lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items)); 169 shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items)); 170 ime = trinaryLookup.forString(Column.IME.getItem(items)); 171 hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items)); 172 density = Column.DENSITY.getInt(items, -1); 173 174 final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items); 175 String country = CountryCodeConverter.getCodeFromName(countryRaw); 176 // NAME_TO_REGION_CODE.get(countryRaw.toUpperCase(Locale.ENGLISH)); 177 if (country == null) { 178 errors.add("Can't map " + countryRaw + " to country/region"); 179 } 180 originCountry = country == null ? "ZZ" : country; 181 182 String langCode = Column.LANG_CODE.getItem(items); 183 if (langCode.equals("n/a")) { 184 langCode = null; 185 } 186 likelyLanguage = langCode == null ? "und" : langCode; 187 } 188 Info(Info other, String string, String sampleCharacter)189 public Info(Info other, String string, String sampleCharacter) { 190 rank = other.rank; 191 age = other.age; 192 sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter; 193 idUsage = other.idUsage; 194 rtl = other.rtl; 195 lbLetters = other.lbLetters; 196 hasCase = other.hasCase; 197 shapingReq = other.shapingReq; 198 ime = "IME:YES".equals(string) ? Trinary.YES : other.ime; 199 density = other.density; 200 originCountry = other.originCountry; 201 likelyLanguage = other.likelyLanguage; 202 } 203 204 // public Trinary parseTrinary(Column title, String[] items) { 205 // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH)); 206 // } fix(String in)207 String fix(String in) { 208 return in.toUpperCase(Locale.ENGLISH).replace("N/A", "UNKNOWN").replace("?", "UNKNOWN") 209 .replace("RTL", "YES"); 210 } 211 toString()212 public String toString() { 213 return rank 214 + "\tSample: " + sampleChar 215 + "\tCountry: " + getName("territory", originCountry) + " (" + originCountry + ")" 216 + "\tLanguage: " + getName("language", likelyLanguage) + " (" + likelyLanguage + ")" 217 + "\tId: " + idUsage 218 + "\tRtl: " + rtl 219 + "\tLb: " + lbLetters 220 + "\tShape: " + shapingReq 221 + "\tIme: " + ime 222 + "\tCase: " + hasCase 223 + "\tDensity: " + density; 224 } 225 getName(String type, String code)226 public Object getName(String type, String code) { 227 List<String> fullData = SC.getFullData(type, code); 228 if (fullData == null) { 229 return "unavailable"; 230 } 231 return fullData.get(0); 232 } 233 234 @Override compareTo(Info o)235 public int compareTo(Info o) { 236 // we don't actually care what the comparison value is, as long as it is transitive and consistent with equals. 237 return toString().compareTo(o.toString()); 238 } 239 } 240 241 public static Set<String> errors = new LinkedHashSet<String>(); 242 static HashMap<String, Integer> titleToColumn = new HashMap<String, Integer>(); 243 244 private static class MyFileReader extends SemiFileReader { 245 private Map<String, Info> data = new HashMap<String, Info>(); 246 247 @Override isCodePoint()248 protected boolean isCodePoint() { 249 return false; 250 } 251 252 @Override splitLine(String line)253 protected String[] splitLine(String line) { 254 return CldrUtility.splitCommaSeparated(line); 255 }; 256 257 @Override handleLine(int lineCount, int start, int end, String[] items)258 protected boolean handleLine(int lineCount, int start, int end, String[] items) { 259 if (items[0].startsWith("For help") || items[0].isEmpty()) { 260 return true; // header lines 261 } 262 if (items[0].equals("WR")) { 263 Column.setColumns(items); 264 return true; 265 } 266 Info info; 267 try { 268 info = new Info(items); 269 } catch (SkipNewUnicodeException e) { 270 return true; 271 } catch (Exception e) { 272 errors.add(e.getClass().getName() + "\t" + e.getMessage() + "\t" + Arrays.asList(items)); 273 return true; 274 } 275 276 String script = items[2]; 277 data.put(script, info); 278 Set<String> extras = EXTRAS.get(script); 279 if (extras != null) { 280 for (String script2 : extras) { 281 Info info2 = info; 282 if (script2.equals("Jpan")) { 283 // HACK 284 info2 = new Info(info, "IME:YES", null); 285 } else if (script2.equals("Jamo")) { 286 info2 = new Info(info, null, "ᄒ"); 287 } 288 data.put(script2, info2); 289 } 290 } 291 return true; 292 } 293 294 @Override process(Class<?> classLocation, String fileName)295 public MyFileReader process(Class<?> classLocation, String fileName) { 296 super.process(classLocation, fileName); 297 return this; 298 } 299 getData()300 private Map<String, Info> getData() { 301 if (!errors.isEmpty()) { 302 throw new RuntimeException(CollectionUtilities.join(errors, "\n\t")); 303 } 304 return Collections.unmodifiableMap(data); 305 } 306 } 307 308 public enum Groupings { 309 EUROPEAN("150"), 310 MIDDLE_EASTERN("145"), 311 CENTRAL_ASIAN("143"), 312 SOUTH_ASIAN("034"), 313 SOUTHEAST_ASIAN("035"), 314 EAST_ASIAN("030"), 315 AFRICAN("002"), 316 AMERICAN("019"),; 317 public final Set<String> scripts; 318 Groupings(String... regions)319 private Groupings(String... regions) { 320 scripts = With 321 .in(getScripts()) 322 .toUnmodifiableCollection( 323 new ScriptMetadata.RegionFilter(regions), new TreeSet<String>()); 324 } 325 } 326 327 static class RegionFilter implements com.ibm.icu.text.Transform<String, String> { 328 final String[] containingRegion; 329 RegionFilter(String... containingRegion)330 RegionFilter(String... containingRegion) { 331 this.containingRegion = containingRegion; 332 } 333 334 @Override transform(String script)335 public String transform(String script) { 336 String currentRegion = getInfo(script).originCountry; 337 while (true) { 338 for (String s : containingRegion) { 339 if (s.equals(currentRegion)) { 340 return script; 341 } 342 } 343 if (currentRegion.equals("001") || currentRegion.equals("ZZ")) { 344 return null; 345 } 346 currentRegion = Containment.getContainer(currentRegion); 347 } 348 } 349 } 350 351 static Relation<String, String> EXTRAS = Relation.of(new HashMap<String, Set<String>>(), HashSet.class); 352 static { 353 EXTRAS.put("Hani", "Hans"); 354 EXTRAS.put("Hani", "Hant"); 355 EXTRAS.put("Hani", "Hanb"); 356 EXTRAS.put("Hang", "Kore"); 357 EXTRAS.put("Hang", "Jamo"); 358 EXTRAS.put("Hira", "Jpan"); EXTRAS.freeze()359 EXTRAS.freeze(); 360 } 361 static final Map<String, Info> data = new MyFileReader() 362 .process(ScriptMetadata.class, DATA_FILE).getData(); 363 getInfo(String s)364 public static Info getInfo(String s) { 365 Info result = data.get(s); 366 if (result == null) { 367 try { 368 String name2 = UScript.getShortName(UScript.getCodeFromName(s)); 369 result = data.get(name2); 370 } catch (Exception e) { 371 } 372 } 373 return result; 374 } 375 getScripts()376 public static Set<String> getScripts() { 377 return data.keySet(); 378 } 379 getInfo(int i)380 public static Info getInfo(int i) { 381 return data.get(UScript.getShortName(i)); 382 } 383 iterable()384 public static Set<Entry<String, Info>> iterable() { 385 return data.entrySet(); 386 } 387 388 /** 389 * Specialized scripts 390 * @return 391 */ getExtras()392 public static Set<String> getExtras() { 393 return EXTRAS.values(); 394 } 395 396 public static Transform<String, String> TO_SHORT_SCRIPT = new Transform<String, String>() { 397 @Override 398 public String transform(String source) { 399 return UScript.getShortName(UScript.getCodeFromName(source)); 400 } 401 }; 402 public static Transform<String, String> TO_LONG_SCRIPT = new Transform<String, String>() { 403 @Override 404 public String transform(String source) { 405 return UScript.getName(UScript.getCodeFromName(source)); 406 } 407 }; 408 } 409