package org.unicode.cldr.draft;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;

import org.unicode.cldr.tool.CountryCodeConverter;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Containment;
import org.unicode.cldr.util.SemiFileReader;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.With;

import com.google.common.base.Joiner;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Transform;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.ICUException;
import com.ibm.icu.util.VersionInfo;

/**
 * Per-script metadata read from the {@code Script_Metadata.csv} resource.
 *
 * <p>The file is parsed once, during static initialization of this class, into an
 * unmodifiable map from script code to {@link Info}. If any row fails to parse, the
 * collected messages in {@link #errors} are thrown as a {@link RuntimeException} from
 * the static initializer.
 */
public class ScriptMetadata {
    /** Ranks read from the WR column are capped at this value. */
    private static final int MAX_RANK = 33;
    private static final String DATA_FILE = "/org/unicode/cldr/util/data/Script_Metadata.csv";
    /** Rows whose AGE is newer than this version are skipped (see {@link SkipNewUnicodeException}). */
    private static final VersionInfo UNICODE_VERSION = VersionInfo.getInstance(
        CldrUtility.getProperty("SCRIPT_UNICODE_VERSION", "13"));

    // To get the data, go to the Script MetaData spreadsheet
    // Download As Comma Separated Items into DATA_FILE
    // Set the last string in the UNICODE_VERSION line above to the right Unicode Version (for Unicode beta).
    // Run TestScriptMetadata.
    // Then run GenerateScriptMetadata.
    // See http://cldr.unicode.org/development/updating-codes/updating-script-metadata

    /**
     * Columns of the spreadsheet that are consumed by {@link Info}. Column positions are
     * discovered from the header row by {@link #setColumns(String[])}.
     */
    private enum Column {
        // must match the spreadsheet header (caseless compare) or have the alternate header as an argument.
        // doesn't have to be in order
        WR,
        AGE,
        SAMPLE_CODE,
        ID_USAGE("ID Usage (UAX31)"),
        RTL("RTL?"),
        LB_LETTERS("LB letters?"),
        SHAPING_REQ("Shaping Req?"),
        IME("IME?"),
        ORIGIN_COUNTRY("Origin Country"),
        DENSITY("~Density"),
        LANG_CODE,
        HAS_CASE("Has Case?");

        /** Index of this column in a data row; -1 until the header row has been seen. */
        int columnNumber = -1;
        /** Upper-cased header names accepted for this column (enum name plus aliases). */
        final Set<String> names = new HashSet<>();

        /**
         * @param alternateNames additional header spellings accepted for this column
         */
        Column(String... alternateNames) {
            names.add(this.name());
            for (String name : alternateNames) {
                names.add(name.toUpperCase(Locale.ENGLISH));
            }
        }

        /**
         * Records the position of every column from the header row.
         *
         * @param headers the cells of the header row
         * @throws IllegalArgumentException if some column has no matching header
         */
        static void setColumns(String[] headers) {
            for (int i = 0; i < headers.length; ++i) {
                String header = headers[i].toUpperCase(Locale.ENGLISH);
                for (Column v : values()) {
                    if (v.names.contains(header)) {
                        v.columnNumber = i;
                    }
                }
            }
            for (Column v : values()) {
                if (v.columnNumber == -1) {
                    throw new IllegalArgumentException("Missing field for " + v
                        + ", may need to add additional column alias");
                }
            }
        }

        /** Returns the cell of this column in the given row. */
        String getItem(String[] items) {
            return items[columnNumber];
        }

        /**
         * Returns the cell of this column parsed as a decimal integer.
         *
         * @param items the row
         * @param defaultValue returned when the cell is empty or "n/a" (caseless)
         * @throws NumberFormatException if the cell is neither empty, "n/a", nor an integer
         */
        int getInt(String[] items, int defaultValue) {
            final String item = getItem(items);
            return item.isEmpty() || item.equalsIgnoreCase("n/a") ? defaultValue : Integer.parseInt(item);
        }
    }

    /** Identifier usage category of a script; {@link #name} is a display label. */
    public enum IdUsage {
        UNKNOWN("Other"),
        EXCLUSION("Historic"),
        LIMITED_USE("Limited Use"),
        ASPIRATIONAL("Aspirational"),
        RECOMMENDED("Major Use");

        public final String name;

        private IdUsage(String name) {
            this.name = name;
        }
    }

    /** Three-valued flag used for the yes/no columns. */
    public enum Trinary {
        UNKNOWN, NO, YES
    }

    /** Value of the "Shaping Req?" column. */
    public enum Shaping {
        UNKNOWN, NO, MIN, YES
    }

    static StandardCodes SC = StandardCodes.make();
    // static HashMap<String,String> NAME_TO_REGION_CODE = new HashMap<String,String>();
    // static HashMap<String,String> NAME_TO_LANGUAGE_CODE = new HashMap<String,String>();

    // Lookups that turn spreadsheet cells into enum values.
    // NOTE(review): the "n/a" / UNKNOWN arguments presumably map "n/a" cells to UNKNOWN -- confirm in EnumLookup.
    static EnumLookup<Shaping> shapingLookup = EnumLookup.of(Shaping.class, null, "n/a", Shaping.UNKNOWN);
    static EnumLookup<Trinary> trinaryLookup = EnumLookup.of(Trinary.class, null, "n/a", Trinary.UNKNOWN);
    static EnumLookup<IdUsage> idUsageLookup = EnumLookup.of(IdUsage.class, null, "n/a", IdUsage.UNKNOWN);

    static {
        // addNameToCode("language", NAME_TO_LANGUAGE_CODE);
        // // NAME_TO_LANGUAGE_CODE.put("", "und");
        // NAME_TO_LANGUAGE_CODE.put("N/A", "und");
        // addSynonym(NAME_TO_LANGUAGE_CODE, "Ancient Greek", "Ancient Greek (to 1453)");
        // //addSynonym(NAME_TO_LANGUAGE_CODE, "Khmer", "Cambodian");
        // addSynonym(NAME_TO_LANGUAGE_CODE, "Old Irish", "Old Irish (to 900)");

        // addNameToCode("region", NAME_TO_REGION_CODE);
        // // NAME_TO_REGION_CODE.put("UNKNOWN", "ZZ");
        // // NAME_TO_REGION_CODE.put("", "ZZ");
        // NAME_TO_REGION_CODE.put("N/A", "ZZ");
        // addSynonym(NAME_TO_REGION_CODE, "Laos", "Lao People's Democratic Republic");
    }

    /**
     * Adds, for every available code of the given type, a mapping from the upper-cased
     * "Description" entry of its language-subtag-registry data to the code.
     *
     * @param type the code type passed to {@link StandardCodes} (e.g. "language", "region")
     * @param hashMap the map to add entries to
     */
    public static void addNameToCode(String type, Map<String, String> hashMap) {
        for (String language : SC.getAvailableCodes(type)) {
            Map<String, String> fullData = StandardCodes.getLStreg().get(type).get(language);
            String name = fullData.get("Description");
            hashMap.put(name.toUpperCase(Locale.ENGLISH), language);
        }
    }

    /**
     * Maps {@code newTerm} (upper-cased) to whatever {@code oldTerm} (upper-cased) is
     * currently mapped to; if {@code oldTerm} is absent, {@code newTerm} is mapped to null.
     */
    public static void addSynonym(Map<String, String> map, String newTerm, String oldTerm) {
        String code = map.get(oldTerm.toUpperCase(Locale.ENGLISH));
        map.put(newTerm.toUpperCase(Locale.ENGLISH), code);
    }

    /** Thrown while parsing a row whose AGE is newer than {@link #UNICODE_VERSION}. */
    public static final class SkipNewUnicodeException extends ICUException {
        private static final long serialVersionUID = 1L;
    }

    /** Immutable metadata for one script, parsed from one row of the data file. */
    public static class Info implements Comparable<Info> {
        public final int rank;
        public final VersionInfo age;
        public final String sampleChar;
        public final IdUsage idUsage;
        public final Trinary rtl;
        public final Trinary lbLetters;
        public final Trinary hasCase;
        public final Shaping shapingReq;
        public final Trinary ime;
        public final int density;
        public final String originCountry;
        public final String likelyLanguage;

        /**
         * Parses one data row. {@link Column#setColumns(String[])} must have been called first.
         *
         * @throws SkipNewUnicodeException if the row's AGE is newer than {@link #UNICODE_VERSION}
         */
        private Info(String[] items) {
            // 3,Han,Hani,1.1,"75,963",字,5B57,China,3,Chinese,zh,Recommended,no,Yes,no,Yes,no
            rank = Math.min(Column.WR.getInt(items, 999), MAX_RANK);
            age = VersionInfo.getInstance(Column.AGE.getItem(items));
            if (age.compareTo(UNICODE_VERSION) > 0) {
                throw new SkipNewUnicodeException();
            }
            // Parse the code point of the sample character, rather than the sample character itself.
            // The code point is more reliable, especially when the spreadsheet has a bug
            // for supplementary characters.
            int sampleCode = Integer.parseInt(Column.SAMPLE_CODE.getItem(items), 16);
            sampleChar = UTF16.valueOf(sampleCode);
            idUsage = idUsageLookup.forString(Column.ID_USAGE.getItem(items));
            rtl = trinaryLookup.forString(Column.RTL.getItem(items));
            lbLetters = trinaryLookup.forString(Column.LB_LETTERS.getItem(items));
            shapingReq = shapingLookup.forString(Column.SHAPING_REQ.getItem(items));
            ime = trinaryLookup.forString(Column.IME.getItem(items));
            hasCase = trinaryLookup.forString(Column.HAS_CASE.getItem(items));
            density = Column.DENSITY.getInt(items, -1);

            final String countryRaw = Column.ORIGIN_COUNTRY.getItem(items);
            String country = CountryCodeConverter.getCodeFromName(countryRaw, false);
            // NAME_TO_REGION_CODE.get(countryRaw.toUpperCase(Locale.ENGLISH));
            if (country == null) {
                // Recorded rather than thrown; getData() fails later if any errors were collected.
                errors.add("Can't map " + countryRaw + " to country/region");
            }
            originCountry = country == null ? "ZZ" : country;

            // Treat an empty cell or "n/a" in any case as "no language", consistently with
            // Column.getInt, instead of leaking "" or "N/A" out as a language code.
            final String langCode = Column.LANG_CODE.getItem(items);
            final boolean noLanguage = langCode.isEmpty() || langCode.equalsIgnoreCase("n/a");
            likelyLanguage = noLanguage ? "und" : langCode;
        }

        /**
         * Copies {@code other}, optionally overriding two fields.
         *
         * @param other the instance to copy
         * @param string if equal to "IME:YES", {@link #ime} is set to {@link Trinary#YES};
         *     any other value (including null) keeps {@code other.ime}
         * @param sampleCharacter replacement sample character, or null to keep {@code other.sampleChar}
         */
        public Info(Info other, String string, String sampleCharacter) {
            rank = other.rank;
            age = other.age;
            sampleChar = sampleCharacter == null ? other.sampleChar : sampleCharacter;
            idUsage = other.idUsage;
            rtl = other.rtl;
            lbLetters = other.lbLetters;
            hasCase = other.hasCase;
            shapingReq = other.shapingReq;
            ime = "IME:YES".equals(string) ? Trinary.YES : other.ime;
            density = other.density;
            originCountry = other.originCountry;
            likelyLanguage = other.likelyLanguage;
        }

        // public Trinary parseTrinary(Column title, String[] items) {
        // return Trinary.valueOf(fix(title.getItem(items)).toUpperCase(Locale.ENGLISH));
        // }

        /** Upper-cases {@code in} and rewrites "N/A" and "?" to "UNKNOWN" and "RTL" to "YES". */
        String fix(String in) {
            return in.toUpperCase(Locale.ENGLISH).replace("N/A", "UNKNOWN").replace("?", "UNKNOWN")
                .replace("RTL", "YES");
        }

        /** Returns a tab-separated, human-readable rendering of the fields (age is not included). */
        @Override
        public String toString() {
            return rank
                + "\tSample: " + sampleChar
                + "\tCountry: " + getName("territory", originCountry) + " (" + originCountry + ")"
                + "\tLanguage: " + getName("language", likelyLanguage) + " (" + likelyLanguage + ")"
                + "\tId: " + idUsage
                + "\tRtl: " + rtl
                + "\tLb: " + lbLetters
                + "\tShape: " + shapingReq
                + "\tIme: " + ime
                + "\tCase: " + hasCase
                + "\tDensity: " + density;
        }

        /**
         * Returns the first element of the {@link StandardCodes} full data for the code,
         * or the string "unavailable" if there is no data.
         */
        public Object getName(String type, String code) {
            List<String> fullData = SC.getFullData(type, code);
            if (fullData == null) {
                return "unavailable";
            }
            return fullData.get(0);
        }

        /** Orders instances by their {@link #toString()} rendering. */
        @Override
        public int compareTo(Info o) {
            // we don't actually care what the comparison value is, as long as it is transitive and consistent with equals.
            return toString().compareTo(o.toString());
        }
    }

    /** Messages for rows that could not be parsed; must be declared before {@code data}. */
    public static Set<String> errors = new LinkedHashSet<>();
    static HashMap<String, Integer> titleToColumn = new HashMap<>();

    /** Reader that turns the rows of the CSV file into a script-code-to-{@link Info} map. */
    private static class MyFileReader extends SemiFileReader {
        private Map<String, Info> data = new HashMap<>();

        @Override
        protected boolean isCodePoint() {
            return false;
        }

        @Override
        protected String[] splitLine(String line) {
            return CldrUtility.splitCommaSeparated(line);
        }

        /**
         * Handles one row: skips help/empty rows, uses the "WR" row to set up the columns, and
         * otherwise parses the row into an {@link Info}. Parse failures are recorded in
         * {@link #errors}; this method always returns true.
         */
        @Override
        protected boolean handleLine(int lineCount, int start, int end, String[] items) {
            if (items[0].startsWith("For help") || items[0].isEmpty()) {
                return true; // header lines
            }
            if (items[0].equals("WR")) {
                Column.setColumns(items);
                return true;
            }
            Info info;
            try {
                info = new Info(items);
            } catch (SkipNewUnicodeException e) {
                return true;
            } catch (Exception e) {
                errors.add(e.getClass().getName() + "\t" + e.getMessage() + "\t" + Arrays.asList(items));
                return true;
            }

            // The script code is read from a fixed position (third cell), not via Column.
            String script = items[2];
            data.put(script, info);
            Set<String> extras = EXTRAS.get(script);
            if (extras != null) {
                for (String script2 : extras) {
                    Info info2 = info;
                    if (script2.equals("Jpan")) {
                        // HACK
                        info2 = new Info(info, "IME:YES", null);
                    } else if (script2.equals("Jamo")) {
                        info2 = new Info(info, null, "ᄒ");
                    }
                    data.put(script2, info2);
                }
            }
            return true;
        }

        @Override
        public MyFileReader process(Class<?> classLocation, String fileName) {
            super.process(classLocation, fileName);
            return this;
        }

        /**
         * Returns an unmodifiable view of the parsed data.
         *
         * @throws RuntimeException if any errors were recorded while parsing
         */
        private Map<String, Info> getData() {
            if (!errors.isEmpty()) {
                throw new RuntimeException(Joiner.on("\n\t").join(errors));
            }
            return Collections.unmodifiableMap(data);
        }
    }

    /**
     * Groups of scripts selected by region: {@link #scripts} holds the scripts accepted by a
     * {@link RegionFilter} for the region codes given to the constant.
     */
    public enum Groupings {
        EUROPEAN("150"),
        MIDDLE_EASTERN("145"),
        CENTRAL_ASIAN("143"),
        SOUTH_ASIAN("034"),
        SOUTHEAST_ASIAN("035"),
        EAST_ASIAN("030"),
        AFRICAN("002"),
        AMERICAN("019");

        public final Set<String> scripts;

        private Groupings(String... regions) {
            scripts = With
                .in(getScripts())
                .toUnmodifiableCollection(
                    new ScriptMetadata.RegionFilter(regions), new TreeSet<String>());
        }
    }

    /**
     * Transform that returns the script unchanged when its origin country is, or is contained
     * in, one of the given regions, and null otherwise.
     */
    static class RegionFilter implements com.ibm.icu.text.Transform<String, String> {
        final String[] containingRegion;

        RegionFilter(String... containingRegion) {
            this.containingRegion = containingRegion;
        }

        @Override
        public String transform(String script) {
            final Info info = getInfo(script);
            // Unknown script: nothing to match against, so filter it out instead of throwing.
            String currentRegion = info == null ? null : info.originCountry;
            // Walk up the containment chain; also stop if no container is reported for a region.
            while (currentRegion != null) {
                for (String s : containingRegion) {
                    if (s.equals(currentRegion)) {
                        return script;
                    }
                }
                if (currentRegion.equals("001") || currentRegion.equals("ZZ")) {
                    return null;
                }
                currentRegion = Containment.getContainer(currentRegion);
            }
            return null;
        }
    }

    /**
     * Extra script codes that share the data row of a base script. Must be initialized before
     * {@code data}, because the reader consults it while parsing.
     */
    static Relation<String, String> EXTRAS = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
    static {
        EXTRAS.put("Hani", "Hans");
        EXTRAS.put("Hani", "Hant");
        EXTRAS.put("Hani", "Hanb");
        EXTRAS.put("Hang", "Kore");
        EXTRAS.put("Hang", "Jamo");
        EXTRAS.put("Hira", "Jpan");
        EXTRAS.freeze();
    }

    static final Map<String, Info> data = new MyFileReader()
        .process(ScriptMetadata.class, DATA_FILE).getData();

    /**
     * Returns the metadata for a script.
     *
     * @param s a key of the data map; if not found directly, it is resolved through
     *     {@link UScript#getCodeFromName(String)} and looked up by the resulting short name
     * @return the metadata, or null if the script is unknown
     */
    public static Info getInfo(String s) {
        Info result = data.get(s);
        if (result == null) {
            try {
                String name2 = UScript.getShortName(UScript.getCodeFromName(s));
                result = data.get(name2);
            } catch (Exception ignored) {
                // Best effort: a name that ICU cannot resolve simply has no metadata.
            }
        }
        return result;
    }

    /** Returns the script codes for which metadata is available. */
    public static Set<String> getScripts() {
        return data.keySet();
    }

    /**
     * Returns the metadata for a {@link UScript} code.
     *
     * @param i the script code, converted with {@link UScript#getShortName(int)}
     * @return the metadata, or null if there is none for that short name
     */
    public static Info getInfo(int i) {
        return data.get(UScript.getShortName(i));
    }

    /** Returns the entries (script code to metadata) of the unmodifiable data map. */
    public static Set<Entry<String, Info>> iterable() {
        return data.entrySet();
    }

    /**
     * Specialized scripts.
     *
     * @return the values of {@link #EXTRAS} (Hans, Hant, Hanb, Kore, Jamo, Jpan)
     */
    public static Set<String> getExtras() {
        return EXTRAS.values();
    }

    /** Converts a script name or code to its short name via {@link UScript}. */
    public static Transform<String, String> TO_SHORT_SCRIPT = new Transform<String, String>() {
        @Override
        public String transform(String source) {
            return UScript.getShortName(UScript.getCodeFromName(source));
        }
    };

    /** Converts a script name or code to its long name via {@link UScript}. */
    public static Transform<String, String> TO_LONG_SCRIPT = new Transform<String, String>() {
        @Override
        public String transform(String source) {
            return UScript.getName(UScript.getCodeFromName(source));
        }
    };
}