1 /** 2 * 3 */ 4 package org.unicode.cldr.util; 5 6 import java.io.BufferedReader; 7 import java.io.File; 8 import java.io.FilenameFilter; 9 import java.io.IOException; 10 import java.io.Writer; 11 import java.util.ArrayList; 12 import java.util.Arrays; 13 import java.util.Collection; 14 import java.util.Collections; 15 import java.util.Enumeration; 16 import java.util.HashSet; 17 import java.util.Iterator; 18 import java.util.LinkedHashMap; 19 import java.util.LinkedHashSet; 20 import java.util.LinkedList; 21 import java.util.List; 22 import java.util.Locale; 23 import java.util.Map; 24 import java.util.Set; 25 import java.util.TreeMap; 26 import java.util.TreeSet; 27 import java.util.regex.Matcher; 28 import java.util.regex.Pattern; 29 30 import org.unicode.cldr.draft.FileUtilities; 31 import org.unicode.cldr.test.TestTransforms; 32 import org.unicode.cldr.tool.LikelySubtags; 33 34 import com.google.common.collect.BiMap; 35 import com.google.common.collect.HashBiMap; 36 import com.ibm.icu.impl.Relation; 37 import com.ibm.icu.lang.UScript; 38 import com.ibm.icu.text.Transliterator; 39 import com.ibm.icu.text.UnicodeFilter; 40 import com.ibm.icu.util.ICUUncheckedIOException; 41 42 public class CLDRTransforms { 43 44 public static final String TRANSFORM_DIR = (CLDRPaths.COMMON_DIRECTORY + "transforms/"); 45 46 static final CLDRTransforms SINGLETON = new CLDRTransforms(); 47 getInstance()48 public static CLDRTransforms getInstance() { 49 return SINGLETON; 50 } 51 getShowProgress()52 public Appendable getShowProgress() { 53 return showProgress; 54 } 55 setShowProgress(Appendable showProgress)56 public CLDRTransforms setShowProgress(Appendable showProgress) { 57 this.showProgress = showProgress; 58 return this; 59 } 60 61 final Set<String> overridden = new HashSet<>(); 62 final DependencyOrder dependencyOrder = new DependencyOrder(); 63 64 static public class RegexFindFilenameFilter implements FilenameFilter { 65 Matcher matcher; 66 RegexFindFilenameFilter(Matcher filter)67 public RegexFindFilenameFilter(Matcher filter) { 68 matcher = filter; 69 } 70 71 @Override accept(File dir, String name)72 public boolean accept(File dir, String name) { 73 return matcher.reset(name).find(); 74 } 75 } 76 77 /** 78 * 79 * @param dir 80 * TODO 81 * @param namesMatchingRegex 82 * TODO 83 * @param showProgress 84 * null if no progress needed 85 * @param skipDashTIds TODO 86 * @return 87 */ 88 registerCldrTransforms(String dir, String namesMatchingRegex, Appendable showProgress, boolean keepDashTIds)89 public static void registerCldrTransforms(String dir, String namesMatchingRegex, Appendable showProgress, boolean keepDashTIds) { 90 CLDRTransforms r = getInstance(); 91 if (dir == null) { 92 dir = TRANSFORM_DIR; 93 } 94 // reorder to preload some 95 r.showProgress = showProgress; 96 List<String> files; 97 Set<String> ordered; 98 99 if (namesMatchingRegex == null) { 100 files = getAvailableIds(); 101 ordered = r.dependencyOrder.getOrderedItems(files, null, true); 102 } else { 103 Matcher filter = PatternCache.get(namesMatchingRegex).matcher(""); 104 r.deregisterIcuTransliterators(filter); 105 files = Arrays.asList(new File(TRANSFORM_DIR).list(new RegexFindFilenameFilter(filter))); 106 ordered = r.dependencyOrder.getOrderedItems(files, filter, true); 107 } 108 109 // System.out.println(ordered); 110 for (String cldrFileName : ordered) { 111 r.registerTransliteratorsFromXML(dir, cldrFileName, files, keepDashTIds); 112 } 113 Transliterator.registerAny(); // do this last! 114 115 } 116 getAvailableIds()117 public static List<String> getAvailableIds() { 118 return Arrays.asList(new File(TRANSFORM_DIR).list()); 119 } 120 getOverriddenTransliterators()121 public Set<String> getOverriddenTransliterators() { 122 return Collections.unmodifiableSet(overridden); 123 } 124 125 static Transliterator fixup = Transliterator.getInstance("[:Mn:]any-hex/java"); 126 127 class DependencyOrder { 128 // String[] doFirst = {"Latin-ConjoiningJamo"}; 129 // the following are file names, not IDs, so the dependencies have to go both directions 130 // List<String> extras = new ArrayList<String>(); 131 132 Relation<Matcher, String> dependsOn = Relation.of(new LinkedHashMap<Matcher, Set<String>>(), LinkedHashSet.class); 133 { 134 addDependency("Latin-(Jamo|Hangul)(/.*)?", "Latin-ConjoiningJamo", "ConjoiningJamo-Latin"); 135 addDependency("(Jamo|Hangul)-Latin(/.*)?", "Latin-ConjoiningJamo", "ConjoiningJamo-Latin"); 136 addDependency("Latin-Han(/.*)", "Han-Spacedhan"); 137 addDependency(".*(Hiragana|Katakana|Han|han).*", "Fullwidth-Halfwidth", "Halfwidth-Fullwidth"); 138 addDependency(".*(Hiragana).*", "Latin-Katakana", "Katakana-Latin"); 139 140 addInterIndicDependency("Arabic"); 141 addInterIndicDependency("Bengali"); 142 addInterIndicDependency("Devanagari"); 143 addInterIndicDependency("Gujarati"); 144 addInterIndicDependency("Gurmukhi"); 145 addInterIndicDependency("Kannada"); 146 addInterIndicDependency("Malayalam"); 147 addInterIndicDependency("Oriya"); 148 addInterIndicDependency("Tamil"); 149 addInterIndicDependency("Telugu"); 150 addInterIndicDependency("ur"); 151 152 addDependency(".*Digit.*", "NumericPinyin-Pinyin", "Pinyin-NumericPinyin"); 153 addDependency("Latin-NumericPinyin(/.*)?", "Tone-Digit", "Digit-Tone"); 154 addDependency("NumericPinyin-Latin(/.*)?", "Tone-Digit", "Digit-Tone"); 155 addDependency("am-ar", "am-am_FONIPA", "und_FONIPA-ar"); 156 addDependency("am-chr", "am-am_FONIPA", "und_FONIPA-chr"); 157 addDependency("am-fa", "am-am_FONIPA", "und_FONIPA-fa"); 158 addDependency("ch-am", "ch-ch_FONIPA", "am-am_FONIPA"); 159 addDependency("ch-ar", "ch-ch_FONIPA", "und_FONIPA-ar"); 160 addDependency("ch-chr", "ch-ch_FONIPA", "und_FONIPA-chr"); 161 addDependency("ch-fa", "ch-ch_FONIPA", "und_FONIPA-fa"); 162 addDependency("cs-am", "cs-cs_FONIPA", "am-am_FONIPA"); 163 addDependency("cs-ar", "cs-cs_FONIPA", "und_FONIPA-ar"); 164 addDependency("cs-chr", "cs-cs_FONIPA", "und_FONIPA-chr"); 165 addDependency("cs-fa", "cs-cs_FONIPA", "und_FONIPA-fa"); 166 addDependency("cs-ja", "cs-cs_FONIPA", "cs_FONIPA-ja"); 167 addDependency("cs_FONIPA-ko", "Latin-Hangul"); 168 addDependency("cs-ko", "cs-cs_FONIPA", "cs_FONIPA-ko"); 169 addDependency("de-ASCII", "Any-ASCII"); 170 addDependency("eo-am", "eo-eo_FONIPA", "am-am_FONIPA"); 171 addDependency("eo-ar", "eo-eo_FONIPA", "und_FONIPA-ar"); 172 addDependency("eo-chr", "eo-eo_FONIPA", "und_FONIPA-chr"); 173 addDependency("eo-fa", "eo-eo_FONIPA", "und_FONIPA-fa"); 174 addDependency("es-am", "es-es_FONIPA", "am-am_FONIPA"); 175 addDependency("es-ar", "es-es_FONIPA", "und_FONIPA-ar"); 176 addDependency("es-chr", "es-es_FONIPA", "und_FONIPA-chr"); 177 addDependency("es-fa", "es-es_FONIPA", "und_FONIPA-fa"); 178 addDependency("es_419-am", "es-es_FONIPA", "es_FONIPA-es_419_FONIPA", "am-am_FONIPA"); 179 addDependency("es_419-ar", "es-es_FONIPA", "es_FONIPA-es_419_FONIPA", "und_FONIPA-ar"); 180 addDependency("es_419-chr", "es-es_FONIPA", "es_FONIPA-es_419_FONIPA", "und_FONIPA-chr"); 181 addDependency("es_419-fa", "es-es_FONIPA", "es_FONIPA-es_419_FONIPA", "und_FONIPA-fa"); 182 addDependency("es_419-ja", "es-es_FONIPA", "es_FONIPA-es_419_FONIPA", "es_FONIPA-ja"); 183 addDependency("es-am", "es-es_FONIPA", "es_FONIPA-am"); 184 addDependency("es-ja", "es-es_FONIPA", "es_FONIPA-ja"); 185 addDependency("es-zh", "es-es_FONIPA", "es_FONIPA-zh"); 186 187 addDependency("Han-Latin-Names", "Han-Latin"); 188 189 addDependency("hy-am", "hy-hy_FONIPA", "am-am_FONIPA"); 190 addDependency("hy-ar", "hy-hy_FONIPA", "und_FONIPA-ar"); 191 addDependency("hy-chr", "hy-hy_FONIPA", "und_FONIPA-chr"); 192 addDependency("hy-fa", "hy-hy_FONIPA", "und_FONIPA-fa"); 193 addDependency("hy_AREVMDA-am", "hy_AREVMDA-hy_AREVMDA_FONIPA", "am-am_FONIPA"); 194 addDependency("hy_AREVMDA-ar", "hy_AREVMDA-hy_AREVMDA_FONIPA", "und_FONIPA-ar"); 195 addDependency("hy_AREVMDA-chr", "hy_AREVMDA-hy_AREVMDA_FONIPA", "und_FONIPA-chr"); 196 addDependency("hy_AREVMDA-fa", "hy_AREVMDA-hy_AREVMDA_FONIPA", "und_FONIPA-fa"); 197 addDependency("ia-am", "ia-ia_FONIPA", "am-am_FONIPA"); 198 addDependency("ia-ar", "ia-ia_FONIPA", "und_FONIPA-ar"); 199 addDependency("ia-chr", "ia-ia_FONIPA", "und_FONIPA-chr"); 200 addDependency("ia-fa", "ia-ia_FONIPA", "und_FONIPA-fa"); 201 addDependency("kk-am", "kk-kk_FONIPA", "am-am_FONIPA"); 202 addDependency("kk-ar", "kk-kk_FONIPA", "und_FONIPA-ar"); 203 addDependency("kk-chr", "kk-kk_FONIPA", "und_FONIPA-chr"); 204 addDependency("kk-fa", "kk-kk_FONIPA", "und_FONIPA-fa"); 205 addDependency("ky-am", "ky-ky_FONIPA", "am-am_FONIPA"); 206 addDependency("ky-ar", "ky-ky_FONIPA", "und_FONIPA-ar"); 207 addDependency("ky-chr", "ky-ky_FONIPA", "und_FONIPA-chr"); 208 addDependency("ky-fa", "ky-ky_FONIPA", "und_FONIPA-fa"); 209 addDependency("my-am", "my-my_FONIPA", "am-am_FONIPA"); 210 addDependency("my-ar", "my-my_FONIPA", "und_FONIPA-ar"); 211 addDependency("my-chr", "my-my_FONIPA", "und_FONIPA-chr"); 212 addDependency("my-fa", "my-my_FONIPA", "und_FONIPA-fa"); 213 addDependency("pl-am", "pl-pl_FONIPA", "am-am_FONIPA"); 214 addDependency("pl-ar", "pl-pl_FONIPA", "und_FONIPA-ar"); 215 addDependency("pl-chr", "pl-pl_FONIPA", "und_FONIPA-chr"); 216 addDependency("pl-fa", "pl-pl_FONIPA", "und_FONIPA-fa"); 217 addDependency("pl-ja", "pl-pl_FONIPA", "pl_FONIPA-ja"); 218 addDependency("rm_SURSILV-am", "rm_SURSILV-rm_FONIPA_SURSILV", "am-am_FONIPA"); 219 addDependency("rm_SURSILV-ar", "rm_SURSILV-rm_FONIPA_SURSILV", "und_FONIPA-ar"); 220 addDependency("rm_SURSILV-chr", "rm_SURSILV-rm_FONIPA_SURSILV", "und_FONIPA-chr"); 221 addDependency("rm_SURSILV-fa", "rm_SURSILV-rm_FONIPA_SURSILV", "und_FONIPA-fa"); 222 addDependency("ro-am", "ro-ro_FONIPA", "am-am_FONIPA"); 223 addDependency("ro-ar", "ro-ro_FONIPA", "und_FONIPA-ar"); 224 addDependency("ro-chr", "ro-ro_FONIPA", "und_FONIPA-chr"); 225 addDependency("ro-fa", "ro-ro_FONIPA", "und_FONIPA-fa"); 226 addDependency("ro-ja", "ro-ro_FONIPA", "ro_FONIPA-ja"); 227 addDependency("sat-am", "sat_Olck-sat_FONIPA", "am-am_FONIPA"); 228 addDependency("sat-ar", "sat_Olck-sat_FONIPA", "und_FONIPA-ar"); 229 addDependency("sat-chr", "sat_Olck-sat_FONIPA", "und_FONIPA-chr"); 230 addDependency("sat-fa", "sat_Olck-sat_FONIPA", "und_FONIPA-fa"); 231 addDependency("si-am", "si-si_FONIPA", "am-am_FONIPA"); 232 addDependency("si-ar", "si-si_FONIPA", "und_FONIPA-ar"); 233 addDependency("si-chr", "si-si_FONIPA", "und_FONIPA-chr"); 234 addDependency("si-fa", "si-si_FONIPA", "und_FONIPA-fa"); 235 addDependency("sk-am", "sk-sk_FONIPA", "am-am_FONIPA"); 236 addDependency("sk-ar", "sk-sk_FONIPA", "und_FONIPA-ar"); 237 addDependency("sk-chr", "sk-sk_FONIPA", "und_FONIPA-chr"); 238 addDependency("sk-fa", "sk-sk_FONIPA", "und_FONIPA-fa"); 239 addDependency("sk-ja", "sk-sk_FONIPA", "sk_FONIPA-ja"); 240 addDependency("tlh-am", "tlh-tlh_FONIPA", "am-am_FONIPA"); 241 addDependency("tlh-ar", "tlh-tlh_FONIPA", "und_FONIPA-ar"); 242 addDependency("tlh-chr", "tlh-tlh_FONIPA", "und_FONIPA-chr"); 243 addDependency("tlh-fa", "tlh-tlh_FONIPA", "und_FONIPA-fa"); 244 addDependency("xh-am", "xh-xh_FONIPA", "am-am_FONIPA"); 245 addDependency("xh-ar", "xh-xh_FONIPA", "und_FONIPA-ar"); 246 addDependency("xh-chr", "xh-xh_FONIPA", "und_FONIPA-chr"); 247 addDependency("xh-fa", "xh-xh_FONIPA", "und_FONIPA-fa"); 248 addDependency("zu-am", "zu-zu_FONIPA", "am-am_FONIPA"); 249 addDependency("zu-ar", "zu-zu_FONIPA", "und_FONIPA-ar"); 250 addDependency("zu-chr", "zu-zu_FONIPA", "und_FONIPA-chr"); 251 addDependency("zu-fa", "zu-zu_FONIPA", "und_FONIPA-fa"); 252 addDependency("Latin-Bopomofo", "Latin-NumericPinyin"); 253 254 // addExtras("cs-ja", "cs-ja", "es-am", "es-ja", "es-zh", "Han-Latin/Names"); 255 // Pinyin-NumericPinyin.xml 256 } 257 addInterIndicDependency(String script)258 private void addInterIndicDependency(String script) { 259 addPivotDependency(script, "InterIndic"); 260 if (!script.equals("Arabic")) { 261 addDependency(script + "-Arabic", 262 script + "-InterIndic", "InterIndic-Arabic"); 263 } 264 } 265 addPivotDependency(String script, String pivot)266 private void addPivotDependency(String script, String pivot) { 267 addDependency(script + "-.*", "Bengali" + "-" + pivot, pivot + "-" + "Bengali"); 268 addDependency(".*-" + "Bengali" + "(/.*)?", pivot + "-" + "Bengali", pivot + "-" + "Bengali"); 269 } 270 271 // private void addExtras(String... strings) { 272 // for (String item : strings) { 273 // extras.add(item); 274 // } 275 // } 276 addDependency(String pattern, String... whatItDependsOn)277 private void addDependency(String pattern, String... whatItDependsOn) { 278 dependsOn.putAll(PatternCache.get(pattern).matcher(""), Arrays.asList(whatItDependsOn)); 279 } 280 getOrderedItems(Collection<String> rawInput, Matcher filter, boolean hasXmlSuffix)281 public Set<String> getOrderedItems(Collection<String> rawInput, Matcher filter, boolean hasXmlSuffix) { 282 Set<String> input = new LinkedHashSet<>(rawInput); 283 // input.addAll(extras); 284 285 Set<String> ordered = new LinkedHashSet<>(); 286 287 // for (String other : doFirst) { 288 // ordered.add(hasXmlSuffix ? other + ".xml" : other); 289 // } 290 291 for (String cldrFileName : input) { 292 if (hasXmlSuffix && !cldrFileName.endsWith(".xml")) { 293 continue; 294 } 295 296 if (filter != null && !filter.reset(cldrFileName).find()) { 297 append("Skipping " + cldrFileName + "\n"); 298 continue; 299 } 300 // add dependencies first 301 addDependenciesRecursively(cldrFileName, ordered, hasXmlSuffix); 302 } 303 append("Adding: " + ordered + "\n"); 304 return ordered; 305 } 306 addDependenciesRecursively(String cldrFileName, Set<String> ordered, boolean hasXmlSuffix)307 private void addDependenciesRecursively(String cldrFileName, Set<String> ordered, boolean hasXmlSuffix) { 308 String item = hasXmlSuffix && cldrFileName.endsWith(".xml") ? cldrFileName.substring(0, 309 cldrFileName.length() - 4) : cldrFileName; 310 for (Matcher m : dependsOn.keySet()) { 311 if (m.reset(item).matches()) { 312 for (String other : dependsOn.getAll(m)) { 313 final String toAdd = hasXmlSuffix ? other + ".xml" : other; 314 if (other.equals(item) || ordered.contains(toAdd)) { 315 continue; 316 } 317 addDependenciesRecursively(toAdd, ordered, hasXmlSuffix); 318 append("Dependency: Adding: " + toAdd + " before " + item + "\n"); 319 } 320 } 321 } 322 ordered.add(item); 323 } 324 325 } 326 getInstance(String id)327 public Transliterator getInstance(String id) { 328 if (!overridden.contains(id)) { 329 throw new IllegalArgumentException("No overriden transform for " + id); 330 } 331 return Transliterator.getInstance(id); 332 } 333 334 public static Pattern TRANSFORM_ID_PATTERN = PatternCache.get("(.+)-([^/]+)(/(.*))?"); 335 getReverseInstance(String id)336 public Transliterator getReverseInstance(String id) { 337 Matcher matcher = TRANSFORM_ID_PATTERN.matcher(id); 338 if (!matcher.matches()) { 339 throw new IllegalArgumentException("**No transform for " + id); 340 } 341 return getInstance(matcher.group(2) + "-" + matcher.group(1) 342 + (matcher.group(4) == null ? "" : "/" + matcher.group(4))); 343 } 344 345 private BiMap<String,String> displayNameToId = HashBiMap.create(); 346 getDisplayNameToId()347 public BiMap<String, String> getDisplayNameToId() { 348 return displayNameToId; 349 } 350 addDisplayNameToId(Map<String, String> ids2, ParsedTransformID directionInfo)351 private void addDisplayNameToId(Map<String, String> ids2, ParsedTransformID directionInfo) { 352 displayNameToId.put(directionInfo.getDisplayId(), directionInfo.toString()); 353 } 354 registerTransliteratorsFromXML(String dir, String cldrFileName, List<String> cantSkip, boolean keepDashTIds)355 public void registerTransliteratorsFromXML(String dir, String cldrFileName, List<String> cantSkip, boolean keepDashTIds) { 356 ParsedTransformID directionInfo = new ParsedTransformID(); 357 String ruleString; 358 final String cldrFileName2 = cldrFileName + ".xml"; 359 try { 360 ruleString = getIcuRulesFromXmlFile(dir, cldrFileName2, directionInfo); 361 } catch (RuntimeException e) { 362 if (!cantSkip.contains(cldrFileName2)) { 363 return; 364 } 365 throw e; 366 } 367 368 String id = directionInfo.getId(); 369 addDisplayNameToId(displayNameToId, directionInfo); 370 371 if (directionInfo.getDirection() == Direction.both || directionInfo.getDirection() == Direction.forward) { 372 internalRegister(id, ruleString, Transliterator.FORWARD); 373 for (String alias : directionInfo.getAliases()) { 374 if (!keepDashTIds && alias.contains("-t-")) { 375 continue; 376 } 377 Transliterator.registerAlias(alias, id); 378 } 379 } 380 if (directionInfo.getDirection() == Direction.both || directionInfo.getDirection() == Direction.backward) { 381 internalRegister(id, ruleString, Transliterator.REVERSE); 382 for (String alias : directionInfo.getBackwardAliases()) { 383 if (!keepDashTIds && alias.contains("-t-")) { 384 continue; 385 } 386 Transliterator.registerAlias(alias, directionInfo.getBackwardId()); 387 } 388 } 389 } 390 391 /** 392 * Return Icu rules, and the direction info 393 * 394 * @param dir 395 * TODO 396 * @param cldrFileName 397 * @param directionInfo 398 * @return 399 */ getIcuRulesFromXmlFile(String dir, String cldrFileName, ParsedTransformID directionInfo)400 public static String getIcuRulesFromXmlFile(String dir, String cldrFileName, ParsedTransformID directionInfo) { 401 final MyHandler myHandler = new MyHandler(cldrFileName, directionInfo); 402 XMLFileReader xfr = new XMLFileReader().setHandler(myHandler); 403 xfr.read(dir + cldrFileName, XMLFileReader.CONTENT_HANDLER | XMLFileReader.ERROR_HANDLER, true); 404 return myHandler.getRules(); 405 } 406 internalRegister(String id, String ruleString, int direction)407 private void internalRegister(String id, String ruleString, int direction) { 408 if (direction == Transliterator.REVERSE) { 409 id = ParsedTransformID.reverse(id); 410 } 411 internalRegisterNoReverseId(id, ruleString, direction); 412 } 413 internalRegisterNoReverseId(String id, String ruleString, int direction)414 private void internalRegisterNoReverseId(String id, String ruleString, int direction) { 415 try { 416 Transliterator t = Transliterator.createFromRules(id, ruleString, direction); 417 overridden.add(id); 418 Transliterator oldTranslit = null; 419 if (showProgress != null) { 420 try { 421 oldTranslit = Transliterator.getInstance(id); 422 } catch (Exception e) { 423 } 424 } 425 Transliterator.unregister(id); 426 Transliterator.registerInstance(t); 427 // if (false) { // for paranoid testing 428 // Transliterator t1 = Transliterator.createFromRules(id, ruleString, direction); 429 // String r1 = t1.toRules(false); 430 // Transliterator t2 = Transliterator.getInstance(id); 431 // String r2 = t2.toRules(false); 432 // if (!r1.equals(r2)) { 433 // throw (IllegalArgumentException) new IllegalArgumentException("Rules unequal" + ruleString + "$$$\n$$$" + 434 // r2); 435 // } 436 // } 437 // verifyNullFilter("halfwidth-fullwidth"); 438 if (showProgress != null) { 439 append("Registered new Transliterator: " + id 440 + (oldTranslit == null ? "" : "\told:\t" + oldTranslit.getID()) 441 + '\n'); 442 if (id.startsWith("el-")) { 443 TestTransforms.showTransliterator("", t, 999); 444 Transliterator t2 = Transliterator.getInstance(id); 445 TestTransforms.showTransliterator("", t2, 999); 446 } 447 } 448 } catch (RuntimeException e) { 449 if (showProgress != null) { 450 e.printStackTrace(); 451 append("Couldn't register new Transliterator: " + id + "\t" + e.getMessage() + '\n'); 452 } else { 453 throw (IllegalArgumentException) new IllegalArgumentException("Couldn't register new Transliterator: " 454 + id).initCause(e); 455 } 456 } 457 } 458 459 Appendable showProgress; 460 append(String string)461 private void append(String string) { 462 try { 463 if (showProgress == null) { 464 return; 465 } 466 showProgress.append(string); 467 if (showProgress instanceof Writer) { 468 ((Writer) showProgress).flush(); 469 } 470 } catch (IOException e) { 471 throw new ICUUncheckedIOException(e); 472 } 473 } 474 appendln(String s)475 private void appendln(String s) { 476 append(s + "\n"); 477 } 478 479 // =================================== 480 481 @SuppressWarnings("deprecation") registerFromIcuFormatFiles(String directory)482 public void registerFromIcuFormatFiles(String directory) throws IOException { 483 484 deregisterIcuTransliterators((Matcher) null); 485 486 Matcher getId = PatternCache.get("\\s*(\\S*)\\s*\\{\\s*").matcher(""); 487 Matcher getSource = PatternCache.get("\\s*(\\S*)\\s*\\{\\s*\\\"(.*)\\\".*").matcher(""); 488 Matcher translitID = PatternCache.get("([^-]+)-([^/]+)+(?:[/](.+))?").matcher(""); 489 490 Map<String, String> fixedIDs = new TreeMap<>(); 491 Set<String> oddIDs = new TreeSet<>(); 492 493 File dir = new File(directory); 494 // get the list of files to take, and their directions 495 BufferedReader input = FileUtilities.openUTF8Reader(directory, "root.txt"); 496 String id = null; 497 String filename = null; 498 Map<String, String> aliasMap = new LinkedHashMap<>(); 499 500 // deregisterIcuTransliterators(); 501 502 // do first, since others depend on theseregisterFromIcuFile 503 /** 504 * Special aliases. 505 * Tone-Digit { 506 * alias {"Pinyin-NumericPinyin"} 507 * } 508 * Digit-Tone { 509 * alias {"NumericPinyin-Pinyin"} 510 * } 511 */ 512 // registerFromIcuFile("Latin-ConjoiningJamo", directory, null); 513 // registerFromIcuFile("Pinyin-NumericPinyin", directory, null); 514 // Transliterator.registerAlias("Tone-Digit", "Pinyin-NumericPinyin"); 515 // Transliterator.registerAlias("Digit-Tone", "NumericPinyin-Pinyin"); 516 // registerFromIcuFile("Fullwidth-Halfwidth", directory, null); 517 // registerFromIcuFile("Hiragana-Katakana", directory, null); 518 // registerFromIcuFile("Latin-Katakana", directory, null); 519 // registerFromIcuFile("Hiragana-Latin", directory, null); 520 521 while (true) { 522 String line = input.readLine(); 523 if (line == null) break; 524 line = line.trim(); 525 if (line.startsWith("\uFEFF")) { 526 line = line.substring(1); 527 } 528 if (line.startsWith("TransliteratorNamePattern")) break; // done 529 // if (line.indexOf("Ethiopic") >= 0) { 530 // appendln("Skipping Ethiopic"); 531 // continue; 532 // } 533 if (getId.reset(line).matches()) { 534 String temp = getId.group(1); 535 if (!temp.equals("file") && !temp.equals("internal")) id = temp; 536 continue; 537 } 538 if (getSource.reset(line).matches()) { 539 String operation = getSource.group(1); 540 String source = getSource.group(2); 541 if (operation.equals("alias")) { 542 aliasMap.put(id, source); 543 checkIdFix(id, fixedIDs, oddIDs, translitID); 544 id = null; 545 } else if (operation.equals("resource:process(transliterator)")) { 546 filename = source; 547 } else if (operation.equals("direction")) { 548 try { 549 if (id == null || filename == null) { 550 // appendln("skipping: " + line); 551 continue; 552 } 553 if (filename.indexOf("InterIndic") >= 0 && filename.indexOf("Latin") >= 0) { 554 // append("**" + id); 555 } 556 checkIdFix(id, fixedIDs, oddIDs, translitID); 557 558 final int direction = source.equals("FORWARD") ? Transliterator.FORWARD 559 : Transliterator.REVERSE; 560 registerFromIcuFile(id, directory, filename, direction); 561 562 verifyNullFilter("halfwidth-fullwidth"); 563 564 id = null; 565 filename = null; 566 } catch (RuntimeException e) { 567 throw (RuntimeException) new IllegalArgumentException("Failed with " + filename + ", " + source) 568 .initCause(e); 569 } 570 } else { 571 append(dir + "root.txt unhandled line:" + line); 572 } 573 continue; 574 } 575 String trimmed = line.trim(); 576 if (trimmed.equals("")) continue; 577 if (trimmed.equals("}")) continue; 578 if (trimmed.startsWith("//")) continue; 579 throw new IllegalArgumentException("Unhandled:" + line); 580 } 581 582 final Set<String> rawIds = idToRules.keySet(); 583 Set<String> ordered = dependencyOrder.getOrderedItems(rawIds, null, false); 584 ordered.retainAll(rawIds); // since we are in ID space, kick out anything that isn't 585 586 for (String id2 : ordered) { 587 RuleDirection stuff = idToRules.get(id2); 588 internalRegisterNoReverseId(id2, stuff.ruleString, stuff.direction); 589 verifyNullFilter("halfwidth-fullwidth"); // TESTING 590 } 591 592 for (Iterator<String> it = aliasMap.keySet().iterator(); it.hasNext();) { 593 id = it.next(); 594 String source = aliasMap.get(id); 595 Transliterator.unregister(id); 596 Transliterator t = Transliterator.createFromRules(id, "::" + source + ";", Transliterator.FORWARD); 597 Transliterator.registerInstance(t); 598 // verifyNullFilter("halfwidth-fullwidth"); 599 appendln("Registered new Transliterator Alias: " + id); 600 601 } 602 appendln("Fixed IDs"); 603 for (Iterator<String> it = fixedIDs.keySet().iterator(); it.hasNext();) { 604 String id2 = it.next(); 605 appendln("\t" + id2 + "\t" + fixedIDs.get(id2)); 606 } 607 appendln("Odd IDs"); 608 for (Iterator<String> it = oddIDs.iterator(); it.hasNext();) { 609 String id2 = it.next(); 610 appendln("\t" + id2); 611 } 612 Transliterator.registerAny(); // do this last! 613 } 614 615 Map<String, RuleDirection> idToRules = new TreeMap<>(); 616 617 private class RuleDirection { 618 String ruleString; 619 int direction; 620 RuleDirection(String ruleString, int direction)621 public RuleDirection(String ruleString, int direction) { 622 super(); 623 this.ruleString = ruleString; 624 this.direction = direction; 625 } 626 } 627 registerFromIcuFile(String id, String directory, String filename, int direction)628 private void registerFromIcuFile(String id, String directory, String filename, int direction) { 629 if (filename == null) { 630 filename = id.replace("-", "_").replace("/", "_") + ".txt"; 631 } 632 String ruleString = CldrUtility.getText(directory, filename); 633 idToRules.put(id, new RuleDirection(ruleString, direction)); 634 } 635 636 // private void registerFromIcuFile(String id, String dir, String filename) { 637 // registerFromIcuFile(id, dir, filename, Transliterator.FORWARD); 638 // registerFromIcuFile(id, dir, filename, Transliterator.REVERSE); 639 // } 640 checkIdFix(String id, Map<String, String> fixedIDs, Set<String> oddIDs, Matcher translitID)641 public void checkIdFix(String id, Map<String, String> fixedIDs, Set<String> oddIDs, Matcher translitID) { 642 if (fixedIDs.containsKey(id)) return; 643 if (!translitID.reset(id).matches()) { 644 appendln("Can't fix: " + id); 645 fixedIDs.put(id, "?" + id); 646 return; 647 } 648 String source1 = translitID.group(1); 649 String target1 = translitID.group(2); 650 String variant = translitID.group(3); 651 String source = fixID(source1); 652 String target = fixID(target1); 653 if (!source1.equals(source)) { 654 fixedIDs.put(source1, source); 655 } 656 if (!target1.equals(target)) { 657 fixedIDs.put(target1, target); 658 } 659 if (variant != null) { 660 oddIDs.add("variant: " + variant); 661 } 662 } 663 fixID(String source)664 static String fixID(String source) { 665 return source; // for now 666 } 667 deregisterIcuTransliterators(Matcher filter)668 public void deregisterIcuTransliterators(Matcher filter) { 669 // Remove all of the current registrations 670 // first load into array, so we don't get sync problems. 671 List<String> rawAvailable = new ArrayList<>(); 672 for (Enumeration<String> en = Transliterator.getAvailableIDs(); en.hasMoreElements();) { 673 final String id = en.nextElement(); 674 if (filter != null && !filter.reset(id).matches()) { 675 continue; 676 } 677 rawAvailable.add(id); 678 } 679 680 // deregisterIcuTransliterators(rawAvailable); 681 682 Set<String> available = dependencyOrder.getOrderedItems(rawAvailable, filter, false); 683 List<String> reversed = new LinkedList<>(); 684 for (String item : available) { 685 reversed.add(0, item); 686 } 687 // available.retainAll(rawAvailable); // remove the items we won't touch anyway 688 // rawAvailable.removeAll(available); // now the ones whose order doesn't matter 689 // deregisterIcuTransliterators(rawAvailable); 690 deregisterIcuTransliterators(reversed); 691 692 for (Enumeration<String> en = Transliterator.getAvailableIDs(); en.hasMoreElements();) { 693 String oldId = en.nextElement(); 694 append("Retaining: " + oldId + "\n"); 695 } 696 } 697 deregisterIcuTransliterators(Collection<String> available)698 public void deregisterIcuTransliterators(Collection<String> available) { 699 for (String oldId : available) { 700 Transliterator t; 701 try { 702 t = Transliterator.getInstance(oldId); 703 } catch (IllegalArgumentException e) { 704 if (e.getMessage().startsWith("Illegal ID")) { 705 continue; 706 } 707 append("Failure with: " + oldId); 708 t = Transliterator.getInstance(oldId); 709 throw e; 710 } catch (RuntimeException e) { 711 append("Failure with: " + oldId); 712 t = Transliterator.getInstance(oldId); 713 throw e; 714 } 715 String className = t.getClass().getName(); 716 if (className.endsWith(".CompoundTransliterator") 717 || className.endsWith(".RuleBasedTransliterator") 718 || className.endsWith(".AnyTransliterator")) { 719 appendln("REMOVING: " + oldId); 720 Transliterator.unregister(oldId); 721 } else { 722 appendln("Retaining: " + oldId + "\t\t" + className); 723 } 724 } 725 } 726 727 public enum Direction { 728 backward, both, forward 729 } 730 731 public enum Visibility { 732 external, internal 733 } 734 735 public static class ParsedTransformID { 736 public String source = "Any"; 737 public String target = "Any"; 738 public String variant; 739 protected String[] aliases = {}; 740 protected String[] backwardAliases = {}; 741 protected Direction direction = null; 742 protected Visibility visibility; 743 getId()744 public String getId() { 745 return getSource() + "-" + getTarget() + (getVariant() == null ? "" : "/" + getVariant()); 746 } 747 getDisplayId()748 public String getDisplayId() { 749 return getDisplaySource() + "-" + getDisplayTarget() + (getVariant() == null ? "" : "/" + getDisplayVariant()); 750 } 751 getDisplayVariant()752 private String getDisplayVariant() { 753 return getVariant(); 754 } 755 getDisplayTarget()756 private String getDisplayTarget() { 757 return getDisplaySourceOrTarget(getTarget()); 758 } 759 getDisplaySource()760 private String getDisplaySource() { 761 return getDisplaySourceOrTarget(getSource()); 762 } 763 getDisplaySourceOrTarget(String sourceOrTarget)764 private String getDisplaySourceOrTarget(String sourceOrTarget) { 765 int uscript = UScript.getCodeFromName(sourceOrTarget); 766 if (uscript >= 0) { 767 return UScript.getName(uscript); 768 } 769 if (sourceOrTarget.contains("FONIPA")) { 770 return "IPA"; 771 } 772 if (sourceOrTarget.equals("InterIndic")) { 773 return "Indic"; 774 } 775 try { 776 String name = CLDRConfig.getInstance().getEnglish().getName(sourceOrTarget); 777 return name; 778 } catch (Exception e) { 779 return sourceOrTarget; 780 } 781 } 782 783 static final LikelySubtags likely = new LikelySubtags(); 784 getScriptCode(String sourceOrTarget)785 public static String getScriptCode(String sourceOrTarget) { 786 int uscript = UScript.getCodeFromName(sourceOrTarget); 787 if (uscript >= 0) { 788 return UScript.getShortName(uscript); 789 } 790 if (sourceOrTarget.contains("FONIPA")) { 791 return "Ipa0"; 792 } 793 if (sourceOrTarget.equals("InterIndic")) { 794 return "Ind0"; 795 } 796 try { 797 String max = likely.maximize(sourceOrTarget); 798 return max == null ? null : new LanguageTagParser().set(max).getScript(); 799 } catch (Exception e) { 800 return null; 801 } 802 } 803 getBackwardId()804 public String getBackwardId() { 805 return getTarget() + "-" + getSource() + (getVariant() == null ? "" : "/" + getVariant()); 806 } 807 ParsedTransformID()808 public ParsedTransformID() { 809 } 810 set(String source, String target, String variant, Direction direction)811 public ParsedTransformID set(String source, String target, String variant, Direction direction) { 812 this.source = source; 813 this.target = target; 814 this.variant = variant; 815 this.direction = direction; 816 return this; 817 } 818 set(String id)819 public ParsedTransformID set(String id) { 820 variant = null; 821 int pos = id.indexOf('-'); 822 if (pos < 0) { 823 source = "Any"; 824 target = id; 825 return this; 826 } 827 source = id.substring(0, pos); 828 int pos2 = id.indexOf('/', pos); 829 if (pos2 < 0) { 830 target = id.substring(pos + 1); 831 return this; 832 } 833 target = id.substring(pos + 1, pos2); 834 variant = id.substring(pos2 + 1); 835 return this; 836 } 837 reverse()838 public ParsedTransformID reverse() { 839 String temp = source; 840 source = target; 841 target = temp; 842 return this; 843 } 844 getTargetVariant()845 public String getTargetVariant() { 846 return target + (variant == null ? "" : "/" + variant); 847 } 848 getSourceVariant()849 public String getSourceVariant() { 850 return source + (variant == null ? "" : "/" + variant); 851 } 852 setDirection(Direction direction)853 protected void setDirection(Direction direction) { 854 this.direction = direction; 855 } 856 getDirection()857 public Direction getDirection() { 858 return direction; 859 } 860 setVariant(String variant)861 public void setVariant(String variant) { 862 this.variant = variant; 863 } 864 getVariant()865 protected String getVariant() { 866 return variant; 867 } 868 setTarget(String target)869 public void setTarget(String target) { 870 this.target = target; 871 } 872 getTarget()873 public String getTarget() { 874 return target; 875 } 876 setSource(String source)877 public void setSource(String source) { 878 this.source = source; 879 } 880 getSource()881 public String getSource() { 882 return source; 883 } 884 885 @Override toString()886 public String toString() { 887 return source + "-" + getTargetVariant(); 888 } 889 getId(String source, String target, String variant)890 public static String getId(String source, String target, String variant) { 891 String id = source + '-' + target; 892 if (variant != null) id += "/" + variant; 893 return id; 894 } 895 reverse(String id)896 public static String reverse(String id) { 897 return new ParsedTransformID().set(id).getBackwardId(); 898 } 899 setAliases(String[] aliases)900 public void setAliases(String[] aliases) { 901 this.aliases = aliases; 902 } 903 getAliases()904 public String[] getAliases() { 905 return aliases; 906 } 907 setBackwardAliases(String[] backwardAliases)908 public void setBackwardAliases(String[] backwardAliases) { 909 this.backwardAliases = backwardAliases; 910 } 911 getBackwardAliases()912 public String[] getBackwardAliases() { 913 return backwardAliases; 914 } 915 setVisibility(String string)916 protected void setVisibility(String string) { 917 visibility = Visibility.valueOf(string); 918 } 919 getVisibility()920 public Visibility getVisibility() { 921 return visibility; 922 } 923 } 924 925 /** 926 * Verify that if the transliterator exists, it has a null filter 927 * 928 * @param id 929 */ verifyNullFilter(String id)930 public static void verifyNullFilter(String id) { 931 Transliterator widen; 932 try { 933 widen = Transliterator.getInstance(id); 934 } catch (Exception e) { 935 return; 936 } 937 UnicodeFilter filter = widen.getFilter(); 938 if (filter != null) { 939 throw new IllegalArgumentException(id + " has non-empty filter: " + filter); 940 } 941 } 942 943 public static class MyHandler extends XMLFileReader.SimpleHandler { 944 boolean first = true; 945 ParsedTransformID directionInfo; 946 String cldrFileName; 947 StringBuilder rules = new StringBuilder(); 948 getRules()949 public String getRules() { 950 return rules.toString(); 951 } 952 MyHandler(String cldrFileName, ParsedTransformID directionInfo)953 public MyHandler(String cldrFileName, ParsedTransformID directionInfo) { 954 super(); 955 this.cldrFileName = cldrFileName; 956 this.directionInfo = directionInfo; 957 } 958 959 @Override handlePathValue(String path, String value)960 public void handlePathValue(String path, String value) { 961 if (first) { 962 if (path.startsWith("//supplementalData/version")) { 963 return; 964 } else if (path.startsWith("//supplementalData/generation")) { 965 return; 966 } 967 XPathParts parts = XPathParts.getFrozenInstance(path); 968 Map<String, String> attributes = parts.findAttributes("transform"); 969 if (attributes == null) { 970 throw new IllegalArgumentException("Not an XML transform file: " + cldrFileName + "\t" + path); 971 } 972 directionInfo.setSource(attributes.get("source")); 973 directionInfo.setTarget(attributes.get("target")); 974 directionInfo.setVariant(attributes.get("variant")); 975 directionInfo.setDirection(Direction.valueOf(attributes.get("direction").toLowerCase(Locale.ENGLISH))); 976 977 String alias = attributes.get("alias"); 978 if (alias != null) { 979 directionInfo.setAliases(alias.trim().split("\\s+")); 980 } 981 982 String backwardAlias = attributes.get("backwardAlias"); 983 if (backwardAlias != null) { 984 directionInfo.setBackwardAliases(backwardAlias.trim().split("\\s+")); 985 } 986 987 directionInfo.setVisibility(attributes.get("visibility")); 988 first = false; 989 } 990 if (path.indexOf("/comment") >= 0) { 991 // skip 992 } else if (path.indexOf("/tRule") >= 0) { 993 value = fixup.transliterate(value); 994 rules.append(value).append(CldrUtility.LINE_SEPARATOR); 995 } else { 996 throw new IllegalArgumentException("Unknown element: " + path + "\t " + value); 997 } 998 } 999 } 1000 } 1001