/** */
package org.unicode.cldr.util;

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.RuleBasedTransliterator;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeFilter;
import com.ibm.icu.util.ICUUncheckedIOException;
import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.DiscreteComparator.Builder;

/**
 * Reads CLDR transform XML files and registers the transliterators they define with ICU's
 * {@link Transliterator} registry, remembering which IDs were overridden.
 */
public class CLDRTransforms {

    /** Default directory holding the CLDR transform XML files. */
    public static final String TRANSFORM_DIR = (CLDRPaths.COMMON_DIRECTORY + "transforms/");

    /** The single shared instance handed out by {@link #getInstance()}. */
    static final CLDRTransforms SINGLETON = new CLDRTransforms();

    /** When true, each registration is re-read from ICU and compared with what was built. */
    private static final boolean PARANOID = true;

    /**
     * Returns the shared instance.
     *
     * @return the singleton {@code CLDRTransforms}
     */
    public static CLDRTransforms getInstance() {
        return SINGLETON;
    }

    /**
     * Returns the sink that progress messages are written to.
     *
     * @return the current progress sink; null means no progress output
     */
    public Appendable getShowProgress() {
        return showProgress;
    }

    /**
     * Sets the sink that progress messages are written to.
     *
     * @param showProgress the new sink, or null to disable progress output
     * @return this object, to allow call chaining
     */
    public CLDRTransforms setShowProgress(Appendable showProgress) {
        this.showProgress = showProgress;
        return this;
    }

    /** IDs of the transliterators registered (overridden) through this object. */
    final Set<String> overridden = new HashSet<>();
    // final DependencyOrder dependencyOrder = new DependencyOrder();

    // static public class
RegexFindFilenameFilter implements FilenameFilter { 62 // Matcher matcher; 63 // 64 // public RegexFindFilenameFilter(Matcher filter) { 65 // matcher = filter; 66 // } 67 // 68 // @Override 69 // public boolean accept(File dir, String name) { 70 // return matcher.reset(name).find(); 71 // } 72 // } 73 74 /** 75 * @param dir TODO 76 * @param namesMatchingRegex TODO 77 * @param showProgress null if no progress needed 78 * @param skipDashTIds TODO 79 * @return 80 */ registerCldrTransforms( String dir, String namesMatchingRegex, Appendable showProgress, boolean keepDashTIds)81 public static void registerCldrTransforms( 82 String dir, String namesMatchingRegex, Appendable showProgress, boolean keepDashTIds) { 83 CLDRTransforms r = getInstance(); 84 if (dir == null) { 85 dir = TRANSFORM_DIR; 86 } 87 // reorder to preload some 88 r.showProgress = showProgress; 89 Set<String> ordered = getFileRegistrationOrder(dir); 90 91 if (namesMatchingRegex != null) { 92 Matcher filter = PatternCache.get(namesMatchingRegex).matcher(""); 93 ordered = 94 ordered.stream() 95 .filter(x -> filter.reset(x).matches()) 96 .collect(Collectors.toCollection(LinkedHashSet::new)); 97 // r.deregisterIcuTransliterators(filter); 98 // files = Arrays.asList(new File(TRANSFORM_DIR).list(new 99 // RegexFindFilenameFilter(filter))); 100 // ordered = r.dependencyOrder.getOrderedItems(files, filter, true); 101 } 102 103 // System.out.println(ordered); 104 for (String cldrFileName : ordered) { 105 r.registerTransliteratorsFromXML( 106 dir, cldrFileName, Collections.emptySet(), keepDashTIds); 107 } 108 Transliterator.registerAny(); // do this last! 
109 } 110 getAvailableIds()111 public static List<String> getAvailableIds() { 112 return Arrays.asList(new File(TRANSFORM_DIR).list()); 113 } 114 getOverriddenTransliterators()115 public Set<String> getOverriddenTransliterators() { 116 return Collections.unmodifiableSet(overridden); 117 } 118 119 static Transliterator fixup = Transliterator.getInstance("[:Mn:]any-hex/java"); 120 getInstance(String id)121 public Transliterator getInstance(String id) { 122 if (!overridden.contains(id)) { 123 throw new IllegalArgumentException("No overriden transform for " + id); 124 } 125 return Transliterator.getInstance(id); 126 } 127 128 public static Pattern TRANSFORM_ID_PATTERN = PatternCache.get("(.+)-([^/]+)(/(.*))?"); 129 getReverseInstance(String id)130 public Transliterator getReverseInstance(String id) { 131 Matcher matcher = TRANSFORM_ID_PATTERN.matcher(id); 132 if (!matcher.matches()) { 133 throw new IllegalArgumentException("**No transform for " + id); 134 } 135 return getInstance( 136 matcher.group(2) 137 + "-" 138 + matcher.group(1) 139 + (matcher.group(4) == null ? 
"" : "/" + matcher.group(4))); 140 } 141 142 private BiMap<String, String> displayNameToId = HashBiMap.create(); 143 getDisplayNameToId()144 public BiMap<String, String> getDisplayNameToId() { 145 return displayNameToId; 146 } 147 addDisplayNameToId(Map<String, String> ids2, ParsedTransformID directionInfo)148 private void addDisplayNameToId(Map<String, String> ids2, ParsedTransformID directionInfo) { 149 displayNameToId.put(directionInfo.getDisplayId(), directionInfo.toString()); 150 } 151 registerTransliteratorsFromXML( String dir, String cldrFileName, Set<String> cantSkip, boolean keepDashTIds)152 public String registerTransliteratorsFromXML( 153 String dir, String cldrFileName, Set<String> cantSkip, boolean keepDashTIds) { 154 ParsedTransformID directionInfo = new ParsedTransformID(); 155 String ruleString = getIcuRulesFromXmlFile(dir, cldrFileName, directionInfo); 156 157 String id = directionInfo.getId(); 158 addDisplayNameToId(displayNameToId, directionInfo); 159 160 if (directionInfo.getDirection() == Direction.both 161 || directionInfo.getDirection() == Direction.forward) { 162 for (String alias : directionInfo.getAliases()) { 163 if (!keepDashTIds && alias.contains("-t-")) { 164 continue; 165 } 166 Transliterator.unregister(alias); 167 Transliterator.registerAlias(alias, id); 168 } 169 internalRegister(id, ruleString, Transliterator.FORWARD); 170 } 171 if (directionInfo.getDirection() == Direction.both 172 || directionInfo.getDirection() == Direction.backward) { 173 for (String alias : directionInfo.getBackwardAliases()) { 174 if (!keepDashTIds && alias.contains("-t-")) { 175 continue; 176 } 177 Transliterator.unregister(alias); 178 Transliterator.registerAlias(alias, directionInfo.getBackwardId()); 179 } 180 internalRegister(id, ruleString, Transliterator.REVERSE); 181 } 182 return id; 183 } 184 185 /** 186 * Return Icu rules, and the direction info 187 * 188 * @param dir TODO 189 * @param cldrFileName 190 * @param directionInfo 191 * @return 192 */ 
getIcuRulesFromXmlFile( String dir, String cldrFileName, ParsedTransformID directionInfo)193 public static String getIcuRulesFromXmlFile( 194 String dir, String cldrFileName, ParsedTransformID directionInfo) { 195 final MyHandler myHandler = new MyHandler(cldrFileName, directionInfo); 196 XMLFileReader xfr = new XMLFileReader().setHandler(myHandler); 197 xfr.read( 198 dir + cldrFileName, 199 XMLFileReader.CONTENT_HANDLER | XMLFileReader.ERROR_HANDLER, 200 true); 201 return myHandler.getRules(); 202 } 203 internalRegister(String id, String ruleString, int direction)204 private void internalRegister(String id, String ruleString, int direction) { 205 if (direction == Transliterator.REVERSE) { 206 id = ParsedTransformID.reverse(id); 207 } 208 internalRegisterNoReverseId(id, ruleString, direction); 209 } 210 internalRegisterNoReverseId(String id, String ruleString, int direction)211 private void internalRegisterNoReverseId(String id, String ruleString, int direction) { 212 try { 213 Transliterator t = Transliterator.createFromRules(id, ruleString, direction); 214 overridden.add(id); 215 Transliterator oldTranslit = null; 216 if (showProgress != null) { 217 try { 218 oldTranslit = Transliterator.getInstance(id); 219 } catch (Exception e) { 220 } 221 } 222 Transliterator.unregister(id); 223 Transliterator.registerInstance(t); 224 225 if (PARANOID) { // for paranoid testing 226 String r1 = 227 CLDRTransforms.showTransliterator("", t, 9999, new StringBuilder()) 228 .toString(); 229 Transliterator t2 = Transliterator.getInstance(id); 230 String r2 = 231 CLDRTransforms.showTransliterator("", t2, 9999, new StringBuilder()) 232 .toString(); 233 if (!r1.equals(r2)) { 234 throw new IllegalArgumentException( 235 "Rules unequal\n" + ruleString + "$$$\n$$$" + r2); 236 } 237 } 238 // verifyNullFilter("halfwidth-fullwidth"); 239 if (showProgress != null) { 240 append( 241 "Registered new Transliterator: " 242 + id 243 + (oldTranslit == null ? 
"" : "\told:\t" + oldTranslit.getID()) 244 + '\n'); 245 if (id.startsWith("el-")) { 246 CLDRTransforms.showTransliterator("", t, 999); 247 Transliterator t2 = Transliterator.getInstance(id); 248 CLDRTransforms.showTransliterator("", t2, 999); 249 } 250 } 251 } catch (RuntimeException e) { 252 if (showProgress != null) { 253 e.printStackTrace(); 254 append( 255 "Couldn't register new Transliterator: " 256 + id 257 + "\t" 258 + e.getMessage() 259 + '\n'); 260 } else { 261 throw (IllegalArgumentException) 262 new IllegalArgumentException("Couldn't register new Transliterator: " + id) 263 .initCause(e); 264 } 265 } 266 } 267 268 Appendable showProgress; 269 append(String string)270 private void append(String string) { 271 try { 272 if (showProgress == null) { 273 return; 274 } 275 showProgress.append(string); 276 if (showProgress instanceof Writer) { 277 ((Writer) showProgress).flush(); 278 } 279 } catch (IOException e) { 280 throw new ICUUncheckedIOException(e); 281 } 282 } 283 appendln(String s)284 private void appendln(String s) { 285 append(s + "\n"); 286 } 287 288 // =================================== 289 290 // @SuppressWarnings("deprecation") 291 // public void registerFromIcuFormatFiles(String directory) throws IOException { 292 // 293 //// deregisterIcuTransliterators((Matcher) null); 294 // 295 // Matcher getId = PatternCache.get("\\s*(\\S*)\\s*\\{\\s*").matcher(""); 296 // Matcher getSource = 297 // PatternCache.get("\\s*(\\S*)\\s*\\{\\s*\\\"(.*)\\\".*").matcher(""); 298 // Matcher translitID = PatternCache.get("([^-]+)-([^/]+)+(?:[/](.+))?").matcher(""); 299 // 300 // Map<String, String> fixedIDs = new TreeMap<>(); 301 // Set<String> oddIDs = new TreeSet<>(); 302 // 303 // File dir = new File(directory); 304 // // get the list of files to take, and their directions 305 // BufferedReader input = FileUtilities.openUTF8Reader(directory, "root.txt"); 306 // String id = null; 307 // String filename = null; 308 // Map<String, String> aliasMap = new 
LinkedHashMap<>(); 309 // 310 // // deregisterIcuTransliterators(); 311 // 312 // // do first, since others depend on theseregisterFromIcuFile 313 // /** 314 // * Special aliases. 315 // * Tone-Digit { 316 // * alias {"Pinyin-NumericPinyin"} 317 // * } 318 // * Digit-Tone { 319 // * alias {"NumericPinyin-Pinyin"} 320 // * } 321 // */ 322 // // registerFromIcuFile("Latin-ConjoiningJamo", directory, null); 323 // // registerFromIcuFile("Pinyin-NumericPinyin", directory, null); 324 // // Transliterator.registerAlias("Tone-Digit", "Pinyin-NumericPinyin"); 325 // // Transliterator.registerAlias("Digit-Tone", "NumericPinyin-Pinyin"); 326 // // registerFromIcuFile("Fullwidth-Halfwidth", directory, null); 327 // // registerFromIcuFile("Hiragana-Katakana", directory, null); 328 // // registerFromIcuFile("Latin-Katakana", directory, null); 329 // // registerFromIcuFile("Hiragana-Latin", directory, null); 330 // 331 // while (true) { 332 // String line = input.readLine(); 333 // if (line == null) break; 334 // line = line.trim(); 335 // if (line.startsWith("\uFEFF")) { 336 // line = line.substring(1); 337 // } 338 // if (line.startsWith("TransliteratorNamePattern")) break; // done 339 // // if (line.indexOf("Ethiopic") >= 0) { 340 // // appendln("Skipping Ethiopic"); 341 // // continue; 342 // // } 343 // if (getId.reset(line).matches()) { 344 // String temp = getId.group(1); 345 // if (!temp.equals("file") && !temp.equals("internal")) id = temp; 346 // continue; 347 // } 348 // if (getSource.reset(line).matches()) { 349 // String operation = getSource.group(1); 350 // String source = getSource.group(2); 351 // if (operation.equals("alias")) { 352 // aliasMap.put(id, source); 353 // checkIdFix(id, fixedIDs, oddIDs, translitID); 354 // id = null; 355 // } else if (operation.equals("resource:process(transliterator)")) { 356 // filename = source; 357 // } else if (operation.equals("direction")) { 358 // try { 359 // if (id == null || filename == null) { 360 // // 
appendln("skipping: " + line); 361 // continue; 362 // } 363 // if (filename.indexOf("InterIndic") >= 0 && filename.indexOf("Latin") 364 // >= 0) { 365 // // append("**" + id); 366 // } 367 // checkIdFix(id, fixedIDs, oddIDs, translitID); 368 // 369 // final int direction = source.equals("FORWARD") ? 370 // Transliterator.FORWARD 371 // : Transliterator.REVERSE; 372 // registerFromIcuFile(id, directory, filename, direction); 373 // 374 // verifyNullFilter("halfwidth-fullwidth"); 375 // 376 // id = null; 377 // filename = null; 378 // } catch (RuntimeException e) { 379 // throw (RuntimeException) new IllegalArgumentException("Failed with " + 380 // filename + ", " + source) 381 // .initCause(e); 382 // } 383 // } else { 384 // append(dir + "root.txt unhandled line:" + line); 385 // } 386 // continue; 387 // } 388 // String trimmed = line.trim(); 389 // if (trimmed.equals("")) continue; 390 // if (trimmed.equals("}")) continue; 391 // if (trimmed.startsWith("//")) continue; 392 // throw new IllegalArgumentException("Unhandled:" + line); 393 // } 394 // 395 // final Set<String> rawIds = idToRules.keySet(); 396 // Set<String> ordered = dependencyOrder.getOrderedItems(rawIds, null, false); 397 // ordered.retainAll(rawIds); // since we are in ID space, kick out anything that isn't 398 // 399 // for (String id2 : ordered) { 400 // RuleDirection stuff = idToRules.get(id2); 401 // internalRegisterNoReverseId(id2, stuff.ruleString, stuff.direction); 402 // verifyNullFilter("halfwidth-fullwidth"); // TESTING 403 // } 404 // 405 // for (Iterator<String> it = aliasMap.keySet().iterator(); it.hasNext();) { 406 // id = it.next(); 407 // String source = aliasMap.get(id); 408 // Transliterator.unregister(id); 409 // Transliterator t = Transliterator.createFromRules(id, "::" + source + ";", 410 // Transliterator.FORWARD); 411 // Transliterator.registerInstance(t); 412 // // verifyNullFilter("halfwidth-fullwidth"); 413 // appendln("Registered new Transliterator Alias: " + id); 414 // 
415 // } 416 // appendln("Fixed IDs"); 417 // for (Iterator<String> it = fixedIDs.keySet().iterator(); it.hasNext();) { 418 // String id2 = it.next(); 419 // appendln("\t" + id2 + "\t" + fixedIDs.get(id2)); 420 // } 421 // appendln("Odd IDs"); 422 // for (Iterator<String> it = oddIDs.iterator(); it.hasNext();) { 423 // String id2 = it.next(); 424 // appendln("\t" + id2); 425 // } 426 // Transliterator.registerAny(); // do this last! 427 // } 428 429 Map<String, RuleDirection> idToRules = new TreeMap<>(); 430 431 private class RuleDirection { 432 String ruleString; 433 int direction; 434 RuleDirection(String ruleString, int direction)435 public RuleDirection(String ruleString, int direction) { 436 super(); 437 this.ruleString = ruleString; 438 this.direction = direction; 439 } 440 } 441 registerFromIcuFile(String id, String directory, String filename, int direction)442 private void registerFromIcuFile(String id, String directory, String filename, int direction) { 443 if (filename == null) { 444 filename = id.replace("-", "_").replace("/", "_") + ".txt"; 445 } 446 String ruleString = CldrUtility.getText(directory, filename); 447 idToRules.put(id, new RuleDirection(ruleString, direction)); 448 } 449 450 // private void registerFromIcuFile(String id, String dir, String filename) { 451 // registerFromIcuFile(id, dir, filename, Transliterator.FORWARD); 452 // registerFromIcuFile(id, dir, filename, Transliterator.REVERSE); 453 // } 454 checkIdFix( String id, Map<String, String> fixedIDs, Set<String> oddIDs, Matcher translitID)455 public void checkIdFix( 456 String id, Map<String, String> fixedIDs, Set<String> oddIDs, Matcher translitID) { 457 if (fixedIDs.containsKey(id)) return; 458 if (!translitID.reset(id).matches()) { 459 appendln("Can't fix: " + id); 460 fixedIDs.put(id, "?" 
+ id); 461 return; 462 } 463 String source1 = translitID.group(1); 464 String target1 = translitID.group(2); 465 String variant = translitID.group(3); 466 String source = fixID(source1); 467 String target = fixID(target1); 468 if (!source1.equals(source)) { 469 fixedIDs.put(source1, source); 470 } 471 if (!target1.equals(target)) { 472 fixedIDs.put(target1, target); 473 } 474 if (variant != null) { 475 oddIDs.add("variant: " + variant); 476 } 477 } 478 fixID(String source)479 static String fixID(String source) { 480 return source; // for now 481 } 482 483 // public void deregisterIcuTransliterators(Matcher filter) { 484 // // Remove all of the current registrations 485 // // first load into array, so we don't get sync problems. 486 // List<String> rawAvailable = new ArrayList<>(); 487 // for (Enumeration<String> en = Transliterator.getAvailableIDs(); en.hasMoreElements();) 488 // { 489 // final String id = en.nextElement(); 490 // if (filter != null && !filter.reset(id).matches()) { 491 // continue; 492 // } 493 // rawAvailable.add(id); 494 // } 495 // 496 // // deregisterIcuTransliterators(rawAvailable); 497 // 498 // Set<String> available = dependencyOrder.getOrderedItems(rawAvailable, filter, false); 499 // List<String> reversed = new LinkedList<>(); 500 // for (String item : available) { 501 // reversed.add(0, item); 502 // } 503 // // available.retainAll(rawAvailable); // remove the items we won't touch anyway 504 // // rawAvailable.removeAll(available); // now the ones whose order doesn't matter 505 // // deregisterIcuTransliterators(rawAvailable); 506 // deregisterIcuTransliterators(reversed); 507 // 508 // for (Enumeration<String> en = Transliterator.getAvailableIDs(); en.hasMoreElements();) 509 // { 510 // String oldId = en.nextElement(); 511 // append("Retaining: " + oldId + "\n"); 512 // } 513 // } 514 // 515 // public void deregisterIcuTransliterators(Collection<String> available) { 516 // for (String oldId : available) { 517 // Transliterator t; 518 // 
try { 519 // t = Transliterator.getInstance(oldId); 520 // } catch (IllegalArgumentException e) { 521 // if (e.getMessage().startsWith("Illegal ID")) { 522 // continue; 523 // } 524 // append("Failure with: " + oldId); 525 // t = Transliterator.getInstance(oldId); 526 // throw e; 527 // } catch (RuntimeException e) { 528 // append("Failure with: " + oldId); 529 // t = Transliterator.getInstance(oldId); 530 // throw e; 531 // } 532 // String className = t.getClass().getName(); 533 // if (className.endsWith(".CompoundTransliterator") 534 // || className.endsWith(".RuleBasedTransliterator") 535 // || className.endsWith(".AnyTransliterator")) { 536 // appendln("REMOVING: " + oldId); 537 // Transliterator.unregister(oldId); 538 // } else { 539 // appendln("Retaining: " + oldId + "\t\t" + className); 540 // } 541 // } 542 // } 543 544 public enum Direction { 545 backward, 546 both, 547 forward 548 } 549 550 public enum Visibility { 551 external, 552 internal 553 } 554 555 public static class ParsedTransformID { 556 public String source = "Any"; 557 public String target = "Any"; 558 public String variant; 559 protected String[] aliases = {}; 560 protected String[] backwardAliases = {}; 561 protected Direction direction = null; 562 protected Visibility visibility; 563 getId()564 public String getId() { 565 return getSource() 566 + "-" 567 + getTarget() 568 + (getVariant() == null ? "" : "/" + getVariant()); 569 } 570 getDisplayId()571 public String getDisplayId() { 572 return getDisplaySource() 573 + "-" 574 + getDisplayTarget() 575 + (getVariant() == null ? 
"" : "/" + getDisplayVariant()); 576 } 577 getDisplayVariant()578 private String getDisplayVariant() { 579 return getVariant(); 580 } 581 getDisplayTarget()582 private String getDisplayTarget() { 583 return getDisplaySourceOrTarget(getTarget()); 584 } 585 getDisplaySource()586 private String getDisplaySource() { 587 return getDisplaySourceOrTarget(getSource()); 588 } 589 getDisplaySourceOrTarget(String sourceOrTarget)590 private String getDisplaySourceOrTarget(String sourceOrTarget) { 591 int uscript = UScript.getCodeFromName(sourceOrTarget); 592 if (uscript >= 0) { 593 return UScript.getName(uscript); 594 } 595 if (sourceOrTarget.contains("FONIPA")) { 596 return "IPA"; 597 } 598 if (sourceOrTarget.equals("InterIndic")) { 599 return "Indic"; 600 } 601 try { 602 String name = CLDRConfig.getInstance().getEnglish().getName(sourceOrTarget); 603 return name; 604 } catch (Exception e) { 605 return sourceOrTarget; 606 } 607 } 608 609 static final LikelySubtags likely = new LikelySubtags(); 610 getScriptCode(String sourceOrTarget)611 public static String getScriptCode(String sourceOrTarget) { 612 int uscript = UScript.getCodeFromName(sourceOrTarget); 613 if (uscript >= 0) { 614 return UScript.getShortName(uscript); 615 } 616 if (sourceOrTarget.contains("FONIPA")) { 617 return "Ipa0"; 618 } 619 if (sourceOrTarget.equals("InterIndic")) { 620 return "Ind0"; 621 } 622 try { 623 String max = likely.maximize(sourceOrTarget); 624 return max == null ? null : new LanguageTagParser().set(max).getScript(); 625 } catch (Exception e) { 626 return null; 627 } 628 } 629 getBackwardId()630 public String getBackwardId() { 631 return getTarget() 632 + "-" 633 + getSource() 634 + (getVariant() == null ? 
"" : "/" + getVariant()); 635 } 636 ParsedTransformID()637 public ParsedTransformID() {} 638 set( String source, String target, String variant, Direction direction)639 public ParsedTransformID set( 640 String source, String target, String variant, Direction direction) { 641 this.source = source; 642 this.target = target; 643 this.variant = variant; 644 this.direction = direction; 645 return this; 646 } 647 set(String id)648 public ParsedTransformID set(String id) { 649 variant = null; 650 int pos = id.indexOf('-'); 651 if (pos < 0) { 652 source = "Any"; 653 target = id; 654 return this; 655 } 656 source = id.substring(0, pos); 657 int pos2 = id.indexOf('/', pos); 658 if (pos2 < 0) { 659 target = id.substring(pos + 1); 660 return this; 661 } 662 target = id.substring(pos + 1, pos2); 663 variant = id.substring(pos2 + 1); 664 return this; 665 } 666 reverse()667 public ParsedTransformID reverse() { 668 String temp = source; 669 source = target; 670 target = temp; 671 return this; 672 } 673 getTargetVariant()674 public String getTargetVariant() { 675 return target + (variant == null ? "" : "/" + variant); 676 } 677 getSourceVariant()678 public String getSourceVariant() { 679 return source + (variant == null ? 
"" : "/" + variant); 680 } 681 setDirection(Direction direction)682 protected void setDirection(Direction direction) { 683 this.direction = direction; 684 } 685 getDirection()686 public Direction getDirection() { 687 return direction; 688 } 689 setVariant(String variant)690 public void setVariant(String variant) { 691 this.variant = variant; 692 } 693 getVariant()694 protected String getVariant() { 695 return variant; 696 } 697 setTarget(String target)698 public void setTarget(String target) { 699 this.target = target; 700 } 701 getTarget()702 public String getTarget() { 703 return target; 704 } 705 setSource(String source)706 public void setSource(String source) { 707 this.source = source; 708 } 709 getSource()710 public String getSource() { 711 return source; 712 } 713 714 @Override toString()715 public String toString() { 716 return source + "-" + getTargetVariant(); 717 } 718 getId(String source, String target, String variant)719 public static String getId(String source, String target, String variant) { 720 String id = source + '-' + target; 721 if (variant != null) id += "/" + variant; 722 return id; 723 } 724 reverse(String id)725 public static String reverse(String id) { 726 return new ParsedTransformID().set(id).getBackwardId(); 727 } 728 setAliases(String[] aliases)729 public void setAliases(String[] aliases) { 730 this.aliases = aliases; 731 } 732 getAliases()733 public String[] getAliases() { 734 return aliases; 735 } 736 setBackwardAliases(String[] backwardAliases)737 public void setBackwardAliases(String[] backwardAliases) { 738 this.backwardAliases = backwardAliases; 739 } 740 getBackwardAliases()741 public String[] getBackwardAliases() { 742 return backwardAliases; 743 } 744 setVisibility(String string)745 protected void setVisibility(String string) { 746 visibility = Visibility.valueOf(string); 747 } 748 getVisibility()749 public Visibility getVisibility() { 750 return visibility; 751 } 752 } 753 754 /** 755 * Verify that if the transliterator 
* exists, it has a null filter
     *
     * @param id ID of the transliterator to check; nothing is checked when ICU cannot
     *     instantiate it
     * @throws IllegalArgumentException if the transliterator exists and has a filter
     */
    public static void verifyNullFilter(String id) {
        Transliterator widen;
        try {
            widen = Transliterator.getInstance(id);
        } catch (Exception e) {
            return;
        }
        UnicodeFilter filter = widen.getFilter();
        if (filter != null) {
            throw new IllegalArgumentException(id + " has non-empty filter: " + filter);
        }
    }

    /**
     * XML handler for a transform file: fills a {@link ParsedTransformID} from the attributes of
     * the transform element and accumulates the text of the tRule elements.
     */
    public static class MyHandler extends XMLFileReader.SimpleHandler {
        // True until the transform element's attributes have been read.
        boolean first = true;
        ParsedTransformID directionInfo;
        String cldrFileName;
        StringBuilder rules = new StringBuilder();

        /**
         * Returns the rules collected so far.
         *
         * @return the accumulated rule text
         */
        public String getRules() {
            return rules.toString();
        }

        /**
         * @param cldrFileName name of the file being read; used in error messages
         * @param directionInfo object that receives the parsed attributes
         */
        public MyHandler(String cldrFileName, ParsedTransformID directionInfo) {
            super();
            this.cldrFileName = cldrFileName;
            this.directionInfo = directionInfo;
        }

        /**
         * Handles one path/value pair. Version and generation paths seen before the first
         * transform path are ignored; the first other path supplies the transform attributes.
         * Comment paths are skipped and tRule values are appended to the rules.
         *
         * @throws IllegalArgumentException if the first handled path has no transform element,
         *     or a path is neither a comment nor a tRule
         */
        @Override
        public void handlePathValue(String path, String value) {
            if (first) {
                if (path.startsWith("//supplementalData/version")) {
                    return;
                } else if (path.startsWith("//supplementalData/generation")) {
                    return;
                }
                XPathParts parts = XPathParts.getFrozenInstance(path);
                Map<String, String> attributes = parts.findAttributes("transform");
                if (attributes == null) {
                    throw new IllegalArgumentException(
                            "Not an XML transform file: " + cldrFileName + "\t" + path);
                }
                directionInfo.setSource(attributes.get("source"));
                directionInfo.setTarget(attributes.get("target"));
                directionInfo.setVariant(attributes.get("variant"));
                // NOTE(review): direction and visibility are dereferenced without a null check;
                // presumably the parser always supplies them — confirm.
                directionInfo.setDirection(
                        Direction.valueOf(attributes.get("direction").toLowerCase(Locale.ENGLISH)));

                // Alias attributes hold whitespace-separated lists of IDs.
                String alias = attributes.get("alias");
                if (alias != null) {
                    directionInfo.setAliases(alias.trim().split("\\s+"));
                }

                String backwardAlias = attributes.get("backwardAlias");
                if (backwardAlias != null) {
                    directionInfo.setBackwardAliases(backwardAlias.trim().split("\\s+"));
                }

                directionInfo.setVisibility(attributes.get("visibility"));
                first = false;
            }
            if (path.indexOf("/comment") >= 0) {
                // skip
            } else if (path.indexOf("/tRule") >= 0) {
                // Pass the rule text through the class-level fixup transliterator first.
                value = fixup.transliterate(value);
                rules.append(value).append(CldrUtility.LINE_SEPARATOR);
            } else {
                throw new IllegalArgumentException("Unknown element: " + path + "\t " + value);
            }
        }
    }

    // Set once registerModified() has completed; guarded by the CLDRTransforms.class lock.
    static boolean ALREADY_REGISTERED = false;
    /**
     * Register just those transliterators that are different than ICU. TODO: check against the file
     * system to make sure the list is accurate.
     *
     * <p>Runs at most once per JVM: later calls return immediately. Each registration is checked
     * against the sample source/target pairs listed with it.
     */
    public void registerModified() {
        synchronized (CLDRTransforms.class) {
            if (ALREADY_REGISTERED) {
                return;
            }
            // NEW
            registerTranslit("Lao-Latin", "ບ", "b");
            registerTranslit("Khmer-Latin", "ឥ", "ĕ");
            registerTranslit("Sinhala-Latin", "ක", "ka");
            registerTranslit("Japn-Latn", "譆", "aa");

            // MODIFIED
            registerTranslit("Han-SpacedHan", "《", "«");
            registerTranslit("Greek-Latin", "΄", "´");
            registerTranslit("Hebrew-Latin", "־", "-");
            registerTranslit("Cyrillic-Latin", "ө", "ö");
            registerTranslit("Myanmar-Latin", "ဿ", "s");
            registerTranslit("Latin-Armenian", "’", "՚");

            registerTranslit("Interindic-Latin", "\uE070", ".", "\uE03C", "\u0323", "\uE04D", "");

            // NOTE(review): the empty-looking literals in the next two calls may contain
            // invisible private-use characters in the real file — verify before applying.
            registerTranslit("Malayalam-Interindic", "ൺ", "");
            registerTranslit("Interindic-Malayalam", "", "ണ്");
            registerTranslit("Malayalam-Latin", "ൺ", "ṇ");

            registerTranslit("Devanagari-Interindic", "ॲ", "\uE084");
            registerTranslit("Devanagari-Latin", "ॲ", "æ");

            registerTranslit("Arabic-Latin", "؉", "‰");
            // Only reached when every registration above succeeded.
            ALREADY_REGISTERED = true;
        }
    }

    // Empty set passed as the cantSkip argument by registerTranslit.
    private static final ImmutableSet<String> noSkip = ImmutableSet.of();

    private static
final boolean SHOW = false; 873 private static final boolean SHOW_FAILED_MATCHES = false; 874 875 /** Register a transliterator and verify that a sample changed value is accurate */ registerTranslit(String ID, String... sourcePairs)876 public void registerTranslit(String ID, String... sourcePairs) { 877 String internalId = registerTransliteratorsFromXML(TRANSFORM_DIR, ID, noSkip, true); 878 Transliterator.registerAny(); // do this last! 879 Transliterator t = null; 880 try { 881 t = Transliterator.getInstance(internalId); 882 } catch (Exception e) { 883 System.out.println("For " + ID + " (" + internalId + ")"); 884 e.printStackTrace(); 885 return; 886 } 887 testSourceTarget(t, sourcePairs); 888 } 889 showTransliterator(String prefix, Transliterator t, int limit)890 public static void showTransliterator(String prefix, Transliterator t, int limit) { 891 showTransliterator(prefix, t, limit, System.out); 892 System.out.flush(); 893 } 894 showTransliterator( String prefix, Transliterator t, int limit, T output)895 public static <T extends Appendable> T showTransliterator( 896 String prefix, Transliterator t, int limit, T output) { 897 if (!prefix.isEmpty()) { 898 prefix += " "; 899 } 900 try { 901 output.append(prefix + "ID:\t" + t.getID() + "\n"); 902 output.append(prefix + "Class:\t" + t.getClass().getName() + "\n"); 903 if (t.getFilter() != null) { 904 output.append(prefix + "Filter:\t" + t.getFilter().toPattern(false) + "\n"); 905 } 906 if (t instanceof RuleBasedTransliterator) { 907 RuleBasedTransliterator rbt = (RuleBasedTransliterator) t; 908 String[] rules = rbt.toRules(true).split("\n"); 909 int length = rules.length; 910 if (limit >= 0 && limit < length) length = limit; 911 output.append(prefix + "Rules:\n"); 912 prefix += "\t"; 913 for (int i = 0; i < length; ++i) { 914 output.append(prefix + rules[i] + "\n"); 915 } 916 } else { 917 Transliterator[] elements = t.getElements(); 918 if (elements[0] == t) { 919 output.append(prefix + "\tNonRuleBased\n"); 920 
return output; 921 } else { 922 prefix += "\t"; 923 for (int i = 0; i < elements.length; ++i) { 924 showTransliterator(prefix, elements[i], limit, output); 925 } 926 } 927 } 928 } catch (IOException e) { 929 throw new ICUUncheckedIOException(e); 930 } 931 return output; 932 } 933 testSourceTarget(Transliterator t, String... sourcePairs)934 public static void testSourceTarget(Transliterator t, String... sourcePairs) { 935 for (int i = 0; i < sourcePairs.length; i += 2) { 936 String sourceTest = sourcePairs[i]; 937 String targetTest = sourcePairs[i + 1]; 938 String target = t.transform(sourceTest); 939 if (!target.equals(targetTest)) { 940 throw new IllegalArgumentException( 941 t.getID() 942 + " For " 943 + sourceTest 944 + ", expected " 945 + targetTest 946 + ", got " 947 + target); 948 } 949 } 950 } 951 952 /** 953 * Gets a transform from a script to Latin. for testing For a locale, use 954 * ExemplarUtilities.getScript(locale) to get the script 955 */ getTestingLatinScriptTransform(final String script)956 public static Transliterator getTestingLatinScriptTransform(final String script) { 957 String id; 958 959 switch (script) { 960 case "Latn": 961 return null; 962 case "Khmr": 963 id = "Khmr-Latn/UNGEGN"; 964 break; 965 case "Laoo": 966 id = "Laoo-Latn/UNGEGN"; 967 break; 968 case "Sinh": 969 id = "Sinh-Latn/UNGEGN"; 970 break; 971 case "Japn": 972 id = "Jpan-Latn"; 973 break; 974 case "Kore": 975 id = "Hangul-Latn"; 976 break; 977 case "Hant": 978 case "Hans": 979 id = "Han-Latn"; 980 break; 981 case "Olck": 982 id = "sat_Olck-sat_FONIPA"; // Latin IPA 983 break; 984 case "Cher": 985 id = "chr-chr_FONIPA"; 986 break; 987 default: 988 id = script + "-Latn"; 989 } 990 return Transliterator.getInstance(id); 991 } 992 993 /** 994 * Returns the set of all files that can be registered, in an order that makes sure that all 995 * dependencies are handled. That is, if X uses Y in its rules, then Y has to come before X. 
996 * 997 * <p>The problem is that when you build a transliterator from rules, and one of those rules is 998 * to call another transliterator X, it inserts the <b>currently</b> registered transliterator 999 * into the transliterator being built. So whenever a transliterator X is changed, you have to 1000 * reregister every transliterator that calls X. Otherwise the old version of X sticks around in 1001 * those calling transliterators. So the order that you register transliterators is important! 1002 */ getFileRegistrationOrder(String dir)1003 public static Set<String> getFileRegistrationOrder(String dir) { 1004 if (dir == null) { 1005 dir = TRANSFORM_DIR; 1006 } 1007 List<String> files = getAvailableIds(); 1008 Multimap<String, String> fileToAliases = HashMultimap.create(); 1009 Multimap<String, String> fileToDependencies = TreeMultimap.create(); 1010 for (String file : files) { 1011 // Very simple test that depends on standard format 1012 // eg 1013 // ::[॑ ॒ ॔ ॓ ़ ँ-ः । ॥ ॰ ०-९ ॐ ॲ ऄ-ऋ ॠ ऌ ॡ ऍ-कक़ खख़ गग़ घ-जज़ झ-डड़ ढढ़ ण-फफ़ ब-यय़ 1014 // र-ह ऽ ॽ ा-ॄ ॢ ॣ ॅ-्]; 1015 // ::NFD; 1016 // ::Devanagari-InterIndic; 1017 // ::InterIndic-Latin; 1018 // ::NFC; 1019 ParsedTransformID directionInfo = new ParsedTransformID(); 1020 String ruleString = getIcuRulesFromXmlFile(dir, file, directionInfo); 1021 Set<String> others = new LinkedHashSet<>(); 1022 Set<String> order = 1023 ruleString 1024 .lines() 1025 .map(x -> x.trim()) 1026 .filter(x -> x.contains("::") && !x.trim().startsWith("#")) 1027 .map(x -> parseDoubleColon(x, others)) 1028 .collect(Collectors.toCollection(LinkedHashSet::new)); 1029 order.addAll(others); 1030 if (SHOW) { 1031 System.out.println(file + "=>" + order); 1032 } 1033 if (!order.isEmpty()) { 1034 fileToDependencies.putAll(file, order); 1035 } 1036 if (directionInfo.direction != Direction.backward) { // that is, forward or both 1037 fileToAliases.put(file, directionInfo.getId()); 1038 fileToAliases.putAll(file, 
Arrays.asList(directionInfo.getAliases())); 1039 if (SHOW) { 1040 System.out.println( 1041 "\t" 1042 + directionInfo.getId() 1043 + "\t" 1044 + Arrays.asList(directionInfo.getAliases())); 1045 } 1046 } 1047 if (directionInfo.direction != Direction.forward) { // that is, backward or both 1048 fileToAliases.put(file, directionInfo.getBackwardId()); 1049 fileToAliases.putAll(file, Arrays.asList(directionInfo.getBackwardAliases())); 1050 if (SHOW) { 1051 System.out.println( 1052 "\t" 1053 + directionInfo.getBackwardId() 1054 + "\t" 1055 + Arrays.asList(directionInfo.getBackwardAliases())); 1056 } 1057 } 1058 } 1059 TreeMultimap<String, String> aliasesToFile = 1060 Multimaps.invertFrom(fileToAliases, TreeMultimap.create()); 1061 Multimap<String, String> fileToDependentFiles = TreeMultimap.create(); 1062 1063 for (Entry<String, Collection<String>> entry : fileToDependencies.asMap().entrySet()) { 1064 Set<String> v = 1065 entry.getValue().stream() 1066 .filter(x -> aliasesToFile.containsKey(x)) 1067 .map(y -> aliasesToFile.get(y).first()) 1068 .collect(Collectors.toSet()); 1069 fileToDependentFiles.putAll(entry.getKey(), v); 1070 } 1071 Builder<String> comp = new DiscreteComparator.Builder<>(null); 1072 fileToDependentFiles.forEach( 1073 (x, y) -> { 1074 if (SHOW) { 1075 System.out.println(x + "=" + y); 1076 } 1077 comp.add(y, x); // put dependent earlier 1078 }); 1079 // .add("c", "d", "b", "a").add("m", "n", "d").get(); 1080 1081 DiscreteComparator<String> comp2 = comp.get(); 1082 Set<String> orderedDependents = new LinkedHashSet<>(comp2.getOrdering()); 1083 orderedDependents.retainAll( 1084 fileToDependentFiles.values()); // remove files that are not dependents 1085 Set<String> remainingFiles = new TreeSet<>(files); 1086 remainingFiles.removeAll(orderedDependents); 1087 orderedDependents.addAll(remainingFiles); 1088 if (SHOW_FAILED_MATCHES) { 1089 System.out.println(orderedDependents); 1090 } 1091 return ImmutableSet.copyOf(orderedDependents); 1092 } 1093 // fails 
// match: :: [:Latin:] fullwidth-halfwidth ();

    /**
     * Matches a whole {@code ::} rule line: optional whitespace, {@code ::}, an optional
     * bracketed filter, an optional ID (group 1), an optional parenthesized part that may hold
     * its own bracketed filter and ID (group 2), a semicolon, and an optional trailing {@code #}
     * comment.
     */
    static final Pattern TRANSLIT_FINDER =
            Pattern.compile(
                    "\\s*::\\s*"
                            + "(?:\\[[^\\]]+\\]\\s*)?" // optional [filter]
                            + "([A-Za-z0-9////_//-]*)?" // group 1: ID before the parentheses
                            + "(?:"
                            + "\\s*\\("
                            + "(?:\\[[^\\]]+\\]\\s*)?" // optional [filter] inside parentheses
                            + "([A-Za-z0-9////_//-]*)?" // group 2: ID inside the parentheses
                            + "\\s*\\)"
                            + ")?"
                            + "\\s*;\\s*(#.*)?");

    // static {
    // Matcher matcher = TRANSLIT_FINDER.matcher("::[:Latin:] fullwidth-halfwidth();");
    // System.out.println(matcher.matches());
    // }

    /**
     * Extracts the transliterator IDs from one {@code ::} rule line.
     *
     * @param x the rule line; it must match {@link #TRANSLIT_FINDER} in full to yield anything
     * @param others receives the ID found inside the parentheses, when it is present and not
     *     blank
     * @return the ID found before the parentheses, or the empty string when the line does not
     *     match or that ID is missing or blank
     */
    static String parseDoubleColon(String x, Set<String> others) {
        final Matcher m = TRANSLIT_FINDER.matcher(x);
        if (!m.matches()) {
            if (SHOW_FAILED_MATCHES) {
                System.out.println("fails match: " + x);
            }
            return "";
        }
        final String outerId = m.group(1);
        final String parenthesizedId = m.group(2);
        if (SHOW) {
            System.out.println("1: " + outerId + "\t2:" + parenthesizedId);
        }
        if (parenthesizedId != null && !parenthesizedId.isBlank()) {
            others.add(parenthesizedId);
        }
        if (outerId == null || outerId.isBlank()) {
            return "";
        }
        return outerId;
    }

    /** Data holder for the transforms index; see {@link #getJsonIndex()}. */
    public class CLDRTransformsJsonIndex {
        /** raw list of available IDs: the available names with ".xml" removed, sorted */
        public String[] available =
                getAvailableIds().stream()
                        .map(id -> id.replace(".xml", ""))
                        .sorted()
                        .toArray(String[]::new);
    }

    /** This gets the metadata (index file) exposed as cldr-json/cldr-transforms/transforms.json */
    public CLDRTransformsJsonIndex getJsonIndex() {
        return new CLDRTransformsJsonIndex();
    }
}