1 package org.unicode.cldr.util; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.collect.Iterables; 5 import com.google.common.collect.Multimap; 6 import com.google.common.collect.Multiset; 7 import com.google.common.collect.Sets; 8 import com.google.common.collect.TreeMultimap; 9 import com.google.common.collect.TreeMultiset; 10 import java.util.ArrayList; 11 import java.util.Arrays; 12 import java.util.Collection; 13 import java.util.Collections; 14 import java.util.HashSet; 15 import java.util.List; 16 import java.util.NavigableSet; 17 import java.util.Objects; 18 import java.util.Set; 19 import java.util.TreeSet; 20 import java.util.regex.Matcher; 21 import java.util.regex.Pattern; 22 23 /** 24 * Helper class that allows logging the use of regular expressions. A class that will summarize them 25 * will get a NavigabSet of PatternCountInterface instances. 26 * 27 * @author ribnitz 28 */ 29 public class RegexLogger { 30 /** Should debugging be done? - if not, a null implementation will be used */ 31 private static final boolean DEBUG = false; 32 /** Instance */ 33 private static RegexLoggerInterface instance = null; 34 getInstance()35 public static RegexLoggerInterface getInstance() { 36 if (instance == null) { 37 if (DEBUG) { 38 instance = new RegexLoggerImpl(); 39 } else { 40 instance = new NullRegexLogger(); 41 } 42 } 43 return instance; 44 } 45 46 private static class PatternStringWithBoolean implements Comparable<PatternStringWithBoolean> { 47 private final String pattern; 48 private final boolean calledFromRegexFinder; 49 private final int hashCode; 50 PatternStringWithBoolean(String patternStr, boolean calledFromRegexFinder)51 public PatternStringWithBoolean(String patternStr, boolean calledFromRegexFinder) { 52 this.pattern = patternStr.trim(); 53 this.calledFromRegexFinder = calledFromRegexFinder; 54 hashCode = Objects.hash(this.pattern, this.calledFromRegexFinder); 55 } 56 57 @Override hashCode()58 public int hashCode() { 59 return hashCode; 60 } 61 getPattern()62 public String getPattern() { 63 return pattern; 64 } 65 isCalledFromRegexFinder()66 public boolean isCalledFromRegexFinder() { 67 return calledFromRegexFinder; 68 } 69 70 @Override equals(Object obj)71 public boolean equals(Object obj) { 72 if (this == obj) { 73 return true; 74 } 75 if (obj == null) { 76 return false; 77 } 78 if (getClass() != obj.getClass()) { 79 return false; 80 } 81 PatternStringWithBoolean other = (PatternStringWithBoolean) obj; 82 if (calledFromRegexFinder != other.calledFromRegexFinder) { 83 return false; 84 } 85 if (hashCode != other.hashCode) { 86 return false; 87 } 88 if (other.pattern != null) { 89 return false; 90 } 91 if (!pattern.equals(other.pattern)) { 92 return false; 93 } 94 return true; 95 } 96 97 @Override compareTo(PatternStringWithBoolean o)98 public int compareTo(PatternStringWithBoolean o) { 99 if (o == null) { 100 return 1; 101 } 102 if (this == o) { 103 return 0; 104 } 105 return pattern.compareTo(o.pattern); 106 } 107 } 108 109 /** 110 * Interface used for logging Regular expressions 111 * 112 * @author ribnitz 113 */ 114 public static interface RegexLoggerInterface { 115 /** 116 * Log that the given pattern was applied on the given matchStr, whether it matched, and 117 * what the type of the log was. Cls conains the calling class. 118 * 119 * @param pattern 120 * @param matchStr 121 * @param matched 122 * @param type 123 * @param cls 124 */ log(String pattern, String matchStr, boolean matched, LogType type, Class<?> cls)125 void log(String pattern, String matchStr, boolean matched, LogType type, Class<?> cls); 126 log(Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls)127 void log(Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls); 128 log(Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls)129 void log(Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls); 130 log( String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls)131 void log( 132 String pattern, 133 String matchStr, 134 boolean matched, 135 double time, 136 LogType type, 137 Class<?> cls); 138 139 /** 140 * Get all the entries that matched 141 * 142 * @return 143 */ getEntries()144 NavigableSet<PatternCountInterface> getEntries(); 145 146 /** 147 * Get the entries that occurred at least minCount times. If there are no matches, an empty 148 * set is returned 149 * 150 * @param minCount 151 * @return 152 */ getEntries(final int minCount)153 NavigableSet<PatternCountInterface> getEntries(final int minCount); 154 isEnabled()155 boolean isEnabled(); 156 } 157 158 /** 159 * Three of the methods can be delegations, which reduces the actual implementation to two 160 * methods 161 * 162 * @author ribnitz 163 */ 164 private abstract static class AbstractRegexLogger implements RegexLoggerInterface { 165 166 @Override log( Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls)167 public void log( 168 Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls) { 169 log(matcher.pattern(), matchStr, matched, type, cls); 170 } 171 172 @Override log( Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls)173 public void log( 174 Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls) { 175 log(pattern.pattern(), matchStr, matched, type, cls); 176 } 177 178 @Override log( String pattern, String matchStr, boolean matched, LogType type, Class<?> cls)179 public void log( 180 String pattern, String matchStr, boolean matched, LogType type, Class<?> cls) { 181 log(pattern, matchStr, matched, 0, type, cls); 182 } 183 184 /** Get all entries */ 185 @Override getEntries()186 public NavigableSet<PatternCountInterface> getEntries() { 187 return getEntries(1); 188 } 189 190 @Override isEnabled()191 public boolean isEnabled() { 192 return DEBUG; 193 } 194 } 195 196 /** 197 * Null implementation 198 * 199 * @author ribnitz 200 */ 201 private static class NullRegexLogger extends AbstractRegexLogger { 202 203 @Override log( String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls)204 public void log( 205 String pattern, 206 String matchStr, 207 boolean matched, 208 double time, 209 LogType type, 210 Class<?> cls) { 211 // do nothing 212 } 213 214 @Override getEntries(int minCount)215 public NavigableSet<PatternCountInterface> getEntries(int minCount) { 216 NavigableSet<PatternCountInterface> returned = Sets.newTreeSet(Collections.EMPTY_SET); 217 return returned; 218 } 219 } 220 221 /** 222 * Inetface used for the entries returnred by the RegexLogger 223 * 224 * @author ribnitz 225 */ 226 public static interface PatternCountInterface { 227 /** 228 * Get the pattern used 229 * 230 * @return 231 */ getPattern()232 String getPattern(); 233 234 /** 235 * Get the number of successful matches obtained through FIND 236 * 237 * @return 238 */ getNumberOfFindMatches()239 int getNumberOfFindMatches(); 240 241 /** 242 * Get the number of unsuccessful matches obtained through FIND 243 * 244 * @return 245 */ getNumberOfFindFailures()246 int getNumberOfFindFailures(); 247 248 /** 249 * Get the number of successful matches obtained through MATCH 250 * 251 * @return 252 */ getNumberOfMatchMatches()253 int getNumberOfMatchMatches(); 254 255 /** 256 * Get the number of unsuccessful matches obtained through FIND 257 * 258 * @return 259 */ getNumberOfMatchFailures()260 int getNumberOfMatchFailures(); 261 262 /** 263 * Return true if this call was made from RegexFinder 264 * 265 * @return 266 */ isCalledFromRegexFinder()267 boolean isCalledFromRegexFinder(); 268 269 /** 270 * Get a set of all call locations 271 * 272 * @return 273 */ getCallLocations()274 Set<String> getCallLocations(); 275 } 276 277 /** 278 * GetAll uses this class to add all the entries of a multiSet to the result set, constructing 279 * the object to return for each pattern. Objects will only be added once. 280 * 281 * <p>This is the implementatioon that adds all items. 282 * 283 * @author ribnitz 284 */ 285 private static class AddAllEntryProcessor { 286 protected final int minCount; 287 protected final CountSets c; 288 protected final Set<PatternStringWithBoolean> seen = new HashSet<>(); 289 protected final NavigableSet<PatternCountInterface> result = new TreeSet<>(); 290 AddAllEntryProcessor(int minCount, CountSets c)291 public AddAllEntryProcessor(int minCount, CountSets c) { 292 this.minCount = minCount; 293 this.c = c; 294 } 295 getResult()296 public NavigableSet<PatternCountInterface> getResult() { 297 return result; 298 } 299 process( PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet)300 public void process( 301 PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet) { 302 if (!seen.contains(item)) { 303 result.add(new RegexKeyWithCount(item, c)); 304 seen.add(item); 305 } 306 } 307 } 308 309 /** 310 * Sometimes getEntries is called with a minCount; this Class filters and only adds the items 311 * that occur at least minCount times. 312 * 313 * @author ribnitz 314 */ 315 private static class EntryProcessor extends AddAllEntryProcessor { EntryProcessor(int minCount, CountSets c)316 public EntryProcessor(int minCount, CountSets c) { 317 super(minCount, c); 318 } 319 320 @Override process( PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet)321 public void process( 322 PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet) { 323 if (countSet.count(item) >= minCount) { 324 super.process(item, countSet); 325 } 326 } 327 } 328 329 /** 330 * Since all the inner classes are static, this object is used to pass around the refernces to 331 * the different sets/the state 332 * 333 * @author ribnitz 334 */ 335 private static class CountSets { 336 final Multiset<PatternStringWithBoolean> matchedFindSet; 337 final Multiset<PatternStringWithBoolean> failedFindSet; 338 final Multiset<PatternStringWithBoolean> matchedMatchSet; 339 final Multiset<PatternStringWithBoolean> failedMatchSet; 340 final Multimap<PatternStringWithBoolean, String> stacktraces; 341 CountSets( Multiset<PatternStringWithBoolean> matchedFindSet, Multiset<PatternStringWithBoolean> failedFindSet, Multiset<PatternStringWithBoolean> matchedMatchSet, Multiset<PatternStringWithBoolean> failedMatchSet, Multimap<PatternStringWithBoolean, String> occurrences)342 public CountSets( 343 Multiset<PatternStringWithBoolean> matchedFindSet, 344 Multiset<PatternStringWithBoolean> failedFindSet, 345 Multiset<PatternStringWithBoolean> matchedMatchSet, 346 Multiset<PatternStringWithBoolean> failedMatchSet, 347 Multimap<PatternStringWithBoolean, String> occurrences) { 348 this.failedFindSet = failedFindSet; 349 this.failedMatchSet = failedMatchSet; 350 this.matchedMatchSet = matchedMatchSet; 351 this.stacktraces = occurrences; 352 this.matchedFindSet = matchedFindSet; 353 } 354 } 355 356 private static class RegexKeyWithCount 357 implements PatternCountInterface, Comparable<PatternCountInterface> { 358 private final String pattern; 359 private final int findMatchCount; 360 private final int findFailCount; 361 private final int matchMatchCount; 362 private final int matchFailCount; 363 private final boolean calledFromRegexFinder; 364 private final Set<String> callLocations = new HashSet<>(); 365 private final int hashCode; 366 RegexKeyWithCount(PatternStringWithBoolean key, CountSets bean)367 public RegexKeyWithCount(PatternStringWithBoolean key, CountSets bean) { 368 this.pattern = key.getPattern(); 369 this.calledFromRegexFinder = key.isCalledFromRegexFinder(); 370 this.findMatchCount = bean.matchedFindSet.count(key); 371 this.findFailCount = bean.failedFindSet.count(key); 372 this.matchMatchCount = bean.matchedMatchSet.count(key); 373 this.matchFailCount = bean.failedMatchSet.count(key); 374 Collection<String> tmp = bean.stacktraces.get(key); 375 for (String cur : tmp) { 376 if (!callLocations.contains(cur)) { 377 callLocations.add(cur); 378 } 379 } 380 this.hashCode = 381 Objects.hash( 382 this.pattern, 383 this.findMatchCount, 384 this.findFailCount, 385 this.matchFailCount, 386 this.matchMatchCount, 387 this.calledFromRegexFinder, 388 this.callLocations); 389 } 390 391 @Override getPattern()392 public String getPattern() { 393 return pattern; 394 } 395 396 @Override hashCode()397 public int hashCode() { 398 return hashCode; 399 } 400 401 @Override getNumberOfFindMatches()402 public int getNumberOfFindMatches() { 403 return findMatchCount; 404 } 405 406 @Override getNumberOfFindFailures()407 public int getNumberOfFindFailures() { 408 return findFailCount; 409 } 410 411 @Override getNumberOfMatchMatches()412 public int getNumberOfMatchMatches() { 413 return matchMatchCount; 414 } 415 416 @Override getNumberOfMatchFailures()417 public int getNumberOfMatchFailures() { 418 return matchFailCount; 419 } 420 421 @Override equals(Object obj)422 public boolean equals(Object obj) { 423 if (this == obj) { 424 return true; 425 } 426 if (obj == null) { 427 return false; 428 } 429 if (hashCode != obj.hashCode()) { 430 return false; 431 } 432 if (getClass() != obj.getClass()) { 433 return false; 434 } 435 RegexKeyWithCount other = (RegexKeyWithCount) obj; 436 if (matchFailCount != other.matchFailCount) { 437 return false; 438 } 439 if (matchMatchCount != other.matchMatchCount) { 440 return false; 441 } 442 if (findFailCount != other.findFailCount) { 443 return false; 444 } 445 if (findMatchCount != other.findMatchCount) { 446 return false; 447 } 448 if (!pattern.equals(other.pattern)) { 449 return false; 450 } 451 if (calledFromRegexFinder != other.calledFromRegexFinder) { 452 return false; 453 } 454 if (callLocations != other.callLocations) { 455 return false; 456 } 457 return true; 458 } 459 460 @Override compareTo(PatternCountInterface o)461 public int compareTo(PatternCountInterface o) { 462 if (o == null) { 463 return 1; 464 } 465 return Integer.compare( 466 matchFailCount + matchMatchCount + findFailCount + findMatchCount, 467 o.getNumberOfFindFailures() 468 + o.getNumberOfFindMatches() 469 + o.getNumberOfMatchFailures() 470 + o.getNumberOfMatchMatches()); 471 } 472 473 @Override isCalledFromRegexFinder()474 public boolean isCalledFromRegexFinder() { 475 return calledFromRegexFinder; 476 } 477 478 @Override getCallLocations()479 public Set<String> getCallLocations() { 480 return callLocations; 481 } 482 } 483 484 public enum LogType { 485 FIND, 486 MATCH 487 } 488 489 private static interface IterableTransformer<E, F> { transform(Iterable<E> input)490 Iterable<F> transform(Iterable<E> input); 491 } 492 493 private static class StringIterableTransformer implements IterableTransformer<String, String> { 494 495 @Override transform(Iterable<String> input)496 public Iterable<String> transform(Iterable<String> input) { 497 List<String> returned = new ArrayList<>(Iterables.size(input)); 498 String lastClass = null; 499 for (String current : input) { 500 String transformed = current; 501 if (lastClass != null) { 502 if (lastClass.startsWith("RegexLookup") 503 && !current.startsWith("org.unicode.cldr.util.RegexLookup")) { 504 returned.add(lastClass); 505 } 506 break; 507 } 508 if (current.startsWith("org.unicode.cldr.test.CheckCLDR") 509 && 510 /* 511 * TODO: fix this function to avoid referencing lastClass when it is null. 512 * The condition lastClass == null here prevents compiler warning/error or possible NullPointerException, 513 * since lastClass is ALWAYS null here; but this is obviously not the best solution. 514 */ 515 (lastClass == null 516 || !lastClass.startsWith("org.unicode.cldr.test.CheckCLDR"))) { 517 lastClass = current; 518 // leave out 519 continue; 520 } 521 // remove org.unicode.cldr 522 if (current.startsWith("org.unicode.cldr.util.")) { 523 transformed = current.substring("org.unicode.cldr.util.".length()); 524 } 525 // only the last RegexLookup will be added 526 if (!transformed.startsWith("RegexLookup")) { 527 returned.add(transformed); 528 } 529 lastClass = transformed; 530 } 531 return returned; 532 } 533 } 534 535 private static class ClassnameOnlyStringTransformer 536 implements IterableTransformer<String, String> { 537 538 @Override transform(Iterable<String> input)539 public Iterable<String> transform(Iterable<String> input) { 540 List<String> returned = new ArrayList<>(Iterables.size(input)); 541 String lastClass = null; 542 for (String current : input) { 543 if (current.lastIndexOf(".") > 0) { 544 current = current.substring(current.lastIndexOf(".")); 545 } 546 if (lastClass != null) { 547 if (lastClass.startsWith("RegexLookup") && !current.startsWith("RegexLookup")) { 548 returned.add(lastClass); 549 } 550 if (lastClass.startsWith("VettingViewer")) { 551 break; 552 } 553 if (current.startsWith("CheckCLDR") && !lastClass.startsWith("CheckCLDR")) { 554 lastClass = current; 555 // leave out 556 continue; 557 } 558 } 559 // only the last RegexLookup will be added 560 if (!current.startsWith("RegexLookup")) { 561 returned.add(current); 562 } 563 lastClass = current; 564 } 565 return returned; 566 } 567 } 568 569 /** 570 * This is the class doing the bulk of the work. 571 * 572 * @author ribnitz 573 */ 574 private static class RegexLoggerImpl extends AbstractRegexLogger { 575 576 /* 577 * Each has more than 1m hits, together they account for about 14m (of the 26m total) 578 */ 579 private static final Set<String> exactMatchSet = 580 new HashSet<>( 581 Arrays.asList( 582 new String[] { 583 "^//ldml.*", 584 "^//ldml/dates.*", 585 "^//ldml/units.*", 586 "^//ldml/characters/ellipsis[@type=\"(final|initial|medial)\"]", 587 "^//ldml/characters.*", 588 "^//ldml/listPatterns/listPattern.*", 589 "^//ldml/units/unitLength[@type=\"(long|short|narrow)\"].*", 590 })); 591 private static final Set<String> patternSet = 592 new HashSet<>( 593 Arrays.asList( 594 new String[] { 595 "^//ldml/dates/fields", 596 "^//ldml/dates/calendars/calendar", 597 "/(availableFormats", 598 })); 599 private final Multiset<PatternStringWithBoolean> matchedFindSet = TreeMultiset.create(); 600 private final Multiset<PatternStringWithBoolean> failedFindSet = TreeMultiset.create(); 601 private final Multiset<PatternStringWithBoolean> matchedMatchSet = TreeMultiset.create(); 602 private final Multiset<PatternStringWithBoolean> failedMatchSet = TreeMultiset.create(); 603 604 private final Multimap<PatternStringWithBoolean, String> occurrences = 605 TreeMultimap.create(); 606 private final IterableTransformer<String, String> transformer = 607 new StringIterableTransformer(); 608 609 @Override log( String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls)610 public void log( 611 String pattern, 612 String matchStr, 613 boolean matched, 614 double time, 615 LogType type, 616 Class<?> cls) { 617 boolean isRegexFinder = findClassName("org.unicode.cldr.util.RegexLookup", 10); 618 PatternStringWithBoolean key = new PatternStringWithBoolean(pattern, isRegexFinder); 619 Collection<PatternStringWithBoolean> collectionToAdd = 620 determineCollectionToUse(matched, type); 621 if (collectionToAdd != null) { 622 collectionToAdd.add(key); 623 } 624 if (shouldLogPattern(pattern, isRegexFinder)) { 625 addElementToList(key); 626 } 627 } 628 determineCollectionToUse( boolean matched, LogType type)629 private Collection<PatternStringWithBoolean> determineCollectionToUse( 630 boolean matched, LogType type) { 631 Collection<PatternStringWithBoolean> collectionToAdd = null; 632 switch (type) { 633 case FIND: 634 if (matched) { 635 collectionToAdd = matchedFindSet; 636 } else { 637 collectionToAdd = failedFindSet; 638 } 639 break; 640 case MATCH: 641 if (matched) { 642 collectionToAdd = matchedMatchSet; 643 } else { 644 collectionToAdd = failedMatchSet; 645 } 646 break; 647 } 648 return collectionToAdd; 649 } 650 shouldLogPattern(String pattern, boolean isRegexFinder)651 private boolean shouldLogPattern(String pattern, boolean isRegexFinder) { 652 if (!isRegexFinder) { 653 return true; 654 } else { 655 if (exactMatchSet.contains(pattern)) { 656 return true; 657 } else { 658 for (String cur : patternSet) { 659 if (pattern.startsWith(cur)) { 660 return true; 661 } 662 } 663 } 664 } 665 return false; 666 } 667 findClassName(String className, int depth)668 private boolean findClassName(String className, int depth) { 669 StackTraceElement[] st = Thread.currentThread().getStackTrace(); 670 int startPos = (st.length > 2) ? 2 : 0; 671 int endPos = (startPos + depth > st.length) ? st.length : startPos + depth; 672 for (int i = startPos; i < endPos; i++) { 673 StackTraceElement cur = st[i]; 674 String curClass = cur.getClassName(); 675 if (curClass.startsWith(className)) { 676 return true; 677 } 678 } 679 return false; 680 } 681 682 private static final Joiner JOINER = Joiner.on(";"); 683 addElementToList(PatternStringWithBoolean key)684 private void addElementToList(PatternStringWithBoolean key) { 685 List<String> stList = processStackTrace("org.unicode.cldr.util.RegexLookup", 0); 686 687 if (!stList.isEmpty()) { 688 occurrences.put(key, JOINER.join(transformer.transform(stList))); 689 } 690 } 691 processStackTrace(String classNameToStartAt, int depth)692 private List<String> processStackTrace(String classNameToStartAt, int depth) { 693 StackTraceElement[] st = Thread.currentThread().getStackTrace(); 694 if (depth == 0) { 695 depth = st.length; 696 } 697 int startPos; 698 if (depth < 0) { 699 startPos = depth + st.length; 700 depth = Math.abs(depth); 701 } else { 702 startPos = (st.length > 2) ? 2 : 0; 703 } 704 int pos; 705 boolean found = false; 706 for (pos = startPos; pos < st.length; pos++) { 707 if (st[pos].getClassName().startsWith(classNameToStartAt)) { 708 found = true; 709 break; 710 } 711 } 712 if (!found) { 713 return Collections.emptyList(); 714 } 715 int endPos = (pos + depth > st.length) ? st.length : startPos + depth; 716 List<String> ret = new ArrayList<>(depth + 2); 717 for (int i = pos; i < endPos; i++) { 718 StackTraceElement cur = st[i]; 719 String curClass = cur.getClassName(); 720 ret.add(curClass + ":" + cur.getLineNumber()); 721 } 722 return ret; 723 } 724 725 @Override getEntries(final int minCount)726 public NavigableSet<PatternCountInterface> getEntries(final int minCount) { 727 CountSets c = 728 new CountSets( 729 matchedFindSet, 730 failedFindSet, 731 matchedMatchSet, 732 failedMatchSet, 733 occurrences); 734 final AddAllEntryProcessor processor = 735 (minCount == 1) 736 ? new AddAllEntryProcessor(minCount, c) 737 : new EntryProcessor(minCount, c); 738 for (PatternStringWithBoolean item : matchedFindSet) { 739 processor.process(item, matchedFindSet); 740 } 741 for (PatternStringWithBoolean item : failedFindSet) { 742 processor.process(item, failedFindSet); 743 } 744 for (PatternStringWithBoolean item : matchedMatchSet) { 745 processor.process(item, matchedMatchSet); 746 } 747 for (PatternStringWithBoolean item : failedMatchSet) { 748 processor.process(item, failedMatchSet); 749 } 750 return Sets.unmodifiableNavigableSet(processor.getResult()); 751 } 752 } 753 } 754