1 package org.unicode.cldr.util; 2 3 import java.util.ArrayList; 4 import java.util.Arrays; 5 import java.util.Collection; 6 import java.util.Collections; 7 import java.util.HashSet; 8 import java.util.List; 9 import java.util.NavigableSet; 10 import java.util.Objects; 11 import java.util.Set; 12 import java.util.TreeSet; 13 import java.util.regex.Matcher; 14 import java.util.regex.Pattern; 15 16 import com.google.common.base.Joiner; 17 import com.google.common.collect.Iterables; 18 import com.google.common.collect.Multimap; 19 import com.google.common.collect.Multiset; 20 import com.google.common.collect.Sets; 21 import com.google.common.collect.TreeMultimap; 22 import com.google.common.collect.TreeMultiset; 23 24 /** 25 * Helper class that allows logging the use of regular expressions. A class that will summarize them will get a 26 * NavigabSet of PatternCountInterface instances. 27 * 28 * @author ribnitz 29 * 30 */ 31 public class RegexLogger { 32 /** 33 * Should debugging be done? - if not, a null implementation will be used 34 */ 35 private static final boolean DEBUG = false; 36 /** 37 * Instance 38 */ 39 private static RegexLoggerInterface instance = null; 40 getInstance()41 public static RegexLoggerInterface getInstance() { 42 if (instance == null) { 43 if (DEBUG) { 44 instance = new RegexLoggerImpl(); 45 } else { 46 instance = new NullRegexLogger(); 47 } 48 } 49 return instance; 50 } 51 52 private static class PatternStringWithBoolean implements Comparable<PatternStringWithBoolean> { 53 private final String pattern; 54 private final boolean calledFromRegexFinder; 55 private final int hashCode; 56 PatternStringWithBoolean(String patternStr, boolean calledFromRegexFinder)57 public PatternStringWithBoolean(String patternStr, boolean calledFromRegexFinder) { 58 this.pattern = patternStr.trim(); 59 this.calledFromRegexFinder = calledFromRegexFinder; 60 hashCode = Objects.hash(this.pattern, this.calledFromRegexFinder); 61 } 62 63 @Override hashCode()64 public int hashCode() { 65 return hashCode; 66 } 67 getPattern()68 public String getPattern() { 69 return pattern; 70 } 71 isCalledFromRegexFinder()72 public boolean isCalledFromRegexFinder() { 73 return calledFromRegexFinder; 74 } 75 76 @Override equals(Object obj)77 public boolean equals(Object obj) { 78 if (this == obj) { 79 return true; 80 } 81 if (obj == null) { 82 return false; 83 } 84 if (getClass() != obj.getClass()) { 85 return false; 86 } 87 PatternStringWithBoolean other = (PatternStringWithBoolean) obj; 88 if (calledFromRegexFinder != other.calledFromRegexFinder) { 89 return false; 90 } 91 if (hashCode != other.hashCode) { 92 return false; 93 } 94 if (other.pattern != null) { 95 return false; 96 } 97 if (!pattern.equals(other.pattern)) { 98 return false; 99 } 100 return true; 101 } 102 103 @Override compareTo(PatternStringWithBoolean o)104 public int compareTo(PatternStringWithBoolean o) { 105 if (o == null) { 106 return 1; 107 } 108 if (this == o) { 109 return 0; 110 } 111 return pattern.compareTo(o.pattern); 112 } 113 } 114 115 /** 116 * Interface used for logging Regular expressions 117 * @author ribnitz 118 * 119 */ 120 public static interface RegexLoggerInterface { 121 /** 122 * Log that the given pattern was applied on the given matchStr, whether it matched, and 123 * what the type of the log was. Cls conains the calling class. 124 * @param pattern 125 * @param matchStr 126 * @param matched 127 * @param type 128 * @param cls 129 */ log(String pattern, String matchStr, boolean matched, LogType type, Class<?> cls)130 void log(String pattern, String matchStr, boolean matched, LogType type, Class<?> cls); 131 log(Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls)132 void log(Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls); 133 log(Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls)134 void log(Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls); 135 log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls)136 void log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls); 137 138 /** 139 * Get all the entries that matched 140 * @return 141 */ getEntries()142 NavigableSet<PatternCountInterface> getEntries(); 143 144 /** 145 * Get the entries that occurred at least minCount times. If there are no matches, an empty set is returned 146 * @param minCount 147 * @return 148 */ getEntries(final int minCount)149 NavigableSet<PatternCountInterface> getEntries(final int minCount); 150 isEnabled()151 boolean isEnabled(); 152 } 153 154 /** 155 * Three of the methods can be delegations, which reduces the actual implementation to two methods 156 * @author ribnitz 157 * 158 */ 159 private static abstract class AbstractRegexLogger implements RegexLoggerInterface { 160 161 @Override log(Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls)162 public void log(Matcher matcher, String matchStr, boolean matched, LogType type, Class<?> cls) { 163 log(matcher.pattern(), matchStr, matched, type, cls); 164 165 } 166 log(Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls)167 public void log(Pattern pattern, String matchStr, boolean matched, LogType type, Class<?> cls) { 168 log(pattern.pattern(), matchStr, matched, type, cls); 169 } 170 log(String pattern, String matchStr, boolean matched, LogType type, Class<?> cls)171 public void log(String pattern, String matchStr, boolean matched, LogType type, Class<?> cls) { 172 log(pattern, matchStr, matched, 0, type, cls); 173 } 174 175 /** 176 * Get all entries 177 */ getEntries()178 public NavigableSet<PatternCountInterface> getEntries() { 179 return getEntries(1); 180 } 181 182 @Override isEnabled()183 public boolean isEnabled() { 184 return DEBUG; 185 } 186 187 } 188 189 /** 190 * Null implementation 191 * @author ribnitz 192 * 193 */ 194 private static class NullRegexLogger extends AbstractRegexLogger { 195 196 @Override log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls)197 public void log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls) { 198 // do nothing 199 } 200 201 @Override getEntries(int minCount)202 public NavigableSet<PatternCountInterface> getEntries(int minCount) { 203 NavigableSet<PatternCountInterface> returned = (NavigableSet<PatternCountInterface>) Sets.newTreeSet(Collections.EMPTY_SET); 204 return returned; 205 } 206 } 207 208 /** 209 * Inetface used for the entries returnred by the RegexLogger 210 * @author ribnitz 211 * 212 */ 213 public static interface PatternCountInterface { 214 /** 215 * Get the pattern used 216 * @return 217 */ getPattern()218 String getPattern(); 219 220 /** 221 * Get the number of successful matches obtained through FIND 222 * @return 223 */ getNumberOfFindMatches()224 int getNumberOfFindMatches(); 225 226 /** 227 * Get the number of unsuccessful matches obtained through FIND 228 * @return 229 */ getNumberOfFindFailures()230 int getNumberOfFindFailures(); 231 232 /** 233 * Get the number of successful matches obtained through MATCH 234 * @return 235 */ getNumberOfMatchMatches()236 int getNumberOfMatchMatches(); 237 238 /** 239 * Get the number of unsuccessful matches obtained through FIND 240 * @return 241 */ getNumberOfMatchFailures()242 int getNumberOfMatchFailures(); 243 244 /** 245 * Return true if this call was made from RegexFinder 246 * @return 247 */ isCalledFromRegexFinder()248 boolean isCalledFromRegexFinder(); 249 250 /** 251 * Get a set of all call locations 252 * @return 253 */ getCallLocations()254 Set<String> getCallLocations(); 255 256 } 257 258 /** 259 * GetAll uses this class to add all the entries of a multiSet to the result set, constructing 260 * the object to return for each pattern. Objects will only be added once. 261 * 262 * This is the implementatioon that adds all items. 263 * @author ribnitz 264 * 265 */ 266 private static class AddAllEntryProcessor { 267 protected final int minCount; 268 protected final CountSets c; 269 protected final Set<PatternStringWithBoolean> seen = new HashSet<>(); 270 protected final NavigableSet<PatternCountInterface> result = new TreeSet<>(); 271 AddAllEntryProcessor(int minCount, CountSets c)272 public AddAllEntryProcessor(int minCount, CountSets c) { 273 this.minCount = minCount; 274 this.c = c; 275 } 276 getResult()277 public NavigableSet<PatternCountInterface> getResult() { 278 return result; 279 } 280 process(PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet)281 public void process(PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet) { 282 if (!seen.contains(item)) { 283 result.add(new RegexKeyWithCount(item, c)); 284 seen.add(item); 285 } 286 } 287 } 288 289 /** 290 * Sometimes getEntries is called with a minCount; this Class filters and only adds the 291 * items that occur at least minCount times. 292 * @author ribnitz 293 * 294 */ 295 private static class EntryProcessor extends AddAllEntryProcessor { EntryProcessor(int minCount, CountSets c)296 public EntryProcessor(int minCount, CountSets c) { 297 super(minCount, c); 298 } 299 process(PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet)300 public void process(PatternStringWithBoolean item, Multiset<PatternStringWithBoolean> countSet) { 301 if (countSet.count(item) >= minCount) { 302 super.process(item, countSet); 303 } 304 } 305 } 306 307 /** 308 * Since all the inner classes are static, this object is used to pass around the refernces to the 309 * different sets/the state 310 * 311 * @author ribnitz 312 * 313 */ 314 private static class CountSets { 315 final Multiset<PatternStringWithBoolean> matchedFindSet; 316 final Multiset<PatternStringWithBoolean> failedFindSet; 317 final Multiset<PatternStringWithBoolean> matchedMatchSet; 318 final Multiset<PatternStringWithBoolean> failedMatchSet; 319 final Multimap<PatternStringWithBoolean, String> stacktraces; 320 CountSets(Multiset<PatternStringWithBoolean> matchedFindSet, Multiset<PatternStringWithBoolean> failedFindSet, Multiset<PatternStringWithBoolean> matchedMatchSet, Multiset<PatternStringWithBoolean> failedMatchSet, Multimap<PatternStringWithBoolean, String> occurrences)321 public CountSets(Multiset<PatternStringWithBoolean> matchedFindSet, Multiset<PatternStringWithBoolean> failedFindSet, 322 Multiset<PatternStringWithBoolean> matchedMatchSet, Multiset<PatternStringWithBoolean> failedMatchSet, 323 Multimap<PatternStringWithBoolean, String> occurrences) { 324 this.failedFindSet = failedFindSet; 325 this.failedMatchSet = failedMatchSet; 326 this.matchedMatchSet = matchedMatchSet; 327 this.stacktraces = occurrences; 328 this.matchedFindSet = matchedFindSet; 329 } 330 } 331 332 private static class RegexKeyWithCount implements PatternCountInterface, Comparable<PatternCountInterface> { 333 private final String pattern; 334 private final int findMatchCount; 335 private final int findFailCount; 336 private final int matchMatchCount; 337 private final int matchFailCount; 338 private final boolean calledFromRegexFinder; 339 private final Set<String> callLocations = new HashSet<>(); 340 private final int hashCode; 341 RegexKeyWithCount(PatternStringWithBoolean key, CountSets bean)342 public RegexKeyWithCount(PatternStringWithBoolean key, CountSets bean) { 343 this.pattern = key.getPattern(); 344 this.calledFromRegexFinder = key.isCalledFromRegexFinder(); 345 this.findMatchCount = bean.matchedFindSet.count(key); 346 this.findFailCount = bean.failedFindSet.count(key); 347 this.matchMatchCount = bean.matchedMatchSet.count(key); 348 this.matchFailCount = bean.failedMatchSet.count(key); 349 Collection<String> tmp = bean.stacktraces.get(key); 350 for (String cur : tmp) { 351 if (!callLocations.contains(cur)) { 352 callLocations.add(cur); 353 } 354 } 355 this.hashCode = Objects.hash(this.pattern, 356 this.findMatchCount, 357 this.findFailCount, 358 this.matchFailCount, 359 this.matchMatchCount, 360 this.calledFromRegexFinder, 361 this.callLocations); 362 } 363 getPattern()364 public String getPattern() { 365 return pattern; 366 } 367 368 @Override hashCode()369 public int hashCode() { 370 return hashCode; 371 } 372 373 @Override getNumberOfFindMatches()374 public int getNumberOfFindMatches() { 375 return findMatchCount; 376 } 377 378 @Override getNumberOfFindFailures()379 public int getNumberOfFindFailures() { 380 return findFailCount; 381 } 382 383 @Override getNumberOfMatchMatches()384 public int getNumberOfMatchMatches() { 385 return matchMatchCount; 386 } 387 388 @Override getNumberOfMatchFailures()389 public int getNumberOfMatchFailures() { 390 return matchFailCount; 391 } 392 393 @Override equals(Object obj)394 public boolean equals(Object obj) { 395 if (this == obj) { 396 return true; 397 } 398 if (obj == null) { 399 return false; 400 } 401 if (hashCode != obj.hashCode()) { 402 return false; 403 } 404 if (getClass() != obj.getClass()) { 405 return false; 406 } 407 RegexKeyWithCount other = (RegexKeyWithCount) obj; 408 if (matchFailCount != other.matchFailCount) { 409 return false; 410 } 411 if (matchMatchCount != other.matchMatchCount) { 412 return false; 413 } 414 if (findFailCount != other.findFailCount) { 415 return false; 416 } 417 if (findMatchCount != other.findMatchCount) { 418 return false; 419 } 420 if (!pattern.equals(other.pattern)) { 421 return false; 422 } 423 if (calledFromRegexFinder != other.calledFromRegexFinder) { 424 return false; 425 } 426 if (callLocations != other.callLocations) { 427 return false; 428 } 429 return true; 430 } 431 432 @Override compareTo(PatternCountInterface o)433 public int compareTo(PatternCountInterface o) { 434 if (o == null) { 435 return 1; 436 } 437 return new Integer(matchFailCount + matchMatchCount + findFailCount + findMatchCount).compareTo( 438 o.getNumberOfFindFailures() + o.getNumberOfFindMatches() + o.getNumberOfMatchFailures() + o.getNumberOfMatchMatches()); 439 } 440 441 @Override isCalledFromRegexFinder()442 public boolean isCalledFromRegexFinder() { 443 return calledFromRegexFinder; 444 } 445 446 @Override getCallLocations()447 public Set<String> getCallLocations() { 448 return callLocations; 449 } 450 451 } 452 453 public enum LogType { 454 FIND, MATCH 455 } 456 457 private static interface IterableTransformer<E, F> { transform(Iterable<E> input)458 Iterable<F> transform(Iterable<E> input); 459 } 460 461 private static class StringIterableTransformer implements IterableTransformer<String, String> { 462 463 @Override transform(Iterable<String> input)464 public Iterable<String> transform(Iterable<String> input) { 465 List<String> returned = new ArrayList<>(Iterables.size(input)); 466 String lastClass = null; 467 for (String current : input) { 468 String transformed = current; 469 if (lastClass != null) { 470 if (lastClass.startsWith("RegexLookup") && !current.startsWith("org.unicode.cldr.util.RegexLookup")) { 471 returned.add(lastClass); 472 } 473 break; 474 } 475 if (current.startsWith("org.unicode.cldr.test.CheckCLDR") && 476 !lastClass.startsWith("org.unicode.cldr.test.CheckCLDR")) { 477 lastClass = current; 478 // leave out 479 continue; 480 } 481 // remove org.unicode.cldr 482 if (current.startsWith("org.unicode.cldr.util.")) { 483 transformed = current.substring("org.unicode.cldr.util.".length()); 484 } 485 // only the last RegexLookup will be added 486 if (!transformed.startsWith("RegexLookup")) { 487 returned.add(transformed); 488 } 489 lastClass = transformed; 490 } 491 return returned; 492 } 493 } 494 495 private static class ClassnameOnlyStringTransformer implements IterableTransformer<String, String> { 496 497 @Override transform(Iterable<String> input)498 public Iterable<String> transform(Iterable<String> input) { 499 List<String> returned = new ArrayList<>(Iterables.size(input)); 500 String lastClass = null; 501 for (String current : input) { 502 if (current.lastIndexOf(".") > 0) { 503 current = current.substring(current.lastIndexOf(".")); 504 } 505 if (lastClass != null) { 506 if (lastClass.startsWith("RegexLookup") && !current.startsWith("RegexLookup")) { 507 returned.add(lastClass); 508 } 509 if (lastClass.startsWith("VettingViewer")) { 510 break; 511 } 512 if (current.startsWith("CheckCLDR") && !lastClass.startsWith("CheckCLDR")) { 513 lastClass = current; 514 // leave out 515 continue; 516 } 517 } 518 // only the last RegexLookup will be added 519 if (!current.startsWith("RegexLookup")) { 520 returned.add(current); 521 } 522 lastClass = current; 523 } 524 return returned; 525 } 526 } 527 528 /** 529 * This is the class doing the bulk of the work. 530 * @author ribnitz 531 */ 532 private static class RegexLoggerImpl extends AbstractRegexLogger { 533 534 /* 535 * Each has more than 1m hits, together they account for about 14m (of the 26m total) 536 */ 537 private static final Set<String> exactMatchSet = new HashSet<>(Arrays.asList(new String[] { 538 "^//ldml.*", 539 "^//ldml/dates.*", 540 "^//ldml/units.*", 541 "^//ldml/characters/ellipsis[@type=\"(final|initial|medial)\"]", 542 "^//ldml/characters.*", 543 "^//ldml/listPatterns/listPattern.*", 544 "^//ldml/units/unitLength[@type=\"(long|short|narrow)\"].*", 545 })); 546 private static final Set<String> patternSet = new HashSet<>(Arrays.asList(new String[] { 547 "^//ldml/dates/fields", 548 "^//ldml/dates/calendars/calendar", 549 "/(availableFormats", 550 })); 551 private final Multiset<PatternStringWithBoolean> matchedFindSet = TreeMultiset.create(); 552 private final Multiset<PatternStringWithBoolean> failedFindSet = TreeMultiset.create(); 553 private final Multiset<PatternStringWithBoolean> matchedMatchSet = TreeMultiset.create(); 554 private final Multiset<PatternStringWithBoolean> failedMatchSet = TreeMultiset.create(); 555 556 private final Multimap<PatternStringWithBoolean, String> occurrences = TreeMultimap.create(); 557 private final IterableTransformer<String, String> transformer = new StringIterableTransformer(); 558 log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls)559 public void log(String pattern, String matchStr, boolean matched, double time, LogType type, Class<?> cls) { 560 boolean isRegexFinder = findClassName("org.unicode.cldr.util.RegexLookup", 10); 561 PatternStringWithBoolean key = new PatternStringWithBoolean(pattern, isRegexFinder); 562 Collection<PatternStringWithBoolean> collectionToAdd = determineCollectionToUse(matched, type); 563 if (collectionToAdd != null) { 564 collectionToAdd.add(key); 565 } 566 if (shouldLogPattern(pattern, isRegexFinder)) { 567 addElementToList(key); 568 } 569 } 570 determineCollectionToUse(boolean matched, LogType type)571 private Collection<PatternStringWithBoolean> determineCollectionToUse(boolean matched, LogType type) { 572 Collection<PatternStringWithBoolean> collectionToAdd = null; 573 switch (type) { 574 case FIND: 575 if (matched) { 576 collectionToAdd = matchedFindSet; 577 } else { 578 collectionToAdd = failedFindSet; 579 } 580 break; 581 case MATCH: 582 if (matched) { 583 collectionToAdd = matchedMatchSet; 584 } else { 585 collectionToAdd = failedMatchSet; 586 } 587 break; 588 } 589 return collectionToAdd; 590 } 591 shouldLogPattern(String pattern, boolean isRegexFinder)592 private boolean shouldLogPattern(String pattern, boolean isRegexFinder) { 593 if (!isRegexFinder) { 594 return true; 595 } else { 596 if (exactMatchSet.contains(pattern)) { 597 return true; 598 } else { 599 for (String cur : patternSet) { 600 if (pattern.startsWith(cur)) { 601 return true; 602 } 603 } 604 } 605 } 606 return false; 607 } 608 findClassName(String className, int depth)609 private boolean findClassName(String className, int depth) { 610 StackTraceElement[] st = Thread.currentThread().getStackTrace(); 611 int startPos = (st.length > 2) ? 2 : 0; 612 int endPos = (startPos + depth > st.length) ? st.length : startPos + depth; 613 for (int i = startPos; i < endPos; i++) { 614 StackTraceElement cur = st[i]; 615 String curClass = cur.getClassName(); 616 if (curClass.startsWith(className)) { 617 return true; 618 } 619 } 620 return false; 621 } 622 623 private final static Joiner JOINER = Joiner.on(";"); 624 addElementToList(PatternStringWithBoolean key)625 private void addElementToList(PatternStringWithBoolean key) { 626 List<String> stList = processStackTrace("org.unicode.cldr.util.RegexLookup", 0); 627 628 if (!stList.isEmpty()) { 629 occurrences.put(key, JOINER.join(transformer.transform(stList))); 630 } 631 } 632 processStackTrace(String classNameToStartAt, int depth)633 private List<String> processStackTrace(String classNameToStartAt, int depth) { 634 StackTraceElement[] st = Thread.currentThread().getStackTrace(); 635 if (depth == 0) { 636 depth = st.length; 637 } 638 int startPos; 639 if (depth < 0) { 640 startPos = depth + st.length; 641 depth = Math.abs(depth); 642 } else { 643 startPos = (st.length > 2) ? 2 : 0; 644 } 645 int pos; 646 boolean found = false; 647 for (pos = startPos; pos < st.length; pos++) { 648 if (st[pos].getClassName().startsWith(classNameToStartAt)) { 649 found = true; 650 break; 651 } 652 } 653 if (!found) { 654 return Collections.emptyList(); 655 } 656 int endPos = (pos + depth > st.length) ? st.length : startPos + depth; 657 List<String> ret = new ArrayList<>(depth + 2); 658 for (int i = pos; i < endPos; i++) { 659 StackTraceElement cur = st[i]; 660 String curClass = cur.getClassName(); 661 ret.add(curClass + ":" + cur.getLineNumber()); 662 } 663 return ret; 664 } 665 getEntries(final int minCount)666 public NavigableSet<PatternCountInterface> getEntries(final int minCount) { 667 CountSets c = new CountSets(matchedFindSet, failedFindSet, matchedMatchSet, failedMatchSet, occurrences); 668 final AddAllEntryProcessor processor = (minCount == 1) ? new AddAllEntryProcessor(minCount, c) : new EntryProcessor(minCount, c); 669 for (PatternStringWithBoolean item : matchedFindSet) { 670 processor.process(item, matchedFindSet); 671 } 672 for (PatternStringWithBoolean item : failedFindSet) { 673 processor.process(item, failedFindSet); 674 } 675 for (PatternStringWithBoolean item : matchedMatchSet) { 676 processor.process(item, matchedMatchSet); 677 } 678 for (PatternStringWithBoolean item : failedMatchSet) { 679 processor.process(item, failedMatchSet); 680 } 681 return Sets.unmodifiableNavigableSet(processor.getResult()); 682 } 683 } 684 } 685