1 /* 2 * Copyright (C) 2008 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.android.mail.common.base; 18 19 import static com.google.android.mail.common.base.Preconditions.checkArgument; 20 import static com.google.android.mail.common.base.Preconditions.checkNotNull; 21 22 import java.util.ArrayList; 23 import java.util.Arrays; 24 import java.util.List; 25 26 /** 27 * Determines a true or false value for any Java {@code char} value, just as 28 * {@link Predicate} does for any {@link Object}. Also offers basic text 29 * processing methods based on this function. Implementations are strongly 30 * encouraged to be side-effect-free and immutable. 31 * 32 * <p>Throughout the documentation of this class, the phrase "matching 33 * character" is used to mean "any character {@code c} for which {@code 34 * this.matches(c)} returns {@code true}". 35 * 36 * <p><b>Note:</b> This class deals only with {@code char} values; it does not 37 * understand supplementary Unicode code points in the range {@code 0x10000} to 38 * {@code 0x10FFFF}. Such logical characters are encoded into a {@code String} 39 * using surrogate pairs, and a {@code CharMatcher} treats these just as two 40 * separate characters. 41 * 42 * @author Kevin Bourrillion 43 * @since 2009.09.15 <b>tentative</b> 44 */ 45 public abstract class CharMatcher implements Predicate<Character> { 46 47 // Constants 48 49 // Excludes 2000-2000a, which is handled as a range 50 private static final String BREAKING_WHITESPACE_CHARS = 51 "\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000"; 52 53 // Excludes 2007, which is handled as a gap in a pair of ranges 54 private static final String NON_BREAKING_WHITESPACE_CHARS = 55 "\u00a0\u180e\u202f"; 56 57 /** 58 * Determines whether a character is whitespace according to the latest 59 * Unicode standard, as illustrated 60 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>. 61 * This is not the same definition used by other Java APIs. See a comparison 62 * of several definitions of "whitespace" at 63 * <a href="TODO">(TODO)</a>. 64 * 65 * <p><b>Note:</b> as the Unicode definition evolves, we will modify this 66 * constant to keep it up to date. 67 */ 68 public static final CharMatcher WHITESPACE = 69 anyOf(BREAKING_WHITESPACE_CHARS + NON_BREAKING_WHITESPACE_CHARS) 70 .or(inRange('\u2000', '\u200a')); 71 72 /** 73 * Determines whether a character is a breaking whitespace (that is, 74 * a whitespace which can be interpreted as a break between words 75 * for formatting purposes). See {@link #WHITESPACE} for a discussion 76 * of that term. 77 * 78 * @since 2010.01.04 <b>tentative</b> 79 */ 80 public static final CharMatcher BREAKING_WHITESPACE = 81 anyOf(BREAKING_WHITESPACE_CHARS) 82 .or(inRange('\u2000', '\u2006')) 83 .or(inRange('\u2008', '\u200a')); 84 85 /** 86 * Determines whether a character is ASCII, meaning that its code point is 87 * less than 128. 88 */ 89 public static final CharMatcher ASCII = inRange('\0', '\u007f'); 90 91 /** 92 * Determines whether a character is a digit according to 93 * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. 94 */ 95 public static final CharMatcher DIGIT; 96 97 static { 98 CharMatcher digit = inRange('0', '9'); 99 String zeroes = 100 "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66" 101 + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946" 102 + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10"; 103 for (char base : zeroes.toCharArray()) { 104 digit = digit.or(inRange(base, (char) (base + 9))); 105 } 106 DIGIT = digit; 107 } 108 109 /** 110 * Determines whether a character is whitespace according to {@link 111 * Character#isWhitespace(char) Java's definition}; it is usually preferable 112 * to use {@link #WHITESPACE}. See a comparison of several definitions of 113 * "whitespace" at <a href="http://go/white+space">go/white+space</a>. 114 */ 115 public static final CharMatcher JAVA_WHITESPACE 116 = inRange('\u0009', (char) 13) // \\u000d doesn't work as a char literal 117 .or(inRange('\u001c', '\u0020')) 118 .or(is('\u1680')) 119 .or(is('\u180e')) 120 .or(inRange('\u2000', '\u2006')) 121 .or(inRange('\u2008', '\u200b')) 122 .or(inRange('\u2028', '\u2029')) 123 .or(is('\u205f')) 124 .or(is('\u3000')); 125 126 /** 127 * Determines whether a character is a digit according to {@link 128 * Character#isDigit(char) Java's definition}. If you only care to match 129 * ASCII digits, you can use {@code inRange('0', '9')}. 130 */ 131 public static final CharMatcher JAVA_DIGIT = new CharMatcher() { 132 @Override public boolean matches(char c) { 133 return Character.isDigit(c); 134 } 135 }; 136 137 /** 138 * Determines whether a character is a letter according to {@link 139 * Character#isLetter(char) Java's definition}. If you only care to match 140 * letters of the Latin alphabet, you can use {@code 141 * inRange('a', 'z').or(inRange('A', 'Z'))}. 142 */ 143 public static final CharMatcher JAVA_LETTER = new CharMatcher() { 144 @Override public boolean matches(char c) { 145 return Character.isLetter(c); 146 } 147 }; 148 149 /** 150 * Determines whether a character is a letter or digit according to {@link 151 * Character#isLetterOrDigit(char) Java's definition}. 152 */ 153 public static final CharMatcher JAVA_LETTER_OR_DIGIT = new CharMatcher() { 154 @Override public boolean matches(char c) { 155 return Character.isLetterOrDigit(c); 156 } 157 }; 158 159 /** 160 * Determines whether a character is upper case according to {@link 161 * Character#isUpperCase(char) Java's definition}. 162 */ 163 public static final CharMatcher JAVA_UPPER_CASE = new CharMatcher() { 164 @Override public boolean matches(char c) { 165 return Character.isUpperCase(c); 166 } 167 }; 168 169 /** 170 * Determines whether a character is lower case according to {@link 171 * Character#isLowerCase(char) Java's definition}. 172 */ 173 public static final CharMatcher JAVA_LOWER_CASE = new CharMatcher() { 174 @Override public boolean matches(char c) { 175 return Character.isLowerCase(c); 176 } 177 }; 178 179 /** 180 * Determines whether a character is an ISO control character according to 181 * {@link Character#isISOControl(char)}. 182 */ 183 public static final CharMatcher JAVA_ISO_CONTROL = inRange('\u0000', '\u001f') 184 .or(inRange('\u007f', '\u009f')); 185 186 /** 187 * Determines whether a character is invisible; that is, if its Unicode 188 * category is any of SPACE_SEPARATOR, LINE_SEPARATOR, 189 * PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and PRIVATE_USE according 190 * to ICU4J. 191 */ 192 public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020') 193 .or(inRange('\u007f', '\u00a0')) 194 .or(is('\u00ad')) 195 .or(inRange('\u0600', '\u0603')) 196 .or(anyOf("\u06dd\u070f\u1680\u17b4\u17b5\u180e")) 197 .or(inRange('\u2000', '\u200f')) 198 .or(inRange('\u2028', '\u202f')) 199 .or(inRange('\u205f', '\u2064')) 200 .or(inRange('\u206a', '\u206f')) 201 .or(is('\u3000')) 202 .or(inRange('\ud800', '\uf8ff')) 203 .or(anyOf("\ufeff\ufff9\ufffa\ufffb")); 204 205 /** 206 * Determines whether a character is single-width (not double-width). When 207 * in doubt, this matcher errs on the side of returning {@code false} (that 208 * is, it tends to assume a character is double-width). 209 * 210 * <b>Note:</b> as the reference file evolves, we will modify this constant 211 * to keep it up to date. 212 */ 213 public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9') 214 .or(is('\u05be')) 215 .or(inRange('\u05d0', '\u05ea')) 216 .or(is('\u05f3')) 217 .or(is('\u05f4')) 218 .or(inRange('\u0600', '\u06ff')) 219 .or(inRange('\u0750', '\u077f')) 220 .or(inRange('\u0e00', '\u0e7f')) 221 .or(inRange('\u1e00', '\u20af')) 222 .or(inRange('\u2100', '\u213a')) 223 .or(inRange('\ufb50', '\ufdff')) 224 .or(inRange('\ufe70', '\ufeff')) 225 .or(inRange('\uff61', '\uffdc')); 226 227 /** 228 * Determines whether a character is whitespace according to an arbitrary definition used by 229 * {@link StringUtil} for years. Most likely you don't want to use this. See a comparison of 230 * several definitions of "whitespace" at <a href="http://goto/white space">goto/white space</a>. 231 * 232 * <p><b>To be deprecated.</b> use {@link #WHITESPACE} to switch to the Unicode definition, or 233 * create a matcher for the specific characters you want. Not deprecated yet because it is a 234 * stepping stone for getting off of many deprecated {@link StringUtil} methods. 235 */ 236 @Deprecated 237 public static final CharMatcher LEGACY_WHITESPACE = 238 anyOf(" \r\n\t\u3000\u00A0\u2007\u202F").precomputed(); 239 240 241 /** Matches any character. */ 242 public static final CharMatcher ANY = new CharMatcher() { 243 @Override public boolean matches(char c) { 244 return true; 245 } 246 247 @Override public int indexIn(CharSequence sequence) { 248 return (sequence.length() == 0) ? -1 : 0; 249 } 250 @Override public int indexIn(CharSequence sequence, int start) { 251 int length = sequence.length(); 252 Preconditions.checkPositionIndex(start, length); 253 return (start == length) ? -1 : start; 254 } 255 @Override public int lastIndexIn(CharSequence sequence) { 256 return sequence.length() - 1; 257 } 258 @Override public boolean matchesAllOf(CharSequence sequence) { 259 checkNotNull(sequence); 260 return true; 261 } 262 @Override public boolean matchesNoneOf(CharSequence sequence) { 263 return sequence.length() == 0; 264 } 265 @Override public String removeFrom(CharSequence sequence) { 266 checkNotNull(sequence); 267 return ""; 268 } 269 @Override public String replaceFrom( 270 CharSequence sequence, char replacement) { 271 char[] array = new char[sequence.length()]; 272 Arrays.fill(array, replacement); 273 return new String(array); 274 } 275 @Override public String replaceFrom( 276 CharSequence sequence, CharSequence replacement) { 277 StringBuilder retval = new StringBuilder(sequence.length() * replacement.length()); 278 for (int i = 0; i < sequence.length(); i++) { 279 retval.append(replacement); 280 } 281 return retval.toString(); 282 } 283 @Override public String collapseFrom(CharSequence sequence, char replacement) { 284 return (sequence.length() == 0) ? "" : String.valueOf(replacement); 285 } 286 @Override public String trimFrom(CharSequence sequence) { 287 checkNotNull(sequence); 288 return ""; 289 } 290 @Override public int countIn(CharSequence sequence) { 291 return sequence.length(); 292 } 293 @Override public CharMatcher and(CharMatcher other) { 294 return checkNotNull(other); 295 } 296 @Override public CharMatcher or(CharMatcher other) { 297 checkNotNull(other); 298 return this; 299 } 300 @Override public CharMatcher negate() { 301 return NONE; 302 } 303 @Override public CharMatcher precomputed() { 304 return this; 305 } 306 }; 307 308 /** Matches no characters. */ 309 public static final CharMatcher NONE = new CharMatcher() { 310 @Override public boolean matches(char c) { 311 return false; 312 } 313 314 @Override public int indexIn(CharSequence sequence) { 315 checkNotNull(sequence); 316 return -1; 317 } 318 @Override public int indexIn(CharSequence sequence, int start) { 319 int length = sequence.length(); 320 Preconditions.checkPositionIndex(start, length); 321 return -1; 322 } 323 @Override public int lastIndexIn(CharSequence sequence) { 324 checkNotNull(sequence); 325 return -1; 326 } 327 @Override public boolean matchesAllOf(CharSequence sequence) { 328 return sequence.length() == 0; 329 } 330 @Override public boolean matchesNoneOf(CharSequence sequence) { 331 checkNotNull(sequence); 332 return true; 333 } 334 @Override public String removeFrom(CharSequence sequence) { 335 return sequence.toString(); 336 } 337 @Override public String replaceFrom( 338 CharSequence sequence, char replacement) { 339 return sequence.toString(); 340 } 341 @Override public String replaceFrom( 342 CharSequence sequence, CharSequence replacement) { 343 checkNotNull(replacement); 344 return sequence.toString(); 345 } 346 @Override public String collapseFrom( 347 CharSequence sequence, char replacement) { 348 return sequence.toString(); 349 } 350 @Override public String trimFrom(CharSequence sequence) { 351 return sequence.toString(); 352 } 353 @Override public int countIn(CharSequence sequence) { 354 checkNotNull(sequence); 355 return 0; 356 } 357 @Override public CharMatcher and(CharMatcher other) { 358 checkNotNull(other); 359 return this; 360 } 361 @Override public CharMatcher or(CharMatcher other) { 362 return checkNotNull(other); 363 } 364 @Override public CharMatcher negate() { 365 return ANY; 366 } 367 @Override protected void setBits(LookupTable table) { 368 } 369 @Override public CharMatcher precomputed() { 370 return this; 371 } 372 }; 373 374 // Static factories 375 376 /** 377 * Returns a {@code char} matcher that matches only one specified character. 378 */ is(final char match)379 public static CharMatcher is(final char match) { 380 return new CharMatcher() { 381 @Override public boolean matches(char c) { 382 return c == match; 383 } 384 385 @Override public String replaceFrom( 386 CharSequence sequence, char replacement) { 387 return sequence.toString().replace(match, replacement); 388 } 389 @Override public CharMatcher and(CharMatcher other) { 390 return other.matches(match) ? this : NONE; 391 } 392 @Override public CharMatcher or(CharMatcher other) { 393 return other.matches(match) ? other : super.or(other); 394 } 395 @Override public CharMatcher negate() { 396 return isNot(match); 397 } 398 @Override protected void setBits(LookupTable table) { 399 table.set(match); 400 } 401 @Override public CharMatcher precomputed() { 402 return this; 403 } 404 }; 405 } 406 407 /** 408 * Returns a {@code char} matcher that matches any character except the one 409 * specified. 410 * 411 * <p>To negate another {@code CharMatcher}, use {@link #negate()}. 412 */ 413 public static CharMatcher isNot(final char match) { 414 return new CharMatcher() { 415 @Override public boolean matches(char c) { 416 return c != match; 417 } 418 419 @Override public CharMatcher and(CharMatcher other) { 420 return other.matches(match) ? super.and(other) : other; 421 } 422 @Override public CharMatcher or(CharMatcher other) { 423 return other.matches(match) ? ANY : this; 424 } 425 @Override public CharMatcher negate() { 426 return is(match); 427 } 428 }; 429 } 430 431 /** 432 * Returns a {@code char} matcher that matches any character present in the 433 * given character sequence. 434 */ 435 public static CharMatcher anyOf(final CharSequence sequence) { 436 switch (sequence.length()) { 437 case 0: 438 return NONE; 439 case 1: 440 return is(sequence.charAt(0)); 441 case 2: 442 final char match1 = sequence.charAt(0); 443 final char match2 = sequence.charAt(1); 444 return new CharMatcher() { 445 @Override public boolean matches(char c) { 446 return c == match1 || c == match2; 447 } 448 @Override protected void setBits(LookupTable table) { 449 table.set(match1); 450 table.set(match2); 451 } 452 @Override public CharMatcher precomputed() { 453 return this; 454 } 455 }; 456 } 457 458 final char[] chars = sequence.toString().toCharArray(); 459 Arrays.sort(chars); // not worth collapsing duplicates 460 461 return new CharMatcher() { 462 @Override public boolean matches(char c) { 463 return Arrays.binarySearch(chars, c) >= 0; 464 } 465 @Override protected void setBits(LookupTable table) { 466 for (char c : chars) { 467 table.set(c); 468 } 469 } 470 }; 471 } 472 473 /** 474 * Returns a {@code char} matcher that matches any character not present in 475 * the given character sequence. 476 */ 477 public static CharMatcher noneOf(CharSequence sequence) { 478 return anyOf(sequence).negate(); 479 } 480 481 /** 482 * Returns a {@code char} matcher that matches any character in a given range 483 * (both endpoints are inclusive). For example, to match any lowercase letter 484 * of the English alphabet, use {@code CharMatcher.inRange('a', 'z')}. 485 * 486 * @throws IllegalArgumentException if {@code endInclusive < startInclusive} 487 */ 488 public static CharMatcher inRange( 489 final char startInclusive, final char endInclusive) { 490 checkArgument(endInclusive >= startInclusive); 491 return new CharMatcher() { 492 @Override public boolean matches(char c) { 493 return startInclusive <= c && c <= endInclusive; 494 } 495 @Override protected void setBits(LookupTable table) { 496 char c = startInclusive; 497 while (true) { 498 table.set(c); 499 if (c++ == endInclusive) { 500 break; 501 } 502 } 503 } 504 @Override public CharMatcher precomputed() { 505 return this; 506 } 507 }; 508 } 509 510 /** 511 * Returns a matcher with identical behavior to the given {@link 512 * Character}-based predicate, but which operates on primitive {@code char} 513 * instances instead. 514 */ 515 public static CharMatcher forPredicate( 516 final Predicate<? super Character> predicate) { 517 checkNotNull(predicate); 518 if (predicate instanceof CharMatcher) { 519 return (CharMatcher) predicate; 520 } 521 return new CharMatcher() { 522 @Override public boolean matches(char c) { 523 return predicate.apply(c); 524 } 525 @Override public boolean apply(Character character) { 526 return predicate.apply(checkNotNull(character)); 527 } 528 }; 529 } 530 531 // Abstract methods 532 533 /** Determines a true or false value for the given character. */ 534 public abstract boolean matches(char c); 535 536 // Non-static factories 537 538 /** 539 * Returns a matcher that matches any character not matched by this matcher. 540 */ 541 public CharMatcher negate() { 542 final CharMatcher original = this; 543 return new CharMatcher() { 544 @Override public boolean matches(char c) { 545 return !original.matches(c); 546 } 547 548 @Override public boolean matchesAllOf(CharSequence sequence) { 549 return original.matchesNoneOf(sequence); 550 } 551 @Override public boolean matchesNoneOf(CharSequence sequence) { 552 return original.matchesAllOf(sequence); 553 } 554 @Override public int countIn(CharSequence sequence) { 555 return sequence.length() - original.countIn(sequence); 556 } 557 @Override public CharMatcher negate() { 558 return original; 559 } 560 }; 561 } 562 563 /** 564 * Returns a matcher that matches any character matched by both this matcher 565 * and {@code other}. 566 */ 567 public CharMatcher and(CharMatcher other) { 568 return new And(Arrays.asList(this, checkNotNull(other))); 569 } 570 571 private static class And extends CharMatcher { 572 List<CharMatcher> components; 573 574 And(List<CharMatcher> components) { 575 this.components = components; // Skip defensive copy (private) 576 } 577 578 @Override public boolean matches(char c) { 579 for (CharMatcher matcher : components) { 580 if (!matcher.matches(c)) { 581 return false; 582 } 583 } 584 return true; 585 } 586 587 @Override public CharMatcher and(CharMatcher other) { 588 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 589 newComponents.add(checkNotNull(other)); 590 return new And(newComponents); 591 } 592 } 593 594 /** 595 * Returns a matcher that matches any character matched by either this matcher 596 * or {@code other}. 597 */ 598 public CharMatcher or(CharMatcher other) { 599 return new Or(Arrays.asList(this, checkNotNull(other))); 600 } 601 602 private static class Or extends CharMatcher { 603 List<CharMatcher> components; 604 605 Or(List<CharMatcher> components) { 606 this.components = components; // Skip defensive copy (private) 607 } 608 609 @Override public boolean matches(char c) { 610 for (CharMatcher matcher : components) { 611 if (matcher.matches(c)) { 612 return true; 613 } 614 } 615 return false; 616 } 617 618 @Override public CharMatcher or(CharMatcher other) { 619 List<CharMatcher> newComponents = new ArrayList<CharMatcher>(components); 620 newComponents.add(checkNotNull(other)); 621 return new Or(newComponents); 622 } 623 624 @Override protected void setBits(LookupTable table) { 625 for (CharMatcher matcher : components) { 626 matcher.setBits(table); 627 } 628 } 629 } 630 631 /** 632 * Returns a {@code char} matcher functionally equivalent to this one, but 633 * which may be faster to query than the original; your mileage may vary. 634 * Precomputation takes time and is likely to be worthwhile only if the 635 * precomputed matcher is queried many thousands of times. 636 * 637 * <p>This method has no effect (returns {@code this}) when called in GWT: 638 * it's unclear whether a precomputed matcher is faster, but it certainly 639 * consumes more memory, which doesn't seem like a worthwhile tradeoff in a 640 * browser. 641 */ 642 public CharMatcher precomputed() { 643 return Platform.precomputeCharMatcher(this); 644 } 645 646 /** 647 * This is the actual implementation of {@link #precomputed}, but we bounce 648 * calls through a method on {@link Platform} so that we can have different 649 * behavior in GWT. 650 * 651 * <p>The default precomputation is to cache the configuration of the original 652 * matcher in an eight-kilobyte bit array. In some situations this produces a 653 * matcher which is faster to query than the original. 654 * 655 * <p>The default implementation creates a new bit array and passes it to 656 * {@link #setBits(LookupTable)}. 657 */ 658 CharMatcher precomputedInternal() { 659 final LookupTable table = new LookupTable(); 660 setBits(table); 661 662 return new CharMatcher() { 663 @Override public boolean matches(char c) { 664 return table.get(c); 665 } 666 667 // TODO: make methods like negate() smart 668 669 @Override public CharMatcher precomputed() { 670 return this; 671 } 672 }; 673 } 674 675 /** 676 * For use by implementors; sets the bit corresponding to each character ('\0' 677 * to '{@literal \}uFFFF') that matches this matcher in the given bit array, 678 * leaving all other bits untouched. 679 * 680 * <p>The default implementation loops over every possible character value, 681 * invoking {@link #matches} for each one. 682 */ 683 protected void setBits(LookupTable table) { 684 char c = Character.MIN_VALUE; 685 while (true) { 686 if (matches(c)) { 687 table.set(c); 688 } 689 if (c++ == Character.MAX_VALUE) { 690 break; 691 } 692 } 693 } 694 695 /** 696 * A bit array with one bit per {@code char} value, used by {@link 697 * CharMatcher#precomputed}. 698 * 699 * <p>TODO: possibly share a common BitArray class with BloomFilter 700 * and others... a simpler java.util.BitSet. 701 */ 702 protected static class LookupTable { 703 int[] data = new int[2048]; 704 705 void set(char index) { 706 data[index >> 5] |= (1 << index); 707 } 708 boolean get(char index) { 709 return (data[index >> 5] & (1 << index)) != 0; 710 } 711 } 712 713 // Text processing routines 714 715 /** 716 * Returns {@code true} if a character sequence contains only matching 717 * characters. 718 * 719 * <p>The default implementation iterates over the sequence, invoking {@link 720 * #matches} for each character, until this returns {@code false} or the end 721 * is reached. 722 * 723 * @param sequence the character sequence to examine, possibly empty 724 * @return {@code true} if this matcher matches every character in the 725 * sequence, including when the sequence is empty 726 */ 727 public boolean matchesAllOf(CharSequence sequence) { 728 for (int i = sequence.length() - 1; i >= 0; i--) { 729 if (!matches(sequence.charAt(i))) { 730 return false; 731 } 732 } 733 return true; 734 } 735 736 /** 737 * Returns {@code true} if a character sequence contains no matching 738 * characters. 739 * 740 * <p>The default implementation iterates over the sequence, invoking {@link 741 * #matches} for each character, until this returns {@code false} or the end is 742 * reached. 743 * 744 * @param sequence the character sequence to examine, possibly empty 745 * @return {@code true} if this matcher matches every character in the 746 * sequence, including when the sequence is empty 747 */ 748 public boolean matchesNoneOf(CharSequence sequence) { 749 return indexIn(sequence) == -1; 750 } 751 752 // TODO: perhaps add matchesAnyOf() 753 754 /** 755 * Returns the index of the first matching character in a character sequence, 756 * or {@code -1} if no matching character is present. 757 * 758 * <p>The default implementation iterates over the sequence in forward order 759 * calling {@link #matches} for each character. 760 * 761 * @param sequence the character sequence to examine from the beginning 762 * @return an index, or {@code -1} if no character matches 763 */ 764 public int indexIn(CharSequence sequence) { 765 int length = sequence.length(); 766 for (int i = 0; i < length; i++) { 767 if (matches(sequence.charAt(i))) { 768 return i; 769 } 770 } 771 return -1; 772 } 773 774 /** 775 * Returns the index of the first matching character in a character sequence, 776 * starting from a given position, or {@code -1} if no character matches after 777 * that position. 778 * 779 * <p>The default implementation iterates over the sequence in forward order, 780 * beginning at {@code start}, calling {@link #matches} for each character. 781 * 782 * @param sequence the character sequence to examine 783 * @param start the first index to examine; must be nonnegative and no 784 * greater than {@code sequence.length()} 785 * @return the index of the first matching character, guaranteed to be no less 786 * than {@code start}, or {@code -1} if no character matches 787 * @throws IndexOutOfBoundsException if start is negative or greater than 788 * {@code sequence.length()} 789 */ 790 public int indexIn(CharSequence sequence, int start) { 791 int length = sequence.length(); 792 Preconditions.checkPositionIndex(start, length); 793 for (int i = start; i < length; i++) { 794 if (matches(sequence.charAt(i))) { 795 return i; 796 } 797 } 798 return -1; 799 } 800 801 /** 802 * Returns the index of the last matching character in a character sequence, 803 * or {@code -1} if no matching character is present. 804 * 805 * <p>The default implementation iterates over the sequence in reverse order 806 * calling {@link #matches} for each character. 807 * 808 * @param sequence the character sequence to examine from the end 809 * @return an index, or {@code -1} if no character matches 810 */ 811 public int lastIndexIn(CharSequence sequence) { 812 for (int i = sequence.length() - 1; i >= 0; i--) { 813 if (matches(sequence.charAt(i))) { 814 return i; 815 } 816 } 817 return -1; 818 } 819 820 /** 821 * Returns the number of matching characters found in a character sequence. 822 */ 823 public int countIn(CharSequence sequence) { 824 int count = 0; 825 for (int i = 0; i < sequence.length(); i++) { 826 if (matches(sequence.charAt(i))) { 827 count++; 828 } 829 } 830 return count; 831 } 832 833 /** 834 * Returns a string containing all non-matching characters of a character 835 * sequence, in order. For example: <pre> {@code 836 * 837 * CharMatcher.is('a').removeFrom("bazaar")}</pre> 838 * 839 * ... returns {@code "bzr"}. 840 */ 841 public String removeFrom(CharSequence sequence) { 842 String string = sequence.toString(); 843 int pos = indexIn(string); 844 if (pos == -1) { 845 return string; 846 } 847 848 char[] chars = string.toCharArray(); 849 int spread = 1; 850 851 // This unusual loop comes from extensive benchmarking 852 OUT: 853 while (true) { 854 pos++; 855 while (true) { 856 if (pos == chars.length) { 857 break OUT; 858 } 859 if (matches(chars[pos])) { 860 break; 861 } 862 chars[pos - spread] = chars[pos]; 863 pos++; 864 } 865 spread++; 866 } 867 return new String(chars, 0, pos - spread); 868 } 869 870 /** 871 * Returns a string containing all matching characters of a character 872 * sequence, in order. For example: <pre> {@code 873 * 874 * CharMatcher.is('a').retainFrom("bazaar")}</pre> 875 * 876 * ... returns {@code "aaa"}. 877 */ 878 public String retainFrom(CharSequence sequence) { 879 return negate().removeFrom(sequence); 880 } 881 882 /** 883 * Returns a string copy of the input character sequence, with each character 884 * that matches this matcher replaced by a given replacement character. For 885 * example: <pre> {@code 886 * 887 * CharMatcher.is('a').replaceFrom("radar", 'o')}</pre> 888 * 889 * ... returns {@code "rodor"}. 890 * 891 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find 892 * the first matching character, then iterates the remainder of the sequence 893 * calling {@link #matches(char)} for each character. 894 * 895 * @param sequence the character sequence to replace matching characters in 896 * @param replacement the character to append to the result string in place of 897 * each matching character in {@code sequence} 898 * @return the new string 899 */ 900 public String replaceFrom(CharSequence sequence, char replacement) { 901 String string = sequence.toString(); 902 int pos = indexIn(string); 903 if (pos == -1) { 904 return string; 905 } 906 char[] chars = string.toCharArray(); 907 chars[pos] = replacement; 908 for (int i = pos + 1; i < chars.length; i++) { 909 if (matches(chars[i])) { 910 chars[i] = replacement; 911 } 912 } 913 return new String(chars); 914 } 915 916 /** 917 * Returns a string copy of the input character sequence, with each character 918 * that matches this matcher replaced by a given replacement sequence. For 919 * example: <pre> {@code 920 * 921 * CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre> 922 * 923 * ... returns {@code "yoohoo"}. 924 * 925 * <p><b>Note:</b> If the replacement is a fixed string with only one character, 926 * you are better off calling {@link #replaceFrom(CharSequence, char)} directly. 927 * 928 * @param sequence the character sequence to replace matching characters in 929 * @param replacement the characters to append to the result string in place 930 * of each matching character in {@code sequence} 931 * @return the new string 932 */ 933 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 934 int replacementLen = replacement.length(); 935 if (replacementLen == 0) { 936 return removeFrom(sequence); 937 } 938 if (replacementLen == 1) { 939 return replaceFrom(sequence, replacement.charAt(0)); 940 } 941 942 String string = sequence.toString(); 943 int pos = indexIn(string); 944 if (pos == -1) { 945 return string; 946 } 947 948 int len = string.length(); 949 StringBuilder buf = new StringBuilder((int) (len * 1.5) + 16); 950 951 int oldpos = 0; 952 do { 953 buf.append(string, oldpos, pos); 954 buf.append(replacement); 955 oldpos = pos + 1; 956 pos = indexIn(string, oldpos); 957 } while (pos != -1); 958 959 buf.append(string, oldpos, len); 960 return buf.toString(); 961 } 962 963 /** 964 * Returns a substring of the input character sequence that omits all 965 * characters this matcher matches from the beginning and from the end of the 966 * string. For example: <pre> {@code 967 * 968 * CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre> 969 * 970 * ... returns {@code "cat"}. 971 * 972 * <p>Note that<pre> {@code 973 * 974 * CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre> 975 * 976 * ... is equivalent to {@link String#trim()}. 977 */ 978 public String trimFrom(CharSequence sequence) { 979 int len = sequence.length(); 980 int first; 981 int last; 982 983 for (first = 0; first < len; first++) { 984 if (!matches(sequence.charAt(first))) { 985 break; 986 } 987 } 988 for (last = len - 1; last > first; last--) { 989 if (!matches(sequence.charAt(last))) { 990 break; 991 } 992 } 993 994 return sequence.subSequence(first, last + 1).toString(); 995 } 996 997 /** 998 * Returns a substring of the input character sequence that omits all 999 * characters this matcher matches from the beginning of the 1000 * string. For example: <pre> {@code 1001 * 1002 * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre> 1003 * 1004 * ... returns {@code "catbab"}. 1005 */ 1006 public String trimLeadingFrom(CharSequence sequence) { 1007 int len = sequence.length(); 1008 int first; 1009 1010 for (first = 0; first < len; first++) { 1011 if (!matches(sequence.charAt(first))) { 1012 break; 1013 } 1014 } 1015 1016 return sequence.subSequence(first, len).toString(); 1017 } 1018 1019 /** 1020 * Returns a substring of the input character sequence that omits all 1021 * characters this matcher matches from the end of the 1022 * string. For example: <pre> {@code 1023 * 1024 * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre> 1025 * 1026 * ... returns {@code "abacat"}. 1027 */ 1028 public String trimTrailingFrom(CharSequence sequence) { 1029 int len = sequence.length(); 1030 int last; 1031 1032 for (last = len - 1; last >= 0; last--) { 1033 if (!matches(sequence.charAt(last))) { 1034 break; 1035 } 1036 } 1037 1038 return sequence.subSequence(0, last + 1).toString(); 1039 } 1040 1041 /** 1042 * Returns a string copy of the input character sequence, with each group of 1043 * consecutive characters that match this matcher replaced by a single 1044 * replacement character. For example: <pre> {@code 1045 * 1046 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre> 1047 * 1048 * ... returns {@code "b-p-r"}. 1049 * 1050 * <p>The default implementation uses {@link #indexIn(CharSequence)} to find 1051 * the first matching character, then iterates the remainder of the sequence 1052 * calling {@link #matches(char)} for each character. 1053 * 1054 * @param sequence the character sequence to replace matching groups of 1055 * characters in 1056 * @param replacement the character to append to the result string in place of 1057 * each group of matching characters in {@code sequence} 1058 * @return the new string 1059 */ 1060 public String collapseFrom(CharSequence sequence, char replacement) { 1061 int first = indexIn(sequence); 1062 if (first == -1) { 1063 return sequence.toString(); 1064 } 1065 1066 // TODO: this implementation can probably be made faster. 1067 1068 StringBuilder builder = new StringBuilder(sequence.length()) 1069 .append(sequence.subSequence(0, first)) 1070 .append(replacement); 1071 boolean in = true; 1072 for (int i = first + 1; i < sequence.length(); i++) { 1073 char c = sequence.charAt(i); 1074 if (apply(c)) { 1075 if (!in) { 1076 builder.append(replacement); 1077 in = true; 1078 } 1079 } else { 1080 builder.append(c); 1081 in = false; 1082 } 1083 } 1084 return builder.toString(); 1085 } 1086 1087 /** 1088 * Collapses groups of matching characters exactly as {@link #collapseFrom} 1089 * does, except that groups of matching characters at the start or end of the 1090 * sequence are removed without replacement. 1091 */ 1092 public String trimAndCollapseFrom(CharSequence sequence, char replacement) { 1093 int first = negate().indexIn(sequence); 1094 if (first == -1) { 1095 return ""; // everything matches. nothing's left. 1096 } 1097 StringBuilder builder = new StringBuilder(sequence.length()); 1098 boolean inMatchingGroup = false; 1099 for (int i = first; i < sequence.length(); i++) { 1100 char c = sequence.charAt(i); 1101 if (apply(c)) { 1102 inMatchingGroup = true; 1103 } else { 1104 if (inMatchingGroup) { 1105 builder.append(replacement); 1106 inMatchingGroup = false; 1107 } 1108 builder.append(c); 1109 } 1110 } 1111 return builder.toString(); 1112 } 1113 1114 // Predicate interface 1115 1116 /** 1117 * Returns {@code true} if this matcher matches the given character. 1118 * 1119 * @throws NullPointerException if {@code character} is null 1120 */ 1121 /*@Override*/ public boolean apply(Character character) { 1122 return matches(character); 1123 } 1124 }