1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Oracle designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Oracle in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 25 */ 26 27 package java.util.regex; 28 29 import com.android.icu.util.regex.MatcherNative; 30 31 /** 32 * An engine that performs match operations on a {@linkplain java.lang.CharSequence 33 * character sequence} by interpreting a {@link Pattern}. 34 * 35 * <p> A matcher is created from a pattern by invoking the pattern's {@link 36 * Pattern#matcher matcher} method. Once created, a matcher can be used to 37 * perform three different kinds of match operations: 38 * 39 * <ul> 40 * 41 * <li><p> The {@link #matches matches} method attempts to match the entire 42 * input sequence against the pattern. 
</p></li>
 *
 * <li><p> The {@link #lookingAt lookingAt} method attempts to match the
 * input sequence, starting at the beginning, against the pattern. </p></li>
 *
 * <li><p> The {@link #find find} method scans the input sequence looking for
 * the next subsequence that matches the pattern. </p></li>
 *
 * </ul>
 *
 * <p> Each of these methods returns a boolean indicating success or failure.
 * More information about a successful match can be obtained by querying the
 * state of the matcher.
 *
 * <p> A matcher finds matches in a subset of its input called the
 * <i>region</i>. By default, the region contains all of the matcher's input.
 * The region can be modified via the {@link #region region} method and queried
 * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
 * methods. The way that the region boundaries interact with some pattern
 * constructs can be changed. See {@link #useAnchoringBounds
 * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
 * for more details.
 *
 * <p> This class also defines methods for replacing matched subsequences with
 * new strings whose contents can, if desired, be computed from the match
 * result. The {@link #appendReplacement appendReplacement} and {@link
 * #appendTail appendTail} methods can be used in tandem in order to collect
 * the result into an existing string buffer, or the more convenient {@link
 * #replaceAll replaceAll} method can be used to create a string in which every
 * matching subsequence in the input sequence is replaced.
 *
 * <p> The explicit state of a matcher includes the start and end indices of
 * the most recent successful match. It also includes the start and end
 * indices of the input subsequence captured by each <a
 * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
 * count of such subsequences.
As a convenience, methods are also provided for 78 * returning these captured subsequences in string form. 79 * 80 * <p> The explicit state of a matcher is initially undefined; attempting to 81 * query any part of it before a successful match will cause an {@link 82 * IllegalStateException} to be thrown. The explicit state of a matcher is 83 * recomputed by every match operation. 84 * 85 * <p> The implicit state of a matcher includes the input character sequence as 86 * well as the <i>append position</i>, which is initially zero and is updated 87 * by the {@link #appendReplacement appendReplacement} method. 88 * 89 * <p> A matcher may be reset explicitly by invoking its {@link #reset()} 90 * method or, if a new input sequence is desired, its {@link 91 * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a 92 * matcher discards its explicit state information and sets the append position 93 * to zero. 94 * 95 * <p> Instances of this class are not safe for use by multiple concurrent 96 * threads. </p> 97 * 98 * 99 * @author Mike McCloskey 100 * @author Mark Reinhold 101 * @author JSR-51 Expert Group 102 * @since 1.4 103 * @spec JSR-51 104 */ 105 106 public final class Matcher implements MatchResult { 107 108 /** 109 * The Pattern object that created this Matcher. 110 */ 111 private Pattern parentPattern; 112 113 /** 114 * Holds the offsets for the most recent match. 115 */ 116 int[] groups; 117 118 /** 119 * The range within the sequence that is to be matched (between 0 120 * and text.length()). 121 */ 122 int from, to; 123 124 /** 125 * Holds the input text. 126 */ 127 String text; 128 129 /** 130 * Reflects whether a match has been found during the most recent find 131 * operation. 132 */ 133 private boolean matchFound; 134 135 private MatcherNative nativeMatcher; 136 137 /** 138 * The index of the last position appended in a substitution. 139 */ 140 int appendPos = 0; 141 142 /** 143 * Holds the original CharSequence for use in {@link #reset}. 
{@link #text} is used during 144 * matching. Note that CharSequence is mutable while String is not, so reset can cause the input 145 * to match to change. 146 */ 147 private CharSequence originalInput; 148 149 /** 150 * If transparentBounds is true then the boundaries of this 151 * matcher's region are transparent to lookahead, lookbehind, 152 * and boundary matching constructs that try to see beyond them. 153 */ 154 boolean transparentBounds = false; 155 156 /** 157 * If anchoringBounds is true then the boundaries of this 158 * matcher's region match anchors such as ^ and $. 159 */ 160 boolean anchoringBounds = true; 161 162 /** 163 * All matchers have the state used by Pattern during a match. 164 */ Matcher(Pattern parent, CharSequence text)165 Matcher(Pattern parent, CharSequence text) { 166 usePattern(parent); 167 reset(text); 168 } 169 170 /** 171 * Returns the pattern that is interpreted by this matcher. 172 * 173 * @return The pattern for which this matcher was created 174 */ pattern()175 public Pattern pattern() { 176 return parentPattern; 177 } 178 179 /** 180 * Returns the match state of this matcher as a {@link MatchResult}. 181 * The result is unaffected by subsequent operations performed upon this 182 * matcher. 183 * 184 * @return a <code>MatchResult</code> with the state of this matcher 185 * @since 1.5 186 */ toMatchResult()187 public MatchResult toMatchResult() { 188 ensureMatch(); 189 return new OffsetBasedMatchResult(text, groups); 190 } 191 192 /** 193 * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to 194 * find matches with. 195 * 196 * <p> This method causes this matcher to lose information 197 * about the groups of the last match that occurred. 
The 198 * matcher's position in the input is maintained and its 199 * last append position is unaffected.</p> 200 * 201 * @param newPattern 202 * The new pattern used by this matcher 203 * @return This matcher 204 * @throws IllegalArgumentException 205 * If newPattern is <tt>null</tt> 206 * @since 1.5 207 */ usePattern(Pattern newPattern)208 public Matcher usePattern(Pattern newPattern) { 209 if (newPattern == null) 210 throw new IllegalArgumentException("Pattern cannot be null"); 211 212 synchronized (this) { 213 // may throw 214 nativeMatcher = MatcherNative.create(newPattern.nativePattern); 215 } 216 parentPattern = newPattern; 217 218 if (text != null) { 219 resetForInput(); 220 } 221 222 groups = new int[(groupCount() + 1) * 2]; 223 matchFound = false; 224 return this; 225 } 226 227 /** 228 * Resets this matcher. 229 * 230 * <p> Resetting a matcher discards all of its explicit state information 231 * and sets its append position to zero. The matcher's region is set to the 232 * default region, which is its entire character sequence. The anchoring 233 * and transparency of this matcher's region boundaries are unaffected. 234 * 235 * @return This matcher 236 */ reset()237 public Matcher reset() { 238 return reset(originalInput, 0, originalInput.length()); 239 } 240 241 /** 242 * Resets this matcher with a new input sequence. 243 * 244 * <p> Resetting a matcher discards all of its explicit state information 245 * and sets its append position to zero. The matcher's region is set to 246 * the default region, which is its entire character sequence. The 247 * anchoring and transparency of this matcher's region boundaries are 248 * unaffected. 249 * 250 * @param input 251 * The new input character sequence 252 * 253 * @return This matcher 254 */ reset(CharSequence input)255 public Matcher reset(CharSequence input) { 256 return reset(input, 0, input.length()); 257 } 258 259 /** 260 * Returns the start index of the previous match. 
261 * 262 * @return The index of the first character matched 263 * 264 * @throws IllegalStateException 265 * If no match has yet been attempted, 266 * or if the previous match operation failed 267 */ start()268 public int start() { 269 return start(0); 270 } 271 272 /** 273 * Returns the start index of the subsequence captured by the given group 274 * during the previous match operation. 275 * 276 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 277 * to right, starting at one. Group zero denotes the entire pattern, so 278 * the expression <i>m.</i><tt>start(0)</tt> is equivalent to 279 * <i>m.</i><tt>start()</tt>. </p> 280 * 281 * @param group 282 * The index of a capturing group in this matcher's pattern 283 * 284 * @return The index of the first character captured by the group, 285 * or <tt>-1</tt> if the match was successful but the group 286 * itself did not match anything 287 * 288 * @throws IllegalStateException 289 * If no match has yet been attempted, 290 * or if the previous match operation failed 291 * 292 * @throws IndexOutOfBoundsException 293 * If there is no capturing group in the pattern 294 * with the given index 295 */ start(int group)296 public int start(int group) { 297 ensureMatch(); 298 if (group < 0 || group > groupCount()) 299 throw new IndexOutOfBoundsException("No group " + group); 300 return groups[group * 2]; 301 } 302 303 /** 304 * Returns the start index of the subsequence captured by the given 305 * <a href="Pattern.html#groupname">named-capturing group</a> during the 306 * previous match operation. 
307 * 308 * @param name 309 * The name of a named-capturing group in this matcher's pattern 310 * 311 * @return The index of the first character captured by the group, 312 * or {@code -1} if the match was successful but the group 313 * itself did not match anything 314 * 315 * @throws IllegalStateException 316 * If no match has yet been attempted, 317 * or if the previous match operation failed 318 * 319 * @throws IllegalArgumentException 320 * If there is no capturing group in the pattern 321 * with the given name 322 * @since 1.8 323 */ start(String name)324 public int start(String name) { 325 return groups[getMatchedGroupIndex(name) * 2]; 326 } 327 328 /** 329 * Returns the offset after the last character matched. 330 * 331 * @return The offset after the last character matched 332 * 333 * @throws IllegalStateException 334 * If no match has yet been attempted, 335 * or if the previous match operation failed 336 */ end()337 public int end() { 338 return end(0); 339 } 340 341 /** 342 * Returns the offset after the last character of the subsequence 343 * captured by the given group during the previous match operation. 344 * 345 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 346 * to right, starting at one. Group zero denotes the entire pattern, so 347 * the expression <i>m.</i><tt>end(0)</tt> is equivalent to 348 * <i>m.</i><tt>end()</tt>. 
</p> 349 * 350 * @param group 351 * The index of a capturing group in this matcher's pattern 352 * 353 * @return The offset after the last character captured by the group, 354 * or <tt>-1</tt> if the match was successful 355 * but the group itself did not match anything 356 * 357 * @throws IllegalStateException 358 * If no match has yet been attempted, 359 * or if the previous match operation failed 360 * 361 * @throws IndexOutOfBoundsException 362 * If there is no capturing group in the pattern 363 * with the given index 364 */ end(int group)365 public int end(int group) { 366 ensureMatch(); 367 if (group < 0 || group > groupCount()) 368 throw new IndexOutOfBoundsException("No group " + group); 369 return groups[group * 2 + 1]; 370 } 371 372 /** 373 * Returns the offset after the last character of the subsequence 374 * captured by the given <a href="Pattern.html#groupname">named-capturing 375 * group</a> during the previous match operation. 376 * 377 * @param name 378 * The name of a named-capturing group in this matcher's pattern 379 * 380 * @return The offset after the last character captured by the group, 381 * or {@code -1} if the match was successful 382 * but the group itself did not match anything 383 * 384 * @throws IllegalStateException 385 * If no match has yet been attempted, 386 * or if the previous match operation failed 387 * 388 * @throws IllegalArgumentException 389 * If there is no capturing group in the pattern 390 * with the given name 391 * @since 1.8 392 */ end(String name)393 public int end(String name) { 394 return groups[getMatchedGroupIndex(name) * 2 + 1]; 395 } 396 397 /** 398 * Returns the input subsequence matched by the previous match. 399 * 400 * <p> For a matcher <i>m</i> with input sequence <i>s</i>, 401 * the expressions <i>m.</i><tt>group()</tt> and 402 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt> <i>m.</i><tt>end())</tt> 403 * are equivalent. 
</p> 404 * 405 * <p> Note that some patterns, for example <tt>a*</tt>, match the empty 406 * string. This method will return the empty string when the pattern 407 * successfully matches the empty string in the input. </p> 408 * 409 * @return The (possibly empty) subsequence matched by the previous match, 410 * in string form 411 * 412 * @throws IllegalStateException 413 * If no match has yet been attempted, 414 * or if the previous match operation failed 415 */ group()416 public String group() { 417 return group(0); 418 } 419 420 /** 421 * Returns the input subsequence captured by the given group during the 422 * previous match operation. 423 * 424 * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index 425 * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and 426 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt> <i>m.</i><tt>end(</tt><i>g</i><tt>))</tt> 427 * are equivalent. </p> 428 * 429 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 430 * to right, starting at one. Group zero denotes the entire pattern, so 431 * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>. 432 * </p> 433 * 434 * <p> If the match was successful but the group specified failed to match 435 * any part of the input sequence, then <tt>null</tt> is returned. Note 436 * that some groups, for example <tt>(a*)</tt>, match the empty string. 437 * This method will return the empty string when such a group successfully 438 * matches the empty string in the input. 
</p> 439 * 440 * @param group 441 * The index of a capturing group in this matcher's pattern 442 * 443 * @return The (possibly empty) subsequence captured by the group 444 * during the previous match, or <tt>null</tt> if the group 445 * failed to match part of the input 446 * 447 * @throws IllegalStateException 448 * If no match has yet been attempted, 449 * or if the previous match operation failed 450 * 451 * @throws IndexOutOfBoundsException 452 * If there is no capturing group in the pattern 453 * with the given index 454 */ group(int group)455 public String group(int group) { 456 ensureMatch(); 457 if (group < 0 || group > groupCount()) 458 throw new IndexOutOfBoundsException("No group " + group); 459 if ((groups[group*2] == -1) || (groups[group*2+1] == -1)) 460 return null; 461 return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString(); 462 } 463 464 /** 465 * Returns the input subsequence captured by the given 466 * <a href="Pattern.html#groupname">named-capturing group</a> during the previous 467 * match operation. 468 * 469 * <p> If the match was successful but the group specified failed to match 470 * any part of the input sequence, then <tt>null</tt> is returned. Note 471 * that some groups, for example <tt>(a*)</tt>, match the empty string. 472 * This method will return the empty string when such a group successfully 473 * matches the empty string in the input. 
</p> 474 * 475 * @param name 476 * The name of a named-capturing group in this matcher's pattern 477 * 478 * @return The (possibly empty) subsequence captured by the named group 479 * during the previous match, or <tt>null</tt> if the group 480 * failed to match part of the input 481 * 482 * @throws IllegalStateException 483 * If no match has yet been attempted, 484 * or if the previous match operation failed 485 * 486 * @throws IllegalArgumentException 487 * If there is no capturing group in the pattern 488 * with the given name 489 * @since 1.7 490 */ group(String name)491 public String group(String name) { 492 int group = getMatchedGroupIndex(name); 493 if ((groups[group*2] == -1) || (groups[group*2+1] == -1)) 494 return null; 495 return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString(); 496 } 497 498 /** 499 * Returns the number of capturing groups in this matcher's pattern. 500 * 501 * <p> Group zero denotes the entire pattern by convention. It is not 502 * included in this count. 503 * 504 * <p> Any non-negative integer smaller than or equal to the value 505 * returned by this method is guaranteed to be a valid group index for 506 * this matcher. </p> 507 * 508 * @return The number of capturing groups in this matcher's pattern 509 */ groupCount()510 public int groupCount() { 511 synchronized (this) { 512 return nativeMatcher.groupCount(); 513 } 514 } 515 516 /** 517 * Attempts to match the entire region against the pattern. 518 * 519 * <p> If the match succeeds then more information can be obtained via the 520 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. 
</p> 521 * 522 * @return <tt>true</tt> if, and only if, the entire region sequence 523 * matches this matcher's pattern 524 */ matches()525 public boolean matches() { 526 synchronized (this) { 527 matchFound = nativeMatcher.matches(groups); 528 } 529 return matchFound; 530 } 531 532 /** 533 * Attempts to find the next subsequence of the input sequence that matches 534 * the pattern. 535 * 536 * <p> This method starts at the beginning of this matcher's region, or, if 537 * a previous invocation of the method was successful and the matcher has 538 * not since been reset, at the first character not matched by the previous 539 * match. 540 * 541 * <p> If the match succeeds then more information can be obtained via the 542 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 543 * 544 * @return <tt>true</tt> if, and only if, a subsequence of the input 545 * sequence matches this matcher's pattern 546 */ find()547 public boolean find() { 548 synchronized (this) { 549 matchFound = nativeMatcher.findNext(groups); 550 } 551 return matchFound; 552 } 553 554 /** 555 * Resets this matcher and then attempts to find the next subsequence of 556 * the input sequence that matches the pattern, starting at the specified 557 * index. 558 * 559 * <p> If the match succeeds then more information can be obtained via the 560 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent 561 * invocations of the {@link #find()} method will start at the first 562 * character not matched by this match. </p> 563 * 564 * @param start the index to start searching for a match 565 * @throws IndexOutOfBoundsException 566 * If start is less than zero or if start is greater than the 567 * length of the input sequence. 
568 * 569 * @return <tt>true</tt> if, and only if, a subsequence of the input 570 * sequence starting at the given index matches this matcher's 571 * pattern 572 */ find(int start)573 public boolean find(int start) { 574 int limit = getTextLength(); 575 if ((start < 0) || (start > limit)) 576 throw new IndexOutOfBoundsException("Illegal start index"); 577 reset(); 578 synchronized (this) { 579 matchFound = nativeMatcher.find(start, groups); 580 } 581 return matchFound; 582 } 583 584 /** 585 * Attempts to match the input sequence, starting at the beginning of the 586 * region, against the pattern. 587 * 588 * <p> Like the {@link #matches matches} method, this method always starts 589 * at the beginning of the region; unlike that method, it does not 590 * require that the entire region be matched. 591 * 592 * <p> If the match succeeds then more information can be obtained via the 593 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 594 * 595 * @return <tt>true</tt> if, and only if, a prefix of the input 596 * sequence matches this matcher's pattern 597 */ lookingAt()598 public boolean lookingAt() { 599 synchronized (this) { 600 matchFound = nativeMatcher.lookingAt(groups); 601 } 602 return matchFound; 603 } 604 605 /** 606 * Returns a literal replacement <code>String</code> for the specified 607 * <code>String</code>. 608 * 609 * This method produces a <code>String</code> that will work 610 * as a literal replacement <code>s</code> in the 611 * <code>appendReplacement</code> method of the {@link Matcher} class. 612 * The <code>String</code> produced will match the sequence of characters 613 * in <code>s</code> treated as a literal sequence. Slashes ('\') and 614 * dollar signs ('$') will be given no special meaning. 
615 * 616 * @param s The string to be literalized 617 * @return A literal string replacement 618 * @since 1.5 619 */ quoteReplacement(String s)620 public static String quoteReplacement(String s) { 621 if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1)) 622 return s; 623 StringBuilder sb = new StringBuilder(); 624 for (int i=0; i<s.length(); i++) { 625 char c = s.charAt(i); 626 if (c == '\\' || c == '$') { 627 sb.append('\\'); 628 } 629 sb.append(c); 630 } 631 return sb.toString(); 632 } 633 634 /** 635 * Implements a non-terminal append-and-replace step. 636 * 637 * <p> This method performs the following actions: </p> 638 * 639 * <ol> 640 * 641 * <li><p> It reads characters from the input sequence, starting at the 642 * append position, and appends them to the given string buffer. It 643 * stops after reading the last character preceding the previous match, 644 * that is, the character at index {@link 645 * #start()} <tt>-</tt> <tt>1</tt>. </p></li> 646 * 647 * <li><p> It appends the given replacement string to the string buffer. 648 * </p></li> 649 * 650 * <li><p> It sets the append position of this matcher to the index of 651 * the last character matched, plus one, that is, to {@link #end()}. 652 * </p></li> 653 * 654 * </ol> 655 * 656 * <p> The replacement string may contain references to subsequences 657 * captured during the previous match: Each occurrence of 658 * <tt>${</tt><i>name</i><tt>}</tt> or <tt>$</tt><i>g</i> 659 * will be replaced by the result of evaluating the corresponding 660 * {@link #group(String) group(name)} or {@link #group(int) group(g)} 661 * respectively. For <tt>$</tt><i>g</i>, 662 * the first number after the <tt>$</tt> is always treated as part of 663 * the group reference. Subsequent numbers are incorporated into g if 664 * they would form a legal group reference. Only the numerals '0' 665 * through '9' are considered as potential components of the group 666 * reference. 
If the second group matched the string <tt>"foo"</tt>, for 667 * example, then passing the replacement string <tt>"$2bar"</tt> would 668 * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar 669 * sign (<tt>$</tt>) may be included as a literal in the replacement 670 * string by preceding it with a backslash (<tt>\$</tt>). 671 * 672 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 673 * the replacement string may cause the results to be different than if it 674 * were being treated as a literal replacement string. Dollar signs may be 675 * treated as references to captured subsequences as described above, and 676 * backslashes are used to escape literal characters in the replacement 677 * string. 678 * 679 * <p> This method is intended to be used in a loop together with the 680 * {@link #appendTail appendTail} and {@link #find find} methods. The 681 * following code, for example, writes <tt>one dog two dogs in the 682 * yard</tt> to the standard-output stream: </p> 683 * 684 * <blockquote><pre> 685 * Pattern p = Pattern.compile("cat"); 686 * Matcher m = p.matcher("one cat two cats in the yard"); 687 * StringBuffer sb = new StringBuffer(); 688 * while (m.find()) { 689 * m.appendReplacement(sb, "dog"); 690 * } 691 * m.appendTail(sb); 692 * System.out.println(sb.toString());</pre></blockquote> 693 * 694 * @param sb 695 * The target string buffer 696 * 697 * @param replacement 698 * The replacement string 699 * 700 * @return This matcher 701 * 702 * @throws IllegalStateException 703 * If no match has yet been attempted, 704 * or if the previous match operation failed 705 * 706 * @throws IllegalArgumentException 707 * If the replacement string refers to a named-capturing 708 * group that does not exist in the pattern 709 * 710 * @throws IndexOutOfBoundsException 711 * If the replacement string refers to a capturing group 712 * that does not exist in the pattern 713 */ appendReplacement(StringBuffer sb, String replacement)714 
public Matcher appendReplacement(StringBuffer sb, String replacement) { 715 716 sb.append(text.substring(appendPos, start())); 717 appendEvaluated(sb, replacement); 718 appendPos = end(); 719 720 return this; 721 } 722 723 /** 724 * Internal helper method to append a given string to a given string buffer. 725 * If the string contains any references to groups, these are replaced by 726 * the corresponding group's contents. 727 * 728 * @param buffer the string buffer. 729 * @param s the string to append. 730 */ appendEvaluated(StringBuffer buffer, String s)731 private void appendEvaluated(StringBuffer buffer, String s) { 732 boolean escape = false; 733 boolean dollar = false; 734 boolean escapeNamedGroup = false; 735 int escapeNamedGroupStart = -1; 736 737 for (int i = 0; i < s.length(); i++) { 738 char c = s.charAt(i); 739 if (c == '\\' && !escape) { 740 escape = true; 741 } else if (c == '$' && !escape) { 742 dollar = true; 743 } else if (c >= '0' && c <= '9' && dollar && !escapeNamedGroup) { 744 String groupValue = group(c - '0'); 745 if (groupValue != null) { 746 buffer.append(groupValue); 747 } 748 dollar = false; 749 } else if (c == '{' && dollar) { 750 escapeNamedGroup = true; 751 escapeNamedGroupStart = i; 752 } else if (c == '}' && dollar && escapeNamedGroup) { 753 String groupValue = group(s.substring(escapeNamedGroupStart + 1, i)); 754 if (groupValue != null) { 755 buffer.append(groupValue); 756 } 757 dollar = false; 758 escapeNamedGroup = false; 759 } else if (c != '}' && dollar && escapeNamedGroup) { 760 continue; 761 } else { 762 buffer.append(c); 763 dollar = false; 764 escape = false; 765 escapeNamedGroup = false; 766 } 767 } 768 769 if (escape) { 770 throw new IllegalArgumentException("character to be escaped is missing"); 771 } 772 773 if (dollar) { 774 throw new IllegalArgumentException("Illegal group reference: group index is missing"); 775 } 776 777 if (escapeNamedGroup) { 778 throw new IllegalArgumentException("Missing ending brace '}' from 
replacement string"); 779 } 780 } 781 782 /** 783 * Implements a terminal append-and-replace step. 784 * 785 * <p> This method reads characters from the input sequence, starting at 786 * the append position, and appends them to the given string buffer. It is 787 * intended to be invoked after one or more invocations of the {@link 788 * #appendReplacement appendReplacement} method in order to copy the 789 * remainder of the input sequence. </p> 790 * 791 * @param sb 792 * The target string buffer 793 * 794 * @return The target string buffer 795 */ appendTail(StringBuffer sb)796 public StringBuffer appendTail(StringBuffer sb) { 797 if (appendPos < to) { 798 sb.append(text.substring(appendPos, to)); 799 } 800 return sb; 801 } 802 803 /** 804 * Replaces every subsequence of the input sequence that matches the 805 * pattern with the given replacement string. 806 * 807 * <p> This method first resets this matcher. It then scans the input 808 * sequence looking for matches of the pattern. Characters that are not 809 * part of any match are appended directly to the result string; each match 810 * is replaced in the result by the replacement string. The replacement 811 * string may contain references to captured subsequences as in the {@link 812 * #appendReplacement appendReplacement} method. 813 * 814 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 815 * the replacement string may cause the results to be different than if it 816 * were being treated as a literal replacement string. Dollar signs may be 817 * treated as references to captured subsequences as described above, and 818 * backslashes are used to escape literal characters in the replacement 819 * string. 820 * 821 * <p> Given the regular expression <tt>a*b</tt>, the input 822 * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string 823 * <tt>"-"</tt>, an invocation of this method on a matcher for that 824 * expression would yield the string <tt>"-foo-foo-foo-"</tt>. 
825 * 826 * <p> Invoking this method changes this matcher's state. If the matcher 827 * is to be used in further matching operations then it should first be 828 * reset. </p> 829 * 830 * @param replacement 831 * The replacement string 832 * 833 * @return The string constructed by replacing each matching subsequence 834 * by the replacement string, substituting captured subsequences 835 * as needed 836 */ replaceAll(String replacement)837 public String replaceAll(String replacement) { 838 reset(); 839 boolean result = find(); 840 if (result) { 841 StringBuffer sb = new StringBuffer(); 842 do { 843 appendReplacement(sb, replacement); 844 result = find(); 845 } while (result); 846 appendTail(sb); 847 return sb.toString(); 848 } 849 return text.toString(); 850 } 851 852 /** 853 * Replaces the first subsequence of the input sequence that matches the 854 * pattern with the given replacement string. 855 * 856 * <p> This method first resets this matcher. It then scans the input 857 * sequence looking for a match of the pattern. Characters that are not 858 * part of the match are appended directly to the result string; the match 859 * is replaced in the result by the replacement string. The replacement 860 * string may contain references to captured subsequences as in the {@link 861 * #appendReplacement appendReplacement} method. 862 * 863 * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 864 * the replacement string may cause the results to be different than if it 865 * were being treated as a literal replacement string. Dollar signs may be 866 * treated as references to captured subsequences as described above, and 867 * backslashes are used to escape literal characters in the replacement 868 * string. 
869 * 870 * <p> Given the regular expression <tt>dog</tt>, the input 871 * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string 872 * <tt>"cat"</tt>, an invocation of this method on a matcher for that 873 * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p> 874 * 875 * <p> Invoking this method changes this matcher's state. If the matcher 876 * is to be used in further matching operations then it should first be 877 * reset. </p> 878 * 879 * @param replacement 880 * The replacement string 881 * @return The string constructed by replacing the first matching 882 * subsequence by the replacement string, substituting captured 883 * subsequences as needed 884 */ replaceFirst(String replacement)885 public String replaceFirst(String replacement) { 886 if (replacement == null) 887 throw new NullPointerException("replacement"); 888 reset(); 889 if (!find()) 890 return text.toString(); 891 StringBuffer sb = new StringBuffer(); 892 appendReplacement(sb, replacement); 893 appendTail(sb); 894 return sb.toString(); 895 } 896 897 /** 898 * Sets the limits of this matcher's region. The region is the part of the 899 * input sequence that will be searched to find a match. Invoking this 900 * method resets the matcher, and then sets the region to start at the 901 * index specified by the <code>start</code> parameter and end at the 902 * index specified by the <code>end</code> parameter. 903 * 904 * <p>Depending on the transparency and anchoring being used (see 905 * {@link #useTransparentBounds useTransparentBounds} and 906 * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such 907 * as anchors may behave differently at or around the boundaries of the 908 * region. 
909 * 910 * @param start 911 * The index to start searching at (inclusive) 912 * @param end 913 * The index to end searching at (exclusive) 914 * @throws IndexOutOfBoundsException 915 * If start or end is less than zero, if 916 * start is greater than the length of the input sequence, if 917 * end is greater than the length of the input sequence, or if 918 * start is greater than end. 919 * @return this matcher 920 * @since 1.5 921 */ region(int start, int end)922 public Matcher region(int start, int end) { 923 return reset(originalInput, start, end); 924 } 925 926 /** 927 * Reports the start index of this matcher's region. The 928 * searches this matcher conducts are limited to finding matches 929 * within {@link #regionStart regionStart} (inclusive) and 930 * {@link #regionEnd regionEnd} (exclusive). 931 * 932 * @return The starting point of this matcher's region 933 * @since 1.5 934 */ regionStart()935 public int regionStart() { 936 return from; 937 } 938 939 /** 940 * Reports the end index (exclusive) of this matcher's region. 941 * The searches this matcher conducts are limited to finding matches 942 * within {@link #regionStart regionStart} (inclusive) and 943 * {@link #regionEnd regionEnd} (exclusive). 944 * 945 * @return the ending point of this matcher's region 946 * @since 1.5 947 */ regionEnd()948 public int regionEnd() { 949 return to; 950 } 951 952 /** 953 * Queries the transparency of region bounds for this matcher. 954 * 955 * <p> This method returns <tt>true</tt> if this matcher uses 956 * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i> 957 * bounds. 958 * 959 * <p> See {@link #useTransparentBounds useTransparentBounds} for a 960 * description of transparent and opaque bounds. 961 * 962 * <p> By default, a matcher uses opaque region boundaries. 963 * 964 * @return <tt>true</tt> iff this matcher is using transparent bounds, 965 * <tt>false</tt> otherwise. 
966 * @see java.util.regex.Matcher#useTransparentBounds(boolean) 967 * @since 1.5 968 */ hasTransparentBounds()969 public boolean hasTransparentBounds() { 970 return transparentBounds; 971 } 972 973 /** 974 * Sets the transparency of region bounds for this matcher. 975 * 976 * <p> Invoking this method with an argument of <tt>true</tt> will set this 977 * matcher to use <i>transparent</i> bounds. If the boolean 978 * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used. 979 * 980 * <p> Using transparent bounds, the boundaries of this 981 * matcher's region are transparent to lookahead, lookbehind, 982 * and boundary matching constructs. Those constructs can see beyond the 983 * boundaries of the region to see if a match is appropriate. 984 * 985 * <p> Using opaque bounds, the boundaries of this matcher's 986 * region are opaque to lookahead, lookbehind, and boundary matching 987 * constructs that may try to see beyond them. Those constructs cannot 988 * look past the boundaries so they will fail to match anything outside 989 * of the region. 990 * 991 * <p> By default, a matcher uses opaque bounds. 992 * 993 * @param b a boolean indicating whether to use opaque or transparent 994 * regions 995 * @return this matcher 996 * @see java.util.regex.Matcher#hasTransparentBounds 997 * @since 1.5 998 */ useTransparentBounds(boolean b)999 public Matcher useTransparentBounds(boolean b) { 1000 synchronized (this) { 1001 transparentBounds = b; 1002 nativeMatcher.useTransparentBounds(b); 1003 } 1004 return this; 1005 } 1006 1007 /** 1008 * Queries the anchoring of region bounds for this matcher. 1009 * 1010 * <p> This method returns <tt>true</tt> if this matcher uses 1011 * <i>anchoring</i> bounds, <tt>false</tt> otherwise. 1012 * 1013 * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a 1014 * description of anchoring bounds. 1015 * 1016 * <p> By default, a matcher uses anchoring region boundaries. 
1017 * 1018 * @return <tt>true</tt> iff this matcher is using anchoring bounds, 1019 * <tt>false</tt> otherwise. 1020 * @see java.util.regex.Matcher#useAnchoringBounds(boolean) 1021 * @since 1.5 1022 */ hasAnchoringBounds()1023 public boolean hasAnchoringBounds() { 1024 return anchoringBounds; 1025 } 1026 1027 /** 1028 * Sets the anchoring of region bounds for this matcher. 1029 * 1030 * <p> Invoking this method with an argument of <tt>true</tt> will set this 1031 * matcher to use <i>anchoring</i> bounds. If the boolean 1032 * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be 1033 * used. 1034 * 1035 * <p> Using anchoring bounds, the boundaries of this 1036 * matcher's region match anchors such as ^ and $. 1037 * 1038 * <p> Without anchoring bounds, the boundaries of this 1039 * matcher's region will not match anchors such as ^ and $. 1040 * 1041 * <p> By default, a matcher uses anchoring region boundaries. 1042 * 1043 * @param b a boolean indicating whether or not to use anchoring bounds. 1044 * @return this matcher 1045 * @see java.util.regex.Matcher#hasAnchoringBounds 1046 * @since 1.5 1047 */ useAnchoringBounds(boolean b)1048 public Matcher useAnchoringBounds(boolean b) { 1049 synchronized (this) { 1050 anchoringBounds = b; 1051 nativeMatcher.useAnchoringBounds(b); 1052 } 1053 return this; 1054 } 1055 1056 /** 1057 * <p>Returns the string representation of this matcher. The 1058 * string representation of a <code>Matcher</code> contains information 1059 * that may be useful for debugging. The exact format is unspecified. 
1060 * 1061 * @return The string representation of this matcher 1062 * @since 1.5 1063 */ toString()1064 public String toString() { 1065 StringBuilder sb = new StringBuilder(); 1066 sb.append("java.util.regex.Matcher"); 1067 sb.append("[pattern=" + pattern()); 1068 sb.append(" region="); 1069 sb.append(regionStart() + "," + regionEnd()); 1070 sb.append(" lastmatch="); 1071 if (matchFound && (group() != null)) { 1072 sb.append(group()); 1073 } 1074 sb.append("]"); 1075 return sb.toString(); 1076 } 1077 1078 /** 1079 * <p>Returns true if the end of input was hit by the search engine in 1080 * the last match operation performed by this matcher. 1081 * 1082 * <p>When this method returns true, then it is possible that more input 1083 * would have changed the result of the last search. 1084 * 1085 * @return true iff the end of input was hit in the last match; false 1086 * otherwise 1087 * @since 1.5 1088 */ hitEnd()1089 public boolean hitEnd() { 1090 synchronized (this) { 1091 return nativeMatcher.hitEnd(); 1092 } 1093 } 1094 1095 /** 1096 * <p>Returns true if more input could change a positive match into a 1097 * negative one. 1098 * 1099 * <p>If this method returns true, and a match was found, then more 1100 * input could cause the match to be lost. If this method returns false 1101 * and a match was found, then more input might change the match but the 1102 * match won't be lost. If a match was not found, then requireEnd has no 1103 * meaning. 1104 * 1105 * @return true iff more input could change a positive match into a 1106 * negative one. 1107 * @since 1.5 1108 */ requireEnd()1109 public boolean requireEnd() { 1110 synchronized (this) { 1111 return nativeMatcher.requireEnd(); 1112 } 1113 } 1114 1115 /** 1116 * Returns the end index of the text. 
1117 * 1118 * @return the index after the last character in the text 1119 */ getTextLength()1120 int getTextLength() { 1121 return text.length(); 1122 } 1123 1124 /** 1125 * Generates a String from this Matcher's input in the specified range. 1126 * 1127 * @param beginIndex the beginning index, inclusive 1128 * @param endIndex the ending index, exclusive 1129 * @return A String generated from this Matcher's input 1130 */ getSubSequence(int beginIndex, int endIndex)1131 CharSequence getSubSequence(int beginIndex, int endIndex) { 1132 return text.subSequence(beginIndex, endIndex); 1133 } 1134 1135 /** 1136 * Resets the Matcher. A new input sequence and a new region can be 1137 * specified. Results of a previous find get lost. The next attempt to find 1138 * an occurrence of the Pattern in the string will start at the beginning of 1139 * the region. This is the internal version of reset() to which the several 1140 * public versions delegate. 1141 * 1142 * @param input 1143 * the input sequence. 1144 * @param start 1145 * the start of the region. 1146 * @param end 1147 * the end of the region. 1148 * 1149 * @return the matcher itself. 
1150 */ reset(CharSequence input, int start, int end)1151 private Matcher reset(CharSequence input, int start, int end) { 1152 if (input == null) { 1153 throw new IllegalArgumentException("input == null"); 1154 } 1155 1156 if (start < 0 || end < 0 || start > input.length() || end > input.length() || start > end) { 1157 throw new IndexOutOfBoundsException(); 1158 } 1159 1160 this.originalInput = input; 1161 this.text = input.toString(); 1162 this.from = start; 1163 this.to = end; 1164 resetForInput(); 1165 1166 matchFound = false; 1167 appendPos = 0; 1168 1169 return this; 1170 } 1171 resetForInput()1172 private void resetForInput() { 1173 synchronized (this) { 1174 nativeMatcher.setInput(text, from, to); 1175 nativeMatcher.useAnchoringBounds(anchoringBounds); 1176 nativeMatcher.useTransparentBounds(transparentBounds); 1177 } 1178 } 1179 1180 /** 1181 * Makes sure that a successful match has been made. Is invoked internally 1182 * from various places in the class. 1183 * 1184 * @throws IllegalStateException 1185 * if no successful match has been made. 1186 */ ensureMatch()1187 private void ensureMatch() { 1188 if (!matchFound) { 1189 throw new IllegalStateException("No successful match so far"); 1190 } 1191 } 1192 getMatchedGroupIndex(String name)1193 private int getMatchedGroupIndex(String name) { 1194 ensureMatch(); 1195 int result = nativeMatcher.getMatchedGroupIndex(name); 1196 if (result < 0) { 1197 throw new IllegalArgumentException("No capturing group in the pattern " + 1198 "with the name " + name); 1199 } 1200 return result; 1201 } 1202 1203 /** 1204 * A trivial match result implementation that's based on an array of integers 1205 * representing match offsets. The array is of the form 1206 * {@code { start1, end1, start2, end2 ....}) where each consecutive pair of elements represents 1207 * the start and end of a match respectively. 
*/
static final class OffsetBasedMatchResult implements MatchResult {
    /** The text that the offsets index into. */
    private final String input;
    /** Start/end pairs; the pair for group {@code g} sits at {@code 2g} and {@code 2g + 1}. */
    private final int[] offsets;

    /**
     * Creates a result over {@code input}; the offsets array is copied.
     *
     * @param input the text the offsets refer to
     * @param offsets start/end pairs, group 0 first
     */
    OffsetBasedMatchResult(String input, int[] offsets) {
        this.input = input;
        this.offsets = offsets.clone();
    }

    @Override
    public int start() {
        return offsets[0];
    }

    @Override
    public int start(int group) {
        final int slot = 2 * group;
        return offsets[slot];
    }

    @Override
    public int end() {
        return offsets[1];
    }

    @Override
    public int end(int group) {
        final int slot = 2 * group + 1;
        return offsets[slot];
    }

    @Override
    public String group() {
        return group(0);
    }

    @Override
    public String group(int group) {
        final int from = offsets[2 * group];
        final int to = offsets[2 * group + 1];
        // An offset of -1 on either side yields null instead of a substring.
        return (from == -1 || to == -1) ? null : input.substring(from, to);
    }

    @Override
    public int groupCount() {
        // One pair per group, minus the pair for group 0.
        final int pairs = offsets.length / 2;
        return pairs - 1;
    }
}
} // closes the enclosing class