• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 
27 package java.util.regex;
28 
29 import com.android.icu.util.regex.MatcherNative;
30 
31 /**
32  * An engine that performs match operations on a {@linkplain java.lang.CharSequence
33  * character sequence} by interpreting a {@link Pattern}.
34  *
35  * <p> A matcher is created from a pattern by invoking the pattern's {@link
36  * Pattern#matcher matcher} method.  Once created, a matcher can be used to
37  * perform three different kinds of match operations:
38  *
39  * <ul>
40  *
41  *   <li><p> The {@link #matches matches} method attempts to match the entire
42  *   input sequence against the pattern.  </p></li>
43  *
44  *   <li><p> The {@link #lookingAt lookingAt} method attempts to match the
45  *   input sequence, starting at the beginning, against the pattern.  </p></li>
46  *
47  *   <li><p> The {@link #find find} method scans the input sequence looking for
48  *   the next subsequence that matches the pattern.  </p></li>
49  *
50  * </ul>
51  *
52  * <p> Each of these methods returns a boolean indicating success or failure.
53  * More information about a successful match can be obtained by querying the
54  * state of the matcher.
55  *
56  * <p> A matcher finds matches in a subset of its input called the
57  * <i>region</i>. By default, the region contains all of the matcher's input.
58  * The region can be modified via the {@link #region region} method and queried
59  * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
60  * methods. The way that the region boundaries interact with some pattern
61  * constructs can be changed. See {@link #useAnchoringBounds
62  * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
63  * for more details.
64  *
65  * <p> This class also defines methods for replacing matched subsequences with
66  * new strings whose contents can, if desired, be computed from the match
67  * result.  The {@link #appendReplacement appendReplacement} and {@link
68  * #appendTail appendTail} methods can be used in tandem in order to collect
69  * the result into an existing string buffer, or the more convenient {@link
70  * #replaceAll replaceAll} method can be used to create a string in which every
71  * matching subsequence in the input sequence is replaced.
72  *
73  * <p> The explicit state of a matcher includes the start and end indices of
74  * the most recent successful match.  It also includes the start and end
75  * indices of the input subsequence captured by each <a
76  * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
77  * count of such subsequences.  As a convenience, methods are also provided for
78  * returning these captured subsequences in string form.
79  *
80  * <p> The explicit state of a matcher is initially undefined; attempting to
81  * query any part of it before a successful match will cause an {@link
82  * IllegalStateException} to be thrown.  The explicit state of a matcher is
83  * recomputed by every match operation.
84  *
85  * <p> The implicit state of a matcher includes the input character sequence as
86  * well as the <i>append position</i>, which is initially zero and is updated
87  * by the {@link #appendReplacement appendReplacement} method.
88  *
89  * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
90  * method or, if a new input sequence is desired, its {@link
91  * #reset(java.lang.CharSequence) reset(CharSequence)} method.  Resetting a
92  * matcher discards its explicit state information and sets the append position
93  * to zero.
94  *
95  * <p> Instances of this class are not safe for use by multiple concurrent
96  * threads. </p>
97  *
98  *
99  * @author      Mike McCloskey
100  * @author      Mark Reinhold
101  * @author      JSR-51 Expert Group
102  * @since       1.4
103  * @spec        JSR-51
104  */
105 
106 public final class Matcher implements MatchResult {
107 
108     /**
109      * The Pattern object that created this Matcher.
110      */
111     private Pattern parentPattern;
112 
113     /**
114      * Holds the offsets for the most recent match.
115      */
116     int[] groups;
117 
118     /**
119      * The range within the sequence that is to be matched (between  0
120      * and text.length()).
121      */
122     int from, to;
123 
124     /**
125      * Holds the input text.
126      */
127     String text;
128 
129     /**
130      * Reflects whether a match has been found during the most recent find
131      * operation.
132      */
133     private boolean matchFound;
134 
135     private MatcherNative nativeMatcher;
136 
137     /**
138      * The index of the last position appended in a substitution.
139      */
140     int appendPos = 0;
141 
142     /**
143      * Holds the original CharSequence for use in {@link #reset}. {@link #text} is used during
144      * matching. Note that CharSequence is mutable while String is not, so reset can cause the input
145      * to match to change.
146      */
147     private CharSequence originalInput;
148 
149     /**
150      * If transparentBounds is true then the boundaries of this
151      * matcher's region are transparent to lookahead, lookbehind,
152      * and boundary matching constructs that try to see beyond them.
153      */
154     boolean transparentBounds = false;
155 
156     /**
157      * If anchoringBounds is true then the boundaries of this
158      * matcher's region match anchors such as ^ and $.
159      */
160     boolean anchoringBounds = true;
161 
162     /**
163      * All matchers have the state used by Pattern during a match.
164      */
Matcher(Pattern parent, CharSequence text) {
    // Bind the pattern first. At this point the 'text' field is still unset, so
    // usePattern() only creates the native matcher and sizes the groups array.
    this.usePattern(parent);
    // Then attach the input sequence (the parameter, not the field).
    this.reset(text);
}
169 
170     /**
171      * Returns the pattern that is interpreted by this matcher.
172      *
173      * @return  The pattern for which this matcher was created
174      */
pattern()175     public Pattern pattern() {
176         return parentPattern;
177     }
178 
179     /**
180      * Returns the match state of this matcher as a {@link MatchResult}.
181      * The result is unaffected by subsequent operations performed upon this
182      * matcher.
183      *
184      * @return  a <code>MatchResult</code> with the state of this matcher
185      * @since 1.5
186      */
toMatchResult()187     public MatchResult toMatchResult() {
188         ensureMatch();
189         return new OffsetBasedMatchResult(text, groups);
190     }
191 
192     /**
193       * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to
194       * find matches with.
195       *
196       * <p> This method causes this matcher to lose information
197       * about the groups of the last match that occurred. The
198       * matcher's position in the input is maintained and its
199       * last append position is unaffected.</p>
200       *
201       * @param  newPattern
202       *         The new pattern used by this matcher
203       * @return  This matcher
204       * @throws  IllegalArgumentException
205       *          If newPattern is <tt>null</tt>
206       * @since 1.5
207       */
usePattern(Pattern newPattern)208     public Matcher usePattern(Pattern newPattern) {
209         if (newPattern == null)
210             throw new IllegalArgumentException("Pattern cannot be null");
211 
212         synchronized (this) {
213             // may throw
214             nativeMatcher = MatcherNative.create(newPattern.nativePattern);
215         }
216         parentPattern = newPattern;
217 
218         if (text != null) {
219             resetForInput();
220         }
221 
222         groups = new int[(groupCount() + 1) * 2];
223         matchFound = false;
224         return this;
225     }
226 
227     /**
228      * Resets this matcher.
229      *
230      * <p> Resetting a matcher discards all of its explicit state information
231      * and sets its append position to zero. The matcher's region is set to the
232      * default region, which is its entire character sequence. The anchoring
233      * and transparency of this matcher's region boundaries are unaffected.
234      *
235      * @return  This matcher
236      */
reset()237     public Matcher reset() {
238         return reset(originalInput, 0, originalInput.length());
239     }
240 
241     /**
242      * Resets this matcher with a new input sequence.
243      *
244      * <p> Resetting a matcher discards all of its explicit state information
245      * and sets its append position to zero.  The matcher's region is set to
246      * the default region, which is its entire character sequence.  The
247      * anchoring and transparency of this matcher's region boundaries are
248      * unaffected.
249      *
250      * @param  input
251      *         The new input character sequence
252      *
253      * @return  This matcher
254      */
reset(CharSequence input)255     public Matcher reset(CharSequence input) {
256         return reset(input, 0, input.length());
257     }
258 
259     /**
260      * Returns the start index of the previous match.
261      *
262      * @return  The index of the first character matched
263      *
264      * @throws  IllegalStateException
265      *          If no match has yet been attempted,
266      *          or if the previous match operation failed
267      */
start()268     public int start() {
269         return start(0);
270     }
271 
272     /**
273      * Returns the start index of the subsequence captured by the given group
274      * during the previous match operation.
275      *
276      * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
277      * to right, starting at one.  Group zero denotes the entire pattern, so
278      * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
279      * <i>m.</i><tt>start()</tt>.  </p>
280      *
281      * @param  group
282      *         The index of a capturing group in this matcher's pattern
283      *
284      * @return  The index of the first character captured by the group,
285      *          or <tt>-1</tt> if the match was successful but the group
286      *          itself did not match anything
287      *
288      * @throws  IllegalStateException
289      *          If no match has yet been attempted,
290      *          or if the previous match operation failed
291      *
292      * @throws  IndexOutOfBoundsException
293      *          If there is no capturing group in the pattern
294      *          with the given index
295      */
start(int group)296     public int start(int group) {
297         ensureMatch();
298         if (group < 0 || group > groupCount())
299             throw new IndexOutOfBoundsException("No group " + group);
300         return groups[group * 2];
301     }
302 
303     /**
304      * Returns the start index of the subsequence captured by the given
305      * <a href="Pattern.html#groupname">named-capturing group</a> during the
306      * previous match operation.
307      *
308      * @param  name
309      *         The name of a named-capturing group in this matcher's pattern
310      *
311      * @return  The index of the first character captured by the group,
312      *          or {@code -1} if the match was successful but the group
313      *          itself did not match anything
314      *
315      * @throws  IllegalStateException
316      *          If no match has yet been attempted,
317      *          or if the previous match operation failed
318      *
319      * @throws  IllegalArgumentException
320      *          If there is no capturing group in the pattern
321      *          with the given name
322      * @since 1.8
323      */
start(String name)324     public int start(String name) {
325         return groups[getMatchedGroupIndex(name) * 2];
326     }
327 
328     /**
329      * Returns the offset after the last character matched.
330      *
331      * @return  The offset after the last character matched
332      *
333      * @throws  IllegalStateException
334      *          If no match has yet been attempted,
335      *          or if the previous match operation failed
336      */
end()337     public int end() {
338         return end(0);
339     }
340 
341     /**
342      * Returns the offset after the last character of the subsequence
343      * captured by the given group during the previous match operation.
344      *
345      * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
346      * to right, starting at one.  Group zero denotes the entire pattern, so
347      * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
348      * <i>m.</i><tt>end()</tt>.  </p>
349      *
350      * @param  group
351      *         The index of a capturing group in this matcher's pattern
352      *
353      * @return  The offset after the last character captured by the group,
354      *          or <tt>-1</tt> if the match was successful
355      *          but the group itself did not match anything
356      *
357      * @throws  IllegalStateException
358      *          If no match has yet been attempted,
359      *          or if the previous match operation failed
360      *
361      * @throws  IndexOutOfBoundsException
362      *          If there is no capturing group in the pattern
363      *          with the given index
364      */
end(int group)365     public int end(int group) {
366         ensureMatch();
367         if (group < 0 || group > groupCount())
368             throw new IndexOutOfBoundsException("No group " + group);
369         return groups[group * 2 + 1];
370     }
371 
372     /**
373      * Returns the offset after the last character of the subsequence
374      * captured by the given <a href="Pattern.html#groupname">named-capturing
375      * group</a> during the previous match operation.
376      *
377      * @param  name
378      *         The name of a named-capturing group in this matcher's pattern
379      *
380      * @return  The offset after the last character captured by the group,
381      *          or {@code -1} if the match was successful
382      *          but the group itself did not match anything
383      *
384      * @throws  IllegalStateException
385      *          If no match has yet been attempted,
386      *          or if the previous match operation failed
387      *
388      * @throws  IllegalArgumentException
389      *          If there is no capturing group in the pattern
390      *          with the given name
391      * @since 1.8
392      */
end(String name)393     public int end(String name) {
394         return groups[getMatchedGroupIndex(name) * 2 + 1];
395     }
396 
397     /**
398      * Returns the input subsequence matched by the previous match.
399      *
400      * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
401      * the expressions <i>m.</i><tt>group()</tt> and
402      * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt>&nbsp;<i>m.</i><tt>end())</tt>
403      * are equivalent.  </p>
404      *
405      * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
406      * string.  This method will return the empty string when the pattern
407      * successfully matches the empty string in the input.  </p>
408      *
409      * @return The (possibly empty) subsequence matched by the previous match,
410      *         in string form
411      *
412      * @throws  IllegalStateException
413      *          If no match has yet been attempted,
414      *          or if the previous match operation failed
415      */
group()416     public String group() {
417         return group(0);
418     }
419 
420     /**
421      * Returns the input subsequence captured by the given group during the
422      * previous match operation.
423      *
424      * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
425      * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
426      * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt>&nbsp;<i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
427      * are equivalent.  </p>
428      *
429      * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
430      * to right, starting at one.  Group zero denotes the entire pattern, so
431      * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
432      * </p>
433      *
434      * <p> If the match was successful but the group specified failed to match
435      * any part of the input sequence, then <tt>null</tt> is returned. Note
436      * that some groups, for example <tt>(a*)</tt>, match the empty string.
437      * This method will return the empty string when such a group successfully
438      * matches the empty string in the input.  </p>
439      *
440      * @param  group
441      *         The index of a capturing group in this matcher's pattern
442      *
443      * @return  The (possibly empty) subsequence captured by the group
444      *          during the previous match, or <tt>null</tt> if the group
445      *          failed to match part of the input
446      *
447      * @throws  IllegalStateException
448      *          If no match has yet been attempted,
449      *          or if the previous match operation failed
450      *
451      * @throws  IndexOutOfBoundsException
452      *          If there is no capturing group in the pattern
453      *          with the given index
454      */
group(int group)455     public String group(int group) {
456         ensureMatch();
457         if (group < 0 || group > groupCount())
458             throw new IndexOutOfBoundsException("No group " + group);
459         if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
460             return null;
461         return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
462     }
463 
464     /**
465      * Returns the input subsequence captured by the given
466      * <a href="Pattern.html#groupname">named-capturing group</a> during the previous
467      * match operation.
468      *
469      * <p> If the match was successful but the group specified failed to match
470      * any part of the input sequence, then <tt>null</tt> is returned. Note
471      * that some groups, for example <tt>(a*)</tt>, match the empty string.
472      * This method will return the empty string when such a group successfully
473      * matches the empty string in the input.  </p>
474      *
475      * @param  name
476      *         The name of a named-capturing group in this matcher's pattern
477      *
478      * @return  The (possibly empty) subsequence captured by the named group
479      *          during the previous match, or <tt>null</tt> if the group
480      *          failed to match part of the input
481      *
482      * @throws  IllegalStateException
483      *          If no match has yet been attempted,
484      *          or if the previous match operation failed
485      *
486      * @throws  IllegalArgumentException
487      *          If there is no capturing group in the pattern
488      *          with the given name
489      * @since 1.7
490      */
group(String name)491     public String group(String name) {
492         int group = getMatchedGroupIndex(name);
493         if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
494             return null;
495         return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
496     }
497 
498     /**
499      * Returns the number of capturing groups in this matcher's pattern.
500      *
501      * <p> Group zero denotes the entire pattern by convention. It is not
502      * included in this count.
503      *
504      * <p> Any non-negative integer smaller than or equal to the value
505      * returned by this method is guaranteed to be a valid group index for
506      * this matcher.  </p>
507      *
508      * @return The number of capturing groups in this matcher's pattern
509      */
groupCount()510     public int groupCount() {
511         synchronized (this) {
512             return nativeMatcher.groupCount();
513         }
514     }
515 
516     /**
517      * Attempts to match the entire region against the pattern.
518      *
519      * <p> If the match succeeds then more information can be obtained via the
520      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
521      *
522      * @return  <tt>true</tt> if, and only if, the entire region sequence
523      *          matches this matcher's pattern
524      */
matches()525     public boolean matches() {
526         synchronized (this) {
527             matchFound = nativeMatcher.matches(groups);
528         }
529         return matchFound;
530     }
531 
532     /**
533      * Attempts to find the next subsequence of the input sequence that matches
534      * the pattern.
535      *
536      * <p> This method starts at the beginning of this matcher's region, or, if
537      * a previous invocation of the method was successful and the matcher has
538      * not since been reset, at the first character not matched by the previous
539      * match.
540      *
541      * <p> If the match succeeds then more information can be obtained via the
542      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
543      *
544      * @return  <tt>true</tt> if, and only if, a subsequence of the input
545      *          sequence matches this matcher's pattern
546      */
find()547     public boolean find() {
548         synchronized (this) {
549             matchFound = nativeMatcher.findNext(groups);
550         }
551         return matchFound;
552     }
553 
554     /**
555      * Resets this matcher and then attempts to find the next subsequence of
556      * the input sequence that matches the pattern, starting at the specified
557      * index.
558      *
559      * <p> If the match succeeds then more information can be obtained via the
560      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent
561      * invocations of the {@link #find()} method will start at the first
562      * character not matched by this match.  </p>
563      *
564      * @param start the index to start searching for a match
565      * @throws  IndexOutOfBoundsException
566      *          If start is less than zero or if start is greater than the
567      *          length of the input sequence.
568      *
569      * @return  <tt>true</tt> if, and only if, a subsequence of the input
570      *          sequence starting at the given index matches this matcher's
571      *          pattern
572      */
find(int start)573     public boolean find(int start) {
574         int limit = getTextLength();
575         if ((start < 0) || (start > limit))
576             throw new IndexOutOfBoundsException("Illegal start index");
577         reset();
578         synchronized (this) {
579             matchFound = nativeMatcher.find(start, groups);
580         }
581         return matchFound;
582     }
583 
584     /**
585      * Attempts to match the input sequence, starting at the beginning of the
586      * region, against the pattern.
587      *
588      * <p> Like the {@link #matches matches} method, this method always starts
589      * at the beginning of the region; unlike that method, it does not
590      * require that the entire region be matched.
591      *
592      * <p> If the match succeeds then more information can be obtained via the
593      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
594      *
595      * @return  <tt>true</tt> if, and only if, a prefix of the input
596      *          sequence matches this matcher's pattern
597      */
lookingAt()598     public boolean lookingAt() {
599         synchronized (this) {
600             matchFound = nativeMatcher.lookingAt(groups);
601         }
602         return matchFound;
603     }
604 
605     /**
606      * Returns a literal replacement <code>String</code> for the specified
607      * <code>String</code>.
608      *
609      * This method produces a <code>String</code> that will work
610      * as a literal replacement <code>s</code> in the
611      * <code>appendReplacement</code> method of the {@link Matcher} class.
612      * The <code>String</code> produced will match the sequence of characters
613      * in <code>s</code> treated as a literal sequence. Slashes ('\') and
614      * dollar signs ('$') will be given no special meaning.
615      *
616      * @param  s The string to be literalized
617      * @return  A literal string replacement
618      * @since 1.5
619      */
quoteReplacement(String s)620     public static String quoteReplacement(String s) {
621         if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
622             return s;
623         StringBuilder sb = new StringBuilder();
624         for (int i=0; i<s.length(); i++) {
625             char c = s.charAt(i);
626             if (c == '\\' || c == '$') {
627                 sb.append('\\');
628             }
629             sb.append(c);
630         }
631         return sb.toString();
632     }
633 
634     /**
635      * Implements a non-terminal append-and-replace step.
636      *
637      * <p> This method performs the following actions: </p>
638      *
639      * <ol>
640      *
641      *   <li><p> It reads characters from the input sequence, starting at the
642      *   append position, and appends them to the given string buffer.  It
643      *   stops after reading the last character preceding the previous match,
644      *   that is, the character at index {@link
645      *   #start()}&nbsp;<tt>-</tt>&nbsp;<tt>1</tt>.  </p></li>
646      *
647      *   <li><p> It appends the given replacement string to the string buffer.
648      *   </p></li>
649      *
650      *   <li><p> It sets the append position of this matcher to the index of
651      *   the last character matched, plus one, that is, to {@link #end()}.
652      *   </p></li>
653      *
654      * </ol>
655      *
656      * <p> The replacement string may contain references to subsequences
657      * captured during the previous match: Each occurrence of
658      * <tt>${</tt><i>name</i><tt>}</tt> or <tt>$</tt><i>g</i>
659      * will be replaced by the result of evaluating the corresponding
660      * {@link #group(String) group(name)} or {@link #group(int) group(g)}
661      * respectively. For  <tt>$</tt><i>g</i>,
662      * the first number after the <tt>$</tt> is always treated as part of
663      * the group reference. Subsequent numbers are incorporated into g if
664      * they would form a legal group reference. Only the numerals '0'
665      * through '9' are considered as potential components of the group
666      * reference. If the second group matched the string <tt>"foo"</tt>, for
667      * example, then passing the replacement string <tt>"$2bar"</tt> would
668      * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
669      * sign (<tt>$</tt>) may be included as a literal in the replacement
670      * string by preceding it with a backslash (<tt>\$</tt>).
671      *
672      * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
673      * the replacement string may cause the results to be different than if it
674      * were being treated as a literal replacement string. Dollar signs may be
675      * treated as references to captured subsequences as described above, and
676      * backslashes are used to escape literal characters in the replacement
677      * string.
678      *
679      * <p> This method is intended to be used in a loop together with the
680      * {@link #appendTail appendTail} and {@link #find find} methods.  The
681      * following code, for example, writes <tt>one dog two dogs in the
682      * yard</tt> to the standard-output stream: </p>
683      *
684      * <blockquote><pre>
685      * Pattern p = Pattern.compile("cat");
686      * Matcher m = p.matcher("one cat two cats in the yard");
687      * StringBuffer sb = new StringBuffer();
688      * while (m.find()) {
689      *     m.appendReplacement(sb, "dog");
690      * }
691      * m.appendTail(sb);
692      * System.out.println(sb.toString());</pre></blockquote>
693      *
694      * @param  sb
695      *         The target string buffer
696      *
697      * @param  replacement
698      *         The replacement string
699      *
700      * @return  This matcher
701      *
702      * @throws  IllegalStateException
703      *          If no match has yet been attempted,
704      *          or if the previous match operation failed
705      *
706      * @throws  IllegalArgumentException
707      *          If the replacement string refers to a named-capturing
708      *          group that does not exist in the pattern
709      *
710      * @throws  IndexOutOfBoundsException
711      *          If the replacement string refers to a capturing group
712      *          that does not exist in the pattern
713      */
appendReplacement(StringBuffer sb, String replacement)714     public Matcher appendReplacement(StringBuffer sb, String replacement) {
715 
716         sb.append(text.substring(appendPos, start()));
717         appendEvaluated(sb, replacement);
718         appendPos = end();
719 
720         return this;
721     }
722 
723     /**
724      * Internal helper method to append a given string to a given string buffer.
725      * If the string contains any references to groups, these are replaced by
726      * the corresponding group's contents.
727      *
728      * @param buffer the string buffer.
729      * @param s the string to append.
730      */
appendEvaluated(StringBuffer buffer, String s)731     private void appendEvaluated(StringBuffer buffer, String s) {
732         boolean escape = false;
733         boolean dollar = false;
734         boolean escapeNamedGroup = false;
735         int escapeNamedGroupStart = -1;
736 
737         for (int i = 0; i < s.length(); i++) {
738             char c = s.charAt(i);
739             if (c == '\\' && !escape) {
740                 escape = true;
741             } else if (c == '$' && !escape) {
742                 dollar = true;
743             } else if (c >= '0' && c <= '9' && dollar && !escapeNamedGroup) {
744                 String groupValue = group(c - '0');
745                 if (groupValue != null) {
746                     buffer.append(groupValue);
747                 }
748                 dollar = false;
749             } else if (c == '{' && dollar) {
750                 escapeNamedGroup = true;
751                 escapeNamedGroupStart = i;
752             } else if (c == '}' && dollar && escapeNamedGroup) {
753                 String groupValue = group(s.substring(escapeNamedGroupStart + 1, i));
754                 if (groupValue != null) {
755                     buffer.append(groupValue);
756                 }
757                 dollar = false;
758                 escapeNamedGroup = false;
759             } else if (c != '}' && dollar && escapeNamedGroup) {
760                 continue;
761             } else {
762                 buffer.append(c);
763                 dollar = false;
764                 escape = false;
765                 escapeNamedGroup = false;
766             }
767         }
768 
769         if (escape) {
770             throw new IllegalArgumentException("character to be escaped is missing");
771         }
772 
773         if (dollar) {
774             throw new IllegalArgumentException("Illegal group reference: group index is missing");
775         }
776 
777         if (escapeNamedGroup) {
778             throw new IllegalArgumentException("Missing ending brace '}' from replacement string");
779         }
780     }
781 
782     /**
783      * Implements a terminal append-and-replace step.
784      *
785      * <p> This method reads characters from the input sequence, starting at
786      * the append position, and appends them to the given string buffer.  It is
787      * intended to be invoked after one or more invocations of the {@link
788      * #appendReplacement appendReplacement} method in order to copy the
789      * remainder of the input sequence.  </p>
790      *
791      * @param  sb
792      *         The target string buffer
793      *
794      * @return  The target string buffer
795      */
appendTail(StringBuffer sb)796     public StringBuffer appendTail(StringBuffer sb) {
797         if (appendPos < to) {
798             sb.append(text.substring(appendPos, to));
799         }
800         return sb;
801     }
802 
803     /**
804      * Replaces every subsequence of the input sequence that matches the
805      * pattern with the given replacement string.
806      *
807      * <p> This method first resets this matcher.  It then scans the input
808      * sequence looking for matches of the pattern.  Characters that are not
809      * part of any match are appended directly to the result string; each match
810      * is replaced in the result by the replacement string.  The replacement
811      * string may contain references to captured subsequences as in the {@link
812      * #appendReplacement appendReplacement} method.
813      *
814      * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
815      * the replacement string may cause the results to be different than if it
816      * were being treated as a literal replacement string. Dollar signs may be
817      * treated as references to captured subsequences as described above, and
818      * backslashes are used to escape literal characters in the replacement
819      * string.
820      *
821      * <p> Given the regular expression <tt>a*b</tt>, the input
822      * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
823      * <tt>"-"</tt>, an invocation of this method on a matcher for that
824      * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
825      *
826      * <p> Invoking this method changes this matcher's state.  If the matcher
827      * is to be used in further matching operations then it should first be
828      * reset.  </p>
829      *
830      * @param  replacement
831      *         The replacement string
832      *
833      * @return  The string constructed by replacing each matching subsequence
834      *          by the replacement string, substituting captured subsequences
835      *          as needed
836      */
replaceAll(String replacement)837     public String replaceAll(String replacement) {
838         reset();
839         boolean result = find();
840         if (result) {
841             StringBuffer sb = new StringBuffer();
842             do {
843                 appendReplacement(sb, replacement);
844                 result = find();
845             } while (result);
846             appendTail(sb);
847             return sb.toString();
848         }
849         return text.toString();
850     }
851 
852     /**
853      * Replaces the first subsequence of the input sequence that matches the
854      * pattern with the given replacement string.
855      *
856      * <p> This method first resets this matcher.  It then scans the input
857      * sequence looking for a match of the pattern.  Characters that are not
858      * part of the match are appended directly to the result string; the match
859      * is replaced in the result by the replacement string.  The replacement
860      * string may contain references to captured subsequences as in the {@link
861      * #appendReplacement appendReplacement} method.
862      *
863      * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
864      * the replacement string may cause the results to be different than if it
865      * were being treated as a literal replacement string. Dollar signs may be
866      * treated as references to captured subsequences as described above, and
867      * backslashes are used to escape literal characters in the replacement
868      * string.
869      *
870      * <p> Given the regular expression <tt>dog</tt>, the input
871      * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
872      * <tt>"cat"</tt>, an invocation of this method on a matcher for that
873      * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>.  </p>
874      *
875      * <p> Invoking this method changes this matcher's state.  If the matcher
876      * is to be used in further matching operations then it should first be
877      * reset.  </p>
878      *
879      * @param  replacement
880      *         The replacement string
881      * @return  The string constructed by replacing the first matching
882      *          subsequence by the replacement string, substituting captured
883      *          subsequences as needed
884      */
replaceFirst(String replacement)885     public String replaceFirst(String replacement) {
886         if (replacement == null)
887             throw new NullPointerException("replacement");
888         reset();
889         if (!find())
890             return text.toString();
891         StringBuffer sb = new StringBuffer();
892         appendReplacement(sb, replacement);
893         appendTail(sb);
894         return sb.toString();
895     }
896 
897     /**
898      * Sets the limits of this matcher's region. The region is the part of the
899      * input sequence that will be searched to find a match. Invoking this
900      * method resets the matcher, and then sets the region to start at the
901      * index specified by the <code>start</code> parameter and end at the
902      * index specified by the <code>end</code> parameter.
903      *
904      * <p>Depending on the transparency and anchoring being used (see
905      * {@link #useTransparentBounds useTransparentBounds} and
906      * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
907      * as anchors may behave differently at or around the boundaries of the
908      * region.
909      *
910      * @param  start
911      *         The index to start searching at (inclusive)
912      * @param  end
913      *         The index to end searching at (exclusive)
914      * @throws  IndexOutOfBoundsException
915      *          If start or end is less than zero, if
916      *          start is greater than the length of the input sequence, if
917      *          end is greater than the length of the input sequence, or if
918      *          start is greater than end.
919      * @return  this matcher
920      * @since 1.5
921      */
region(int start, int end)922     public Matcher region(int start, int end) {
923         return reset(originalInput, start, end);
924     }
925 
926     /**
927      * Reports the start index of this matcher's region. The
928      * searches this matcher conducts are limited to finding matches
929      * within {@link #regionStart regionStart} (inclusive) and
930      * {@link #regionEnd regionEnd} (exclusive).
931      *
932      * @return  The starting point of this matcher's region
933      * @since 1.5
934      */
regionStart()935     public int regionStart() {
936         return from;
937     }
938 
939     /**
940      * Reports the end index (exclusive) of this matcher's region.
941      * The searches this matcher conducts are limited to finding matches
942      * within {@link #regionStart regionStart} (inclusive) and
943      * {@link #regionEnd regionEnd} (exclusive).
944      *
945      * @return  the ending point of this matcher's region
946      * @since 1.5
947      */
regionEnd()948     public int regionEnd() {
949         return to;
950     }
951 
952     /**
953      * Queries the transparency of region bounds for this matcher.
954      *
955      * <p> This method returns <tt>true</tt> if this matcher uses
956      * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i>
957      * bounds.
958      *
959      * <p> See {@link #useTransparentBounds useTransparentBounds} for a
960      * description of transparent and opaque bounds.
961      *
962      * <p> By default, a matcher uses opaque region boundaries.
963      *
964      * @return <tt>true</tt> iff this matcher is using transparent bounds,
965      *         <tt>false</tt> otherwise.
966      * @see java.util.regex.Matcher#useTransparentBounds(boolean)
967      * @since 1.5
968      */
hasTransparentBounds()969     public boolean hasTransparentBounds() {
970         return transparentBounds;
971     }
972 
973     /**
974      * Sets the transparency of region bounds for this matcher.
975      *
976      * <p> Invoking this method with an argument of <tt>true</tt> will set this
977      * matcher to use <i>transparent</i> bounds. If the boolean
978      * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used.
979      *
980      * <p> Using transparent bounds, the boundaries of this
981      * matcher's region are transparent to lookahead, lookbehind,
982      * and boundary matching constructs. Those constructs can see beyond the
983      * boundaries of the region to see if a match is appropriate.
984      *
985      * <p> Using opaque bounds, the boundaries of this matcher's
986      * region are opaque to lookahead, lookbehind, and boundary matching
987      * constructs that may try to see beyond them. Those constructs cannot
988      * look past the boundaries so they will fail to match anything outside
989      * of the region.
990      *
991      * <p> By default, a matcher uses opaque bounds.
992      *
993      * @param  b a boolean indicating whether to use opaque or transparent
994      *         regions
995      * @return this matcher
996      * @see java.util.regex.Matcher#hasTransparentBounds
997      * @since 1.5
998      */
useTransparentBounds(boolean b)999     public Matcher useTransparentBounds(boolean b) {
1000         synchronized (this) {
1001             transparentBounds = b;
1002             nativeMatcher.useTransparentBounds(b);
1003         }
1004         return this;
1005     }
1006 
1007     /**
1008      * Queries the anchoring of region bounds for this matcher.
1009      *
1010      * <p> This method returns <tt>true</tt> if this matcher uses
1011      * <i>anchoring</i> bounds, <tt>false</tt> otherwise.
1012      *
1013      * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a
1014      * description of anchoring bounds.
1015      *
1016      * <p> By default, a matcher uses anchoring region boundaries.
1017      *
1018      * @return <tt>true</tt> iff this matcher is using anchoring bounds,
1019      *         <tt>false</tt> otherwise.
1020      * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
1021      * @since 1.5
1022      */
hasAnchoringBounds()1023     public boolean hasAnchoringBounds() {
1024         return anchoringBounds;
1025     }
1026 
1027     /**
1028      * Sets the anchoring of region bounds for this matcher.
1029      *
1030      * <p> Invoking this method with an argument of <tt>true</tt> will set this
1031      * matcher to use <i>anchoring</i> bounds. If the boolean
1032      * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be
1033      * used.
1034      *
1035      * <p> Using anchoring bounds, the boundaries of this
1036      * matcher's region match anchors such as ^ and $.
1037      *
1038      * <p> Without anchoring bounds, the boundaries of this
1039      * matcher's region will not match anchors such as ^ and $.
1040      *
1041      * <p> By default, a matcher uses anchoring region boundaries.
1042      *
1043      * @param  b a boolean indicating whether or not to use anchoring bounds.
1044      * @return this matcher
1045      * @see java.util.regex.Matcher#hasAnchoringBounds
1046      * @since 1.5
1047      */
useAnchoringBounds(boolean b)1048     public Matcher useAnchoringBounds(boolean b) {
1049         synchronized (this) {
1050             anchoringBounds = b;
1051             nativeMatcher.useAnchoringBounds(b);
1052         }
1053         return this;
1054     }
1055 
1056     /**
1057      * <p>Returns the string representation of this matcher. The
1058      * string representation of a <code>Matcher</code> contains information
1059      * that may be useful for debugging. The exact format is unspecified.
1060      *
1061      * @return  The string representation of this matcher
1062      * @since 1.5
1063      */
toString()1064     public String toString() {
1065         StringBuilder sb = new StringBuilder();
1066         sb.append("java.util.regex.Matcher");
1067         sb.append("[pattern=" + pattern());
1068         sb.append(" region=");
1069         sb.append(regionStart() + "," + regionEnd());
1070         sb.append(" lastmatch=");
1071         if (matchFound && (group() != null)) {
1072             sb.append(group());
1073         }
1074         sb.append("]");
1075         return sb.toString();
1076     }
1077 
1078     /**
1079      * <p>Returns true if the end of input was hit by the search engine in
1080      * the last match operation performed by this matcher.
1081      *
1082      * <p>When this method returns true, then it is possible that more input
1083      * would have changed the result of the last search.
1084      *
1085      * @return  true iff the end of input was hit in the last match; false
1086      *          otherwise
1087      * @since 1.5
1088      */
hitEnd()1089     public boolean hitEnd() {
1090         synchronized (this) {
1091             return nativeMatcher.hitEnd();
1092         }
1093     }
1094 
1095     /**
1096      * <p>Returns true if more input could change a positive match into a
1097      * negative one.
1098      *
1099      * <p>If this method returns true, and a match was found, then more
1100      * input could cause the match to be lost. If this method returns false
1101      * and a match was found, then more input might change the match but the
1102      * match won't be lost. If a match was not found, then requireEnd has no
1103      * meaning.
1104      *
1105      * @return  true iff more input could change a positive match into a
1106      *          negative one.
1107      * @since 1.5
1108      */
requireEnd()1109     public boolean requireEnd() {
1110         synchronized (this) {
1111             return nativeMatcher.requireEnd();
1112         }
1113     }
1114 
1115     /**
1116      * Returns the end index of the text.
1117      *
1118      * @return the index after the last character in the text
1119      */
getTextLength()1120     int getTextLength() {
1121         return text.length();
1122     }
1123 
1124     /**
1125      * Generates a String from this Matcher's input in the specified range.
1126      *
1127      * @param  beginIndex   the beginning index, inclusive
1128      * @param  endIndex     the ending index, exclusive
1129      * @return A String generated from this Matcher's input
1130      */
getSubSequence(int beginIndex, int endIndex)1131     CharSequence getSubSequence(int beginIndex, int endIndex) {
1132         return text.subSequence(beginIndex, endIndex);
1133     }
1134 
1135     /**
1136      * Resets the Matcher. A new input sequence and a new region can be
1137      * specified. Results of a previous find get lost. The next attempt to find
1138      * an occurrence of the Pattern in the string will start at the beginning of
1139      * the region. This is the internal version of reset() to which the several
1140      * public versions delegate.
1141      *
1142      * @param input
1143      *            the input sequence.
1144      * @param start
1145      *            the start of the region.
1146      * @param end
1147      *            the end of the region.
1148      *
1149      * @return the matcher itself.
1150      */
reset(CharSequence input, int start, int end)1151     private Matcher reset(CharSequence input, int start, int end) {
1152         if (input == null) {
1153             throw new IllegalArgumentException("input == null");
1154         }
1155 
1156         if (start < 0 || end < 0 || start > input.length() || end > input.length() || start > end) {
1157             throw new IndexOutOfBoundsException();
1158         }
1159 
1160         this.originalInput = input;
1161         this.text = input.toString();
1162         this.from = start;
1163         this.to = end;
1164         resetForInput();
1165 
1166         matchFound = false;
1167         appendPos = 0;
1168 
1169         return this;
1170     }
1171 
resetForInput()1172     private void resetForInput() {
1173         synchronized (this) {
1174             nativeMatcher.setInput(text, from, to);
1175             nativeMatcher.useAnchoringBounds(anchoringBounds);
1176             nativeMatcher.useTransparentBounds(transparentBounds);
1177         }
1178     }
1179 
1180     /**
1181      * Makes sure that a successful match has been made. Is invoked internally
1182      * from various places in the class.
1183      *
1184      * @throws IllegalStateException
1185      *             if no successful match has been made.
1186      */
ensureMatch()1187     private void ensureMatch() {
1188         if (!matchFound) {
1189             throw new IllegalStateException("No successful match so far");
1190         }
1191     }
1192 
getMatchedGroupIndex(String name)1193     private int getMatchedGroupIndex(String name) {
1194         ensureMatch();
1195         int result = nativeMatcher.getMatchedGroupIndex(name);
1196         if (result < 0) {
1197             throw new IllegalArgumentException("No capturing group in the pattern " +
1198                                                "with the name " + name);
1199         }
1200         return result;
1201     }
1202 
1203     /**
1204      * A trivial match result implementation that's based on an array of integers
1205      * representing match offsets. The array is of the form
1206      * {@code { start1, end1, start2, end2 ....}) where each consecutive pair of elements represents
1207      * the start and end of a match respectively.
1208      */
1209     static final class OffsetBasedMatchResult implements MatchResult {
1210         private final String input;
1211         private final int[] offsets;
1212 
OffsetBasedMatchResult(String input, int[] offsets)1213         OffsetBasedMatchResult(String input, int[] offsets) {
1214             this.input = input;
1215             this.offsets = offsets.clone();
1216         }
1217 
1218         @Override
start()1219         public int start() {
1220             return start(0);
1221         }
1222 
1223         @Override
start(int group)1224         public int start(int group) {
1225             return offsets[2 * group];
1226         }
1227 
1228         @Override
end()1229         public int end() {
1230             return end(0);
1231         }
1232 
1233         @Override
end(int group)1234         public int end(int group) {
1235             return offsets[2 * group + 1];
1236         }
1237 
1238         @Override
group()1239         public String group() {
1240             return group(0);
1241         }
1242 
1243         @Override
group(int group)1244         public String group(int group) {
1245             final int start = start(group);
1246             final int end = end(group);
1247             if (start == -1 || end == -1) {
1248                 return null;
1249             }
1250 
1251             return input.substring(start, end);
1252         }
1253 
1254         @Override
groupCount()1255         public int groupCount() {
1256             return (offsets.length / 2) - 1;
1257         }
1258     }
1259 }
1260