• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2014-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.text;
10 
11 import com.ibm.icu.text.UnicodeSet.SpanCondition;
12 import com.ibm.icu.util.OutputInt;
13 
14 /**
15  * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches.
16  * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
17  * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with
18  * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on.
19  * For the inverse, the following mapping is used:
20  * <ul>
21  * <li>{@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li>
22  * <li>{@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li>
23  * <li>{@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}</li>
24  * </ul>
25  * These are actually not complete inverses. However, the alternating works because there are no gaps.
26  * For example, with [a{ab}{bc}], you get the following behavior when scanning forward:
27  *
28  * <table border="1">
29  * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr>
30  * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr>
31  * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr>
32  * </table>
33  * <p>So here is what happens when you alternate:
34  *
35  * <table border="1">
36  * <tr><th>start</th><td>|xxxabcyyy</td></tr>
37  * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr>
38  * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr>
39  * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr>
40  * </table>
41  * <p>The entire string is traversed.
42  *
43  * @stable ICU 54
44  */
45 public class UnicodeSetSpanner {
46 
47     private final UnicodeSet unicodeSet;
48 
49     /**
50      * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class
51      * can be used with a non-frozen version to avoid the cost of freezing.
52      *
53      * @param source
54      *            the original UnicodeSet
55      *
56      * @stable ICU 54
57      */
UnicodeSetSpanner(UnicodeSet source)58     public UnicodeSetSpanner(UnicodeSet source) {
59         unicodeSet = source;
60     }
61 
62     /**
63      * Returns the UnicodeSet used for processing. It is frozen iff the original was.
64      *
65      * @return the construction set.
66      *
67      * @stable ICU 54
68      */
getUnicodeSet()69     public UnicodeSet getUnicodeSet() {
70         return unicodeSet;
71     }
72 
73 
74     /**
75      * {@inheritDoc}
76      *
77      * @stable ICU 54
78      */
79     @Override
equals(Object other)80     public boolean equals(Object other) {
81         return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet);
82     }
83 
84     /**
85      * {@inheritDoc}
86      *
87      * @stable ICU 54
88      */
89     @Override
hashCode()90     public int hashCode() {
91         return unicodeSet.hashCode();
92     }
93 
94     /**
95      * Options for replaceFrom and countIn to control how to treat each matched span.
96      * It is similar to whether one is replacing [abc] by x, or [abc]* by x.
97      *
98      * @stable ICU 54
99      */
100     public enum CountMethod {
101         /**
102          * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate
103          * set elements.
104          *
105          * @stable ICU 54
106          */
107         WHOLE_SPAN,
108         /**
109          * Use the smallest number of elements in the spanned range for counting and modification,
110          * based on the {@link UnicodeSet.SpanCondition}.
111          * If the set has no strings, this will be the same as the number of spanned code points.
112          * <p>For example, in the string "abab" with SpanCondition.SIMPLE:
113          * <ul>
114          * <li>spanning with [ab] will count four MIN_ELEMENTS.</li>
115          * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li>
116          * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li>
117          * </ul>
118          *
119          * @stable ICU 54
120          */
121         MIN_ELEMENTS,
122         // Note: could in the future have an additional option MAX_ELEMENTS
123     }
124 
125     /**
126      * Returns the number of matching characters found in a character sequence,
127      * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
128      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
129      * @param sequence
130      *            the sequence to count characters in
131      * @return the count. Zero if there are none.
132      *
133      * @stable ICU 54
134      */
countIn(CharSequence sequence)135     public int countIn(CharSequence sequence) {
136         return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
137     }
138 
139     /**
140      * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE.
141      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
142      * @param sequence
143      *            the sequence to count characters in
144      * @param countMethod
145      *            whether to treat an entire span as a match, or individual elements as matches
146      * @return the count. Zero if there are none.
147      *
148      * @stable ICU 54
149      */
countIn(CharSequence sequence, CountMethod countMethod)150     public int countIn(CharSequence sequence, CountMethod countMethod) {
151         return countIn(sequence, countMethod, SpanCondition.SIMPLE);
152     }
153 
154     /**
155      * Returns the number of matching characters found in a character sequence.
156      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
157      * @param sequence
158      *            the sequence to count characters in
159      * @param countMethod
160      *            whether to treat an entire span as a match, or individual elements as matches
161      * @param spanCondition
162      *            the spanCondition to use. SIMPLE or CONTAINED means only count the elements in the span;
163      *            NOT_CONTAINED is the reverse.
164      *            <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases.
165      * @return the count. Zero if there are none.
166      *
167      * @stable ICU 54
168      */
countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition)169     public int countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) {
170         int count = 0;
171         int start = 0;
172         SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
173                 : SpanCondition.NOT_CONTAINED;
174         final int length = sequence.length();
175         OutputInt spanCount = null;
176         while (start != length) {
177             int endOfSpan = unicodeSet.span(sequence, start, skipSpan);
178             if (endOfSpan == length) {
179                 break;
180             }
181             if (countMethod == CountMethod.WHOLE_SPAN) {
182                 start = unicodeSet.span(sequence, endOfSpan, spanCondition);
183                 count += 1;
184             } else {
185                 if (spanCount == null) {
186                     spanCount = new OutputInt();
187                 }
188                 start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount);
189                 count += spanCount.value;
190             }
191         }
192         return count;
193     }
194 
195     /**
196      * Delete all the matching spans in sequence, using SpanCondition.SIMPLE
197      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
198      * @param sequence
199      *            charsequence to replace matching spans in.
200      * @return modified string.
201      *
202      * @stable ICU 54
203      */
deleteFrom(CharSequence sequence)204     public String deleteFrom(CharSequence sequence) {
205         return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE);
206     }
207 
208     /**
209      * Delete all matching spans in sequence, according to the spanCondition.
210      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
211      * @param sequence
212      *            charsequence to replace matching spans in.
213      * @param spanCondition
214      *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED)
215      * @return modified string.
216      *
217      * @stable ICU 54
218      */
deleteFrom(CharSequence sequence, SpanCondition spanCondition)219     public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) {
220         return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition);
221     }
222 
223     /**
224      * Replace all matching spans in sequence by the replacement,
225      * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
226      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
227      * @param sequence
228      *            charsequence to replace matching spans in.
229      * @param replacement
230      *            replacement sequence. To delete, use ""
231      * @return modified string.
232      *
233      * @stable ICU 54
234      */
replaceFrom(CharSequence sequence, CharSequence replacement)235     public String replaceFrom(CharSequence sequence, CharSequence replacement) {
236         return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
237     }
238 
239     /**
240      * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE.
241      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
242      *
243      * @param sequence
244      *            charsequence to replace matching spans in.
245      * @param replacement
246      *            replacement sequence. To delete, use ""
247      * @param countMethod
248      *            whether to treat an entire span as a match, or individual elements as matches
249      * @return modified string.
250      *
251      * @stable ICU 54
252      */
replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod)253     public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod) {
254         return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE);
255     }
256 
257     /**
258      * Replace all matching spans in sequence by replacement, according to the countMethod and spanCondition.
259      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
260      * @param sequence
261      *            charsequence to replace matching spans in.
262      * @param replacement
263      *            replacement sequence. To delete, use ""
264      * @param countMethod
265      *            whether to treat an entire span as a match, or individual elements as matches
266      * @param spanCondition
267      *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching
268      *            (NOT_CONTAINED)
269      * @return modified string.
270      *
271      * @stable ICU 54
272      */
replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod, SpanCondition spanCondition)273     public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod,
274             SpanCondition spanCondition) {
275         SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
276                 : SpanCondition.NOT_CONTAINED;
277         final boolean remove = replacement.length() == 0;
278         StringBuilder result = new StringBuilder();
279         // TODO, we can optimize this to
280         // avoid this allocation unless needed
281 
282         final int length = sequence.length();
283         OutputInt spanCount = null;
284         for (int endCopy = 0; endCopy != length;) {
285             int endModify;
286             if (countMethod == CountMethod.WHOLE_SPAN) {
287                 endModify = unicodeSet.span(sequence, endCopy, spanCondition);
288             } else {
289                 if (spanCount == null) {
290                     spanCount = new OutputInt();
291                 }
292                 endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount);
293             }
294             if (remove || endModify == 0) {
295                 // do nothing
296             } else if (countMethod == CountMethod.WHOLE_SPAN) {
297                 result.append(replacement);
298             } else {
299                 for (int i = spanCount.value; i > 0; --i) {
300                     result.append(replacement);
301                 }
302             }
303             if (endModify == length) {
304                 break;
305             }
306             endCopy = unicodeSet.span(sequence, endModify, copySpan);
307             result.append(sequence.subSequence(endModify, endCopy));
308         }
309         return result.toString();
310     }
311 
312     /**
313      * Options for the trim() method
314      *
315      * @stable ICU 54
316      */
317     public enum TrimOption {
318         /**
319          * Trim leading spans.
320          *
321          * @stable ICU 54
322          */
323         LEADING,
324         /**
325          * Trim leading and trailing spans.
326          *
327          * @stable ICU 54
328          */
329         BOTH,
330         /**
331          * Trim trailing spans.
332          *
333          * @stable ICU 54
334          */
335         TRAILING;
336     }
337 
338     /**
339      * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start and
340      * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example:
341      *
342      * <pre>
343      * {@code
344      *
345      *   new UnicodeSet("[ab]").trim("abacatbab")}
346      * </pre>
347      *
348      * ... returns {@code "cat"}.
349      * @param sequence
350      *            the sequence to trim
351      * @return a subsequence
352      *
353      * @stable ICU 54
354      */
trim(CharSequence sequence)355     public CharSequence trim(CharSequence sequence) {
356         return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE);
357     }
358 
359     /**
360      * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or
361      * end of the string, using the trimOption and SpanCondition.SIMPLE. For example:
362      *
363      * <pre>
364      * {@code
365      *
366      *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)}
367      * </pre>
368      *
369      * ... returns {@code "catbab"}.
370      *
371      * @param sequence
372      *            the sequence to trim
373      * @param trimOption
374      *            LEADING, TRAILING, or BOTH
375      * @return a subsequence
376      *
377      * @stable ICU 54
378      */
trim(CharSequence sequence, TrimOption trimOption)379     public CharSequence trim(CharSequence sequence, TrimOption trimOption) {
380         return trim(sequence, trimOption, SpanCondition.SIMPLE);
381     }
382 
383     /**
384      * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or
385      * end of the string, depending on the trimOption and spanCondition. For example:
386      *
387      * <pre>
388      * {@code
389      *
390      *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)}
391      * </pre>
392      *
393      * ... returns {@code "catbab"}.
394      *
395      * @param sequence
396      *            the sequence to trim
397      * @param trimOption
398      *            LEADING, TRAILING, or BOTH
399      * @param spanCondition
400      *            SIMPLE, CONTAINED or NOT_CONTAINED
401      * @return a subsequence
402      *
403      * @stable ICU 54
404      */
trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition)405     public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) {
406         int endLeadContained, startTrailContained;
407         final int length = sequence.length();
408         if (trimOption != TrimOption.TRAILING) {
409             endLeadContained = unicodeSet.span(sequence, spanCondition);
410             if (endLeadContained == length) {
411                 return "";
412             }
413         } else {
414             endLeadContained = 0;
415         }
416         if (trimOption != TrimOption.LEADING) {
417             startTrailContained = unicodeSet.spanBack(sequence, spanCondition);
418         } else {
419             startTrailContained = length;
420         }
421         return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence(
422                 endLeadContained, startTrailContained);
423     }
424 
425 }
426