• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 2014-2016, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 package ohos.global.icu.text;
11 
12 import ohos.global.icu.text.UnicodeSet.SpanCondition;
13 import ohos.global.icu.util.OutputInt;
14 
15 /**
16  * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches.
17  * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
18  * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with
19  * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on.
20  * For the inverse, the following mapping is used:
21  * <ul>
22  * <li>{@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li>
23  * <li>{@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li>
24  * <li>{@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}</li>
25  * </ul>
26  * These are actually not complete inverses. However, the alternating works because there are no gaps.
27  * For example, with [a{ab}{bc}], you get the following behavior when scanning forward:
28  *
29  * <table border="1">
30  * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr>
31  * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr>
32  * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr>
33  * </table>
34  * <p>So here is what happens when you alternate:
35  *
36  * <table border="1">
37  * <tr><th>start</th><td>|xxxabcyyy</td></tr>
38  * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr>
39  * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr>
40  * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr>
41  * </table>
42  * <p>The entire string is traversed.
43  */
44 public class UnicodeSetSpanner {
45 
46     private final UnicodeSet unicodeSet;
47 
48     /**
49      * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class
50      * can be used with a non-frozen version to avoid the cost of freezing.
51      *
52      * @param source
53      *            the original UnicodeSet
54      */
UnicodeSetSpanner(UnicodeSet source)55     public UnicodeSetSpanner(UnicodeSet source) {
56         unicodeSet = source;
57     }
58 
59     /**
60      * Returns the UnicodeSet used for processing. It is frozen iff the original was.
61      *
62      * @return the construction set.
63      */
getUnicodeSet()64     public UnicodeSet getUnicodeSet() {
65         return unicodeSet;
66     }
67 
68 
69     /**
70      * {@inheritDoc}
71      */
72     @Override
equals(Object other)73     public boolean equals(Object other) {
74         return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet);
75     }
76 
77     /**
78      * {@inheritDoc}
79      */
80     @Override
hashCode()81     public int hashCode() {
82         return unicodeSet.hashCode();
83     }
84 
85     /**
86      * Options for replaceFrom and countIn to control how to treat each matched span.
87      * It is similar to whether one is replacing [abc] by x, or [abc]* by x.
88      */
89     public enum CountMethod {
90         /**
91          * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate
92          * set elements.
93          */
94         WHOLE_SPAN,
95         /**
96          * Use the smallest number of elements in the spanned range for counting and modification,
97          * based on the {@link UnicodeSet.SpanCondition}.
98          * If the set has no strings, this will be the same as the number of spanned code points.
99          * <p>For example, in the string "abab" with SpanCondition.SIMPLE:
100          * <ul>
101          * <li>spanning with [ab] will count four MIN_ELEMENTS.</li>
102          * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li>
103          * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li>
104          * </ul>
105          */
106         MIN_ELEMENTS,
107         // Note: could in the future have an additional option MAX_ELEMENTS
108     }
109 
110     /**
111      * Returns the number of matching characters found in a character sequence,
112      * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
113      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
114      * @param sequence
115      *            the sequence to count characters in
116      * @return the count. Zero if there are none.
117      */
countIn(CharSequence sequence)118     public int countIn(CharSequence sequence) {
119         return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
120     }
121 
122     /**
123      * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE.
124      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
125      * @param sequence
126      *            the sequence to count characters in
127      * @param countMethod
128      *            whether to treat an entire span as a match, or individual elements as matches
129      * @return the count. Zero if there are none.
130      */
countIn(CharSequence sequence, CountMethod countMethod)131     public int countIn(CharSequence sequence, CountMethod countMethod) {
132         return countIn(sequence, countMethod, SpanCondition.SIMPLE);
133     }
134 
135     /**
136      * Returns the number of matching characters found in a character sequence.
137      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
138      * @param sequence
139      *            the sequence to count characters in
140      * @param countMethod
141      *            whether to treat an entire span as a match, or individual elements as matches
142      * @param spanCondition
143      *            the spanCondition to use. SIMPLE or CONTAINED means only count the elements in the span;
144      *            NOT_CONTAINED is the reverse.
145      *            <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases.
146      * @return the count. Zero if there are none.
147      */
countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition)148     public int countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) {
149         int count = 0;
150         int start = 0;
151         SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
152                 : SpanCondition.NOT_CONTAINED;
153         final int length = sequence.length();
154         OutputInt spanCount = null;
155         while (start != length) {
156             int endOfSpan = unicodeSet.span(sequence, start, skipSpan);
157             if (endOfSpan == length) {
158                 break;
159             }
160             if (countMethod == CountMethod.WHOLE_SPAN) {
161                 start = unicodeSet.span(sequence, endOfSpan, spanCondition);
162                 count += 1;
163             } else {
164                 if (spanCount == null) {
165                     spanCount = new OutputInt();
166                 }
167                 start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount);
168                 count += spanCount.value;
169             }
170         }
171         return count;
172     }
173 
174     /**
175      * Delete all the matching spans in sequence, using SpanCondition.SIMPLE
176      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
177      * @param sequence
178      *            charsequence to replace matching spans in.
179      * @return modified string.
180      */
deleteFrom(CharSequence sequence)181     public String deleteFrom(CharSequence sequence) {
182         return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE);
183     }
184 
185     /**
186      * Delete all matching spans in sequence, according to the spanCondition.
187      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
188      * @param sequence
189      *            charsequence to replace matching spans in.
190      * @param spanCondition
191      *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED)
192      * @return modified string.
193      */
deleteFrom(CharSequence sequence, SpanCondition spanCondition)194     public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) {
195         return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition);
196     }
197 
198     /**
199      * Replace all matching spans in sequence by the replacement,
200      * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
201      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
202      * @param sequence
203      *            charsequence to replace matching spans in.
204      * @param replacement
205      *            replacement sequence. To delete, use ""
206      * @return modified string.
207      */
replaceFrom(CharSequence sequence, CharSequence replacement)208     public String replaceFrom(CharSequence sequence, CharSequence replacement) {
209         return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
210     }
211 
212     /**
213      * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE.
214      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
215      *
216      * @param sequence
217      *            charsequence to replace matching spans in.
218      * @param replacement
219      *            replacement sequence. To delete, use ""
220      * @param countMethod
221      *            whether to treat an entire span as a match, or individual elements as matches
222      * @return modified string.
223      */
replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod)224     public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod) {
225         return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE);
226     }
227 
228     /**
229      * Replace all matching spans in sequence by replacement, according to the countMethod and spanCondition.
230      * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
231      * @param sequence
232      *            charsequence to replace matching spans in.
233      * @param replacement
234      *            replacement sequence. To delete, use ""
235      * @param countMethod
236      *            whether to treat an entire span as a match, or individual elements as matches
237      * @param spanCondition
238      *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching
239      *            (NOT_CONTAINED)
240      * @return modified string.
241      */
replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod, SpanCondition spanCondition)242     public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod,
243             SpanCondition spanCondition) {
244         SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
245                 : SpanCondition.NOT_CONTAINED;
246         final boolean remove = replacement.length() == 0;
247         StringBuilder result = new StringBuilder();
248         // TODO, we can optimize this to
249         // avoid this allocation unless needed
250 
251         final int length = sequence.length();
252         OutputInt spanCount = null;
253         for (int endCopy = 0; endCopy != length;) {
254             int endModify;
255             if (countMethod == CountMethod.WHOLE_SPAN) {
256                 endModify = unicodeSet.span(sequence, endCopy, spanCondition);
257             } else {
258                 if (spanCount == null) {
259                     spanCount = new OutputInt();
260                 }
261                 endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount);
262             }
263             if (remove || endModify == 0) {
264                 // do nothing
265             } else if (countMethod == CountMethod.WHOLE_SPAN) {
266                 result.append(replacement);
267             } else {
268                 for (int i = spanCount.value; i > 0; --i) {
269                     result.append(replacement);
270                 }
271             }
272             if (endModify == length) {
273                 break;
274             }
275             endCopy = unicodeSet.span(sequence, endModify, copySpan);
276             result.append(sequence.subSequence(endModify, endCopy));
277         }
278         return result.toString();
279     }
280 
281     /**
282      * Options for the trim() method
283      */
284     public enum TrimOption {
285         /**
286          * Trim leading spans.
287          */
288         LEADING,
289         /**
290          * Trim leading and trailing spans.
291          */
292         BOTH,
293         /**
294          * Trim trailing spans.
295          */
296         TRAILING;
297     }
298 
299     /**
300      * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start and
301      * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example:
302      *
303      * <pre>
304      * {@code
305      *
306      *   new UnicodeSet("[ab]").trim("abacatbab")}
307      * </pre>
308      *
309      * ... returns {@code "cat"}.
310      * @param sequence
311      *            the sequence to trim
312      * @return a subsequence
313      */
trim(CharSequence sequence)314     public CharSequence trim(CharSequence sequence) {
315         return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE);
316     }
317 
318     /**
319      * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or
320      * end of the string, using the trimOption and SpanCondition.SIMPLE. For example:
321      *
322      * <pre>
323      * {@code
324      *
325      *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)}
326      * </pre>
327      *
328      * ... returns {@code "catbab"}.
329      *
330      * @param sequence
331      *            the sequence to trim
332      * @param trimOption
333      *            LEADING, TRAILING, or BOTH
334      * @return a subsequence
335      */
trim(CharSequence sequence, TrimOption trimOption)336     public CharSequence trim(CharSequence sequence, TrimOption trimOption) {
337         return trim(sequence, trimOption, SpanCondition.SIMPLE);
338     }
339 
340     /**
341      * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or
342      * end of the string, depending on the trimOption and spanCondition. For example:
343      *
344      * <pre>
345      * {@code
346      *
347      *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)}
348      * </pre>
349      *
350      * ... returns {@code "catbab"}.
351      *
352      * @param sequence
353      *            the sequence to trim
354      * @param trimOption
355      *            LEADING, TRAILING, or BOTH
356      * @param spanCondition
357      *            SIMPLE, CONTAINED or NOT_CONTAINED
358      * @return a subsequence
359      */
trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition)360     public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) {
361         int endLeadContained, startTrailContained;
362         final int length = sequence.length();
363         if (trimOption != TrimOption.TRAILING) {
364             endLeadContained = unicodeSet.span(sequence, spanCondition);
365             if (endLeadContained == length) {
366                 return "";
367             }
368         } else {
369             endLeadContained = 0;
370         }
371         if (trimOption != TrimOption.LEADING) {
372             startTrailContained = unicodeSet.spanBack(sequence, spanCondition);
373         } else {
374             startTrailContained = length;
375         }
376         return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence(
377                 endLeadContained, startTrailContained);
378     }
379 
380 }
381