• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 1996-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.text;
10 
11 import java.io.IOException;
12 import java.text.ParsePosition;
13 import java.util.ArrayList;
14 import java.util.Arrays;
15 import java.util.Collection;
16 import java.util.Collections;
17 import java.util.Iterator;
18 import java.util.NoSuchElementException;
19 import java.util.SortedSet;
20 import java.util.TreeSet;
21 
22 import com.ibm.icu.impl.BMPSet;
23 import com.ibm.icu.impl.CharacterPropertiesImpl;
24 import com.ibm.icu.impl.PatternProps;
25 import com.ibm.icu.impl.RuleCharacterIterator;
26 import com.ibm.icu.impl.SortedSetRelation;
27 import com.ibm.icu.impl.StringRange;
28 import com.ibm.icu.impl.UCaseProps;
29 import com.ibm.icu.impl.UPropertyAliases;
30 import com.ibm.icu.impl.UnicodeSetStringSpan;
31 import com.ibm.icu.impl.Utility;
32 import com.ibm.icu.lang.CharSequences;
33 import com.ibm.icu.lang.CharacterProperties;
34 import com.ibm.icu.lang.UCharacter;
35 import com.ibm.icu.lang.UProperty;
36 import com.ibm.icu.lang.UScript;
37 import com.ibm.icu.util.Freezable;
38 import com.ibm.icu.util.ICUUncheckedIOException;
39 import com.ibm.icu.util.OutputInt;
40 import com.ibm.icu.util.ULocale;
41 import com.ibm.icu.util.VersionInfo;
42 
43 /**
44  * A mutable set of Unicode characters and multicharacter strings.
45  * Objects of this class represent <em>character classes</em> used
46  * in regular expressions. A character specifies a subset of Unicode
47  * code points.  Legal code points are U+0000 to U+10FFFF, inclusive.
48  *
49  * Note: method freeze() will not only make the set immutable, but
50  * also makes important methods much higher performance:
51  * contains(c), containsNone(...), span(...), spanBack(...) etc.
52  * After the object is frozen, any subsequent call that wants to change
53  * the object will throw UnsupportedOperationException.
54  *
55  * <p>The UnicodeSet class is not designed to be subclassed.
56  *
57  * <p><code>UnicodeSet</code> supports two APIs. The first is the
58  * <em>operand</em> API that allows the caller to modify the value of
59  * a <code>UnicodeSet</code> object. It conforms to Java 2's
60  * <code>java.util.Set</code> interface, although
61  * <code>UnicodeSet</code> does not actually implement that
62  * interface. All methods of <code>Set</code> are supported, with the
63  * modification that they take a character range or single character
64  * instead of an <code>Object</code>, and they take a
65  * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
66  * operand API may be thought of in terms of boolean logic: a boolean
67  * OR is implemented by <code>add</code>, a boolean AND is implemented
68  * by <code>retain</code>, a boolean XOR is implemented by
69  * <code>complement</code> taking an argument, and a boolean NOT is
70  * implemented by <code>complement</code> with no argument.  In terms
71  * of traditional set theory function names, <code>add</code> is a
72  * union, <code>retain</code> is an intersection, <code>remove</code>
73  * is an asymmetric difference, and <code>complement</code> with no
74  * argument is a set complement with respect to the superset range
75  * <code>MIN_VALUE-MAX_VALUE</code>
76  *
77  * <p>The second API is the
78  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
79  * <code>java.text.Format</code>-derived classes.  Unlike the
80  * methods that add characters, add categories, and control the logic
81  * of the set, the method <code>applyPattern()</code> sets all
82  * attributes of a <code>UnicodeSet</code> at once, based on a
83  * string pattern.
84  *
85  * <p><b>Pattern syntax</b></p>
86  *
87  * Patterns are accepted by the constructors and the
88  * <code>applyPattern()</code> methods and returned by the
89  * <code>toPattern()</code> method.  These patterns follow a syntax
90  * similar to that employed by version 8 regular expression character
91  * classes.  Here are some simple examples:
92  *
93  * <blockquote>
94  *   <table>
95  *     <tr style="vertical-align: top">
96  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td>
97  *       <td style="vertical-align: top;">No characters</td>
98  *     </tr><tr style="vertical-align: top">
99  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td>
100  *       <td style="vertical-align: top;">The character 'a'</td>
101  *     </tr><tr style="vertical-align: top">
102  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td>
103  *       <td style="vertical-align: top;">The characters 'a' and 'e'</td>
104  *     </tr>
105  *     <tr>
106  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td>
107  *       <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code
108  *       point order</td>
109  *     </tr>
110  *     <tr>
111  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td>
112  *       <td style="vertical-align: top;">The character U+4E01</td>
113  *     </tr>
114  *     <tr>
115  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td>
116  *       <td style="vertical-align: top;">The character 'a' and the multicharacter strings &quot;ab&quot; and
117  *       &quot;ac&quot;</td>
118  *     </tr>
119  *     <tr>
120  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td>
121  *       <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td>
122  *     </tr>
123  *   </table>
124  * </blockquote>
125  *
126  * Any character may be preceded by a backslash in order to remove any special
127  * meaning.  White space characters, as defined by the Unicode Pattern_White_Space property, are
128  * ignored, unless they are escaped.
129  *
130  * <p>Property patterns specify a set of characters having a certain
131  * property as defined by the Unicode standard.  Both the POSIX-like
132  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
133  * complete list of supported property patterns, see the User's Guide
134  * for UnicodeSet at
135  * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
136  * http://www.icu-project.org/userguide/unicodeSet.html</a>.
137  * Actual determination of property data is defined by the underlying
138  * Unicode database as implemented by UCharacter.
139  *
140  * <p>Patterns specify individual characters, ranges of characters, and
141  * Unicode property sets.  When elements are concatenated, they
142  * specify their union.  To complement a set, place a '^' immediately
143  * after the opening '['.  Property patterns are inverted by modifying
144  * their delimiters; "[:^foo]" and "\P{foo}".  In any other location,
145  * '^' has no special meaning.
146  *
147  * <p>Ranges are indicated by placing two a '-' between two
148  * characters, as in "a-z".  This specifies the range of all
149  * characters from the left to the right, in Unicode order.  If the
150  * left character is greater than or equal to the
151  * right character it is a syntax error.  If a '-' occurs as the first
152  * character after the opening '[' or '[^', or if it occurs as the
153  * last character before the closing ']', then it is taken as a
154  * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
155  * set of three characters, 'a', 'b', and '-'.
156  *
157  * <p>Sets may be intersected using the '&amp;' operator or the asymmetric
158  * set difference may be taken using the '-' operator, for example,
159  * "[[:L:]&amp;[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
160  * with values less than 4096.  Operators ('&amp;' and '|') have equal
161  * precedence and bind left-to-right.  Thus
162  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
163  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
164  * difference; intersection is commutative.
165  *
166  * <table>
167  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a'
168  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a'
169  * through 'z' and all letters in between, in Unicode order
170  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing
171  * all characters but 'a' through 'z',
172  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
173  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
174  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
175  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&amp;[<em>pat2</em>]]</code>
176  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
177  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
178  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
179  * <em>pat2</em>
180  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code>
181  * <td>The set of characters having the specified
182  * Unicode property; in
183  * this case, Unicode uppercase letters
184  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code>
185  * <td>The set of characters <em>not</em> having the given
186  * Unicode property
187  * </table>
188  *
189  * <p><b>Formal syntax</b></p>
190  *
191  * <blockquote>
192  *   <table>
193  *     <tr style="vertical-align: top">
194  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern :=&nbsp; </code></td>
195  *       <td style="vertical-align: top;"><code>('[' '^'? item* ']') |
196  *       property</code></td>
197  *     </tr>
198  *     <tr style="vertical-align: top">
199  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item :=&nbsp; </code></td>
200  *       <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br>
201  *       </code></td>
202  *     </tr>
203  *     <tr style="vertical-align: top">
204  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr :=&nbsp; </code></td>
205  *       <td style="vertical-align: top;"><code>pattern | pattern-expr pattern |
206  *       pattern-expr op pattern<br>
207  *       </code></td>
208  *     </tr>
209  *     <tr style="vertical-align: top">
210  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op :=&nbsp; </code></td>
211  *       <td style="vertical-align: top;"><code>'&amp;' | '-'<br>
212  *       </code></td>
213  *     </tr>
214  *     <tr style="vertical-align: top">
215  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special :=&nbsp; </code></td>
216  *       <td style="vertical-align: top;"><code>'[' | ']' | '-'<br>
217  *       </code></td>
218  *     </tr>
219  *     <tr style="vertical-align: top">
220  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char :=&nbsp; </code></td>
221  *       <td style="vertical-align: top;"><em>any character that is not</em><code> special<br>
222  *       | ('\\' </code><em>any character</em><code>)<br>
223  *       | ('&#92;u' hex hex hex hex)<br>
224  *       </code></td>
225  *     </tr>
226  *     <tr style="vertical-align: top">
227  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex :=&nbsp; </code></td>
228  *       <td style="vertical-align: top;"><em>any character for which
229  *       </em><code>Character.digit(c, 16)</code><em>
230  *       returns a non-negative result</em></td>
231  *     </tr>
232  *     <tr>
233  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property :=&nbsp; </code></td>
234  *       <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td>
235  *     </tr>
236  *   </table>
237  *   <br>
238  *   <table border="1">
239  *     <tr>
240  *       <td>Legend: <table>
241  *         <tr>
242  *           <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td>
243  *           <td style="width: 20; vertical-align: top;">&nbsp; </td>
244  *           <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td>
245  *         </tr>
246  *         <tr>
247  *           <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td>
248  *           <td style="vertical-align: top;"></td>
249  *           <td style="vertical-align: top;">zero or one instance of <code>a</code><br>
250  *           </td>
251  *         </tr>
252  *         <tr>
253  *           <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td>
254  *           <td style="vertical-align: top;"></td>
255  *           <td style="vertical-align: top;">one or more instances of <code>a</code><br>
256  *           </td>
257  *         </tr>
258  *         <tr>
259  *           <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td>
260  *           <td style="vertical-align: top;"></td>
261  *           <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br>
262  *           </td>
263  *         </tr>
264  *         <tr>
265  *           <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td>
266  *           <td style="vertical-align: top;"></td>
267  *           <td style="vertical-align: top;">the literal string between the quotes </td>
268  *         </tr>
269  *       </table>
270  *       </td>
271  *     </tr>
272  *   </table>
273  * </blockquote>
274  * <p>To iterate over contents of UnicodeSet, the following are available:
275  * <ul><li>{@link #ranges()} to iterate through the ranges</li>
276  * <li>{@link #strings()} to iterate through the strings</li>
277  * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
278  * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
279  * </ul>
280  * All of the above can be used in <b>for</b> loops.
281  * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
282  * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
283  *
284  * @author Alan Liu
285  * @stable ICU 2.0
286  * @see UnicodeSetIterator
287  * @see UnicodeSetSpanner
288  */
289 public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> {
290     private static final SortedSet<String> EMPTY_STRINGS =
291             Collections.unmodifiableSortedSet(new TreeSet<String>());
292 
293     /**
294      * Constant for the empty set.
295      * @stable ICU 4.8
296      */
297     public static final UnicodeSet EMPTY = new UnicodeSet().freeze();
298     /**
299      * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.)
300      * @stable ICU 4.8
301      */
302     public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze();
303 
304     private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the the function processing
305 
306     private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
307     private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
308     // 110000 for codepoints
309 
310     /**
311      * Enough for sets with few ranges.
312      * For example, White_Space has 10 ranges, list length 21.
313      */
314     private static final int INITIAL_CAPACITY = 25;
315 
316     /** Max list [0, 1, 2, ..., max code point, HIGH] */
317     private static final int MAX_LENGTH = HIGH + 1;
318 
319     /**
320      * Minimum value that can be stored in a UnicodeSet.
321      * @stable ICU 2.0
322      */
323     public static final int MIN_VALUE = LOW;
324 
325     /**
326      * Maximum value that can be stored in a UnicodeSet.
327      * @stable ICU 2.0
328      */
329     public static final int MAX_VALUE = HIGH - 1;
330 
331     private int len;      // length used; list may be longer to minimize reallocs
332     private int[] list;   // MUST be terminated with HIGH
333     private int[] rangeList; // internal buffer
334     private int[] buffer; // internal buffer
335 
336     // is not private so that UnicodeSetIterator can get access
337     SortedSet<String> strings = EMPTY_STRINGS;
338 
339     /**
340      * The pattern representation of this set.  This may not be the
341      * most economical pattern.  It is the pattern supplied to
342      * applyPattern(), with variables substituted and whitespace
343      * removed.  For sets constructed without applyPattern(), or
344      * modified using the non-pattern API, this string will be null,
345      * indicating that toPattern() must generate a pattern
346      * representation from the inversion list.
347      */
348     private String pat = null;
349 
350     // Special property set IDs
351     private static final String ANY_ID   = "ANY";   // [\u0000-\U0010FFFF]
352     private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
353     private static final String ASSIGNED = "Assigned"; // [:^Cn:]
354 
355     private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
356     private volatile UnicodeSetStringSpan stringSpan;
357     //----------------------------------------------------------------
358     // Public API
359     //----------------------------------------------------------------
360 
361     /**
362      * Constructs an empty set.
363      * @stable ICU 2.0
364      */
UnicodeSet()365     public UnicodeSet() {
366         list = new int[INITIAL_CAPACITY];
367         list[0] = HIGH;
368         len = 1;
369     }
370 
371     /**
372      * Constructs a copy of an existing set.
373      * @stable ICU 2.0
374      */
UnicodeSet(UnicodeSet other)375     public UnicodeSet(UnicodeSet other) {
376         set(other);
377     }
378 
379     /**
380      * Constructs a set containing the given range. If <code>end &gt;
381      * start</code> then an empty set is created.
382      *
383      * @param start first character, inclusive, of range
384      * @param end last character, inclusive, of range
385      * @stable ICU 2.0
386      */
UnicodeSet(int start, int end)387     public UnicodeSet(int start, int end) {
388         this();
389         add(start, end);
390     }
391 
392     /**
393      * Quickly constructs a set from a set of ranges &lt;s0, e0, s1, e1, s2, e2, ..., sn, en&gt;.
394      * There must be an even number of integers, and they must be all greater than zero,
395      * all less than or equal to Character.MAX_CODE_POINT.
396      * In each pair (..., si, ei, ...) it must be true that si &lt;= ei
397      * Between adjacent pairs (...ei, sj...), it must be true that ei+1 &lt; sj
398      * @param pairs pairs of character representing ranges
399      * @stable ICU 4.4
400      */
UnicodeSet(int... pairs)401     public UnicodeSet(int... pairs) {
402         if ((pairs.length & 1) != 0) {
403             throw new IllegalArgumentException("Must have even number of integers");
404         }
405         list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set.
406         len = list.length;
407         int last = -1; // used to ensure that the results are monotonically increasing.
408         int i = 0;
409         while (i < pairs.length) {
410             int start = pairs[i];
411             if (last >= start) {
412                 throw new IllegalArgumentException("Must be monotonically increasing.");
413             }
414             list[i++] = start;
415             int limit = pairs[i] + 1;
416             if (start >= limit) {
417                 throw new IllegalArgumentException("Must be monotonically increasing.");
418             }
419             list[i++] = last = limit;
420         }
421         list[i] = HIGH; // terminate
422     }
423 
424     /**
425      * Constructs a set from the given pattern.  See the class description
426      * for the syntax of the pattern language.  Whitespace is ignored.
427      * @param pattern a string specifying what characters are in the set
428      * @exception java.lang.IllegalArgumentException if the pattern contains
429      * a syntax error.
430      * @stable ICU 2.0
431      */
UnicodeSet(String pattern)432     public UnicodeSet(String pattern) {
433         this();
434         applyPattern(pattern, null, null, IGNORE_SPACE);
435     }
436 
437     /**
438      * Constructs a set from the given pattern.  See the class description
439      * for the syntax of the pattern language.
440      * @param pattern a string specifying what characters are in the set
441      * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters
442      * @exception java.lang.IllegalArgumentException if the pattern contains
443      * a syntax error.
444      * @stable ICU 2.0
445      */
UnicodeSet(String pattern, boolean ignoreWhitespace)446     public UnicodeSet(String pattern, boolean ignoreWhitespace) {
447         this();
448         applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
449     }
450 
451     /**
452      * Constructs a set from the given pattern.  See the class description
453      * for the syntax of the pattern language.
454      * @param pattern a string specifying what characters are in the set
455      * @param options a bitmask indicating which options to apply.
456      * Valid options are IGNORE_SPACE and CASE.
457      * @exception java.lang.IllegalArgumentException if the pattern contains
458      * a syntax error.
459      * @stable ICU 3.8
460      */
UnicodeSet(String pattern, int options)461     public UnicodeSet(String pattern, int options) {
462         this();
463         applyPattern(pattern, null, null, options);
464     }
465 
466     /**
467      * Constructs a set from the given pattern.  See the class description
468      * for the syntax of the pattern language.
469      * @param pattern a string specifying what characters are in the set
470      * @param pos on input, the position in pattern at which to start parsing.
471      * On output, the position after the last character parsed.
472      * @param symbols a symbol table mapping variables to char[] arrays
473      * and chars to UnicodeSets
474      * @exception java.lang.IllegalArgumentException if the pattern
475      * contains a syntax error.
476      * @stable ICU 2.0
477      */
UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols)478     public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) {
479         this();
480         applyPattern(pattern, pos, symbols, IGNORE_SPACE);
481     }
482 
483     /**
484      * Constructs a set from the given pattern.  See the class description
485      * for the syntax of the pattern language.
486      * @param pattern a string specifying what characters are in the set
487      * @param pos on input, the position in pattern at which to start parsing.
488      * On output, the position after the last character parsed.
489      * @param symbols a symbol table mapping variables to char[] arrays
490      * and chars to UnicodeSets
491      * @param options a bitmask indicating which options to apply.
492      * Valid options are IGNORE_SPACE and CASE.
493      * @exception java.lang.IllegalArgumentException if the pattern
494      * contains a syntax error.
495      * @stable ICU 3.2
496      */
UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options)497     public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) {
498         this();
499         applyPattern(pattern, pos, symbols, options);
500     }
501 
502 
503     /**
504      * Return a new set that is equivalent to this one.
505      * @stable ICU 2.0
506      */
507     @Override
clone()508     public Object clone() {
509         if (isFrozen()) {
510             return this;
511         }
512         return new UnicodeSet(this);
513     }
514 
515     /**
516      * Make this object represent the range <code>start - end</code>.
517      * If <code>start &gt; end</code> then this object is set to an empty range.
518      *
519      * @param start first character in the set, inclusive
520      * @param end last character in the set, inclusive
521      * @stable ICU 2.0
522      */
set(int start, int end)523     public UnicodeSet set(int start, int end) {
524         checkFrozen();
525         clear();
526         complement(start, end);
527         return this;
528     }
529 
530     /**
531      * Make this object represent the same set as <code>other</code>.
532      * @param other a <code>UnicodeSet</code> whose value will be
533      * copied to this object
534      * @stable ICU 2.0
535      */
set(UnicodeSet other)536     public UnicodeSet set(UnicodeSet other) {
537         checkFrozen();
538         list = Arrays.copyOf(other.list, other.len);
539         len = other.len;
540         pat = other.pat;
541         if (other.hasStrings()) {
542             strings = new TreeSet<>(other.strings);
543         } else {
544             strings = EMPTY_STRINGS;
545         }
546         return this;
547     }
548 
549     /**
550      * Modifies this set to represent the set specified by the given pattern.
551      * See the class description for the syntax of the pattern language.
552      * Whitespace is ignored.
553      * @param pattern a string specifying what characters are in the set
554      * @exception java.lang.IllegalArgumentException if the pattern
555      * contains a syntax error.
556      * @stable ICU 2.0
557      */
applyPattern(String pattern)558     public final UnicodeSet applyPattern(String pattern) {
559         checkFrozen();
560         return applyPattern(pattern, null, null, IGNORE_SPACE);
561     }
562 
563     /**
564      * Modifies this set to represent the set specified by the given pattern,
565      * optionally ignoring whitespace.
566      * See the class description for the syntax of the pattern language.
567      * @param pattern a string specifying what characters are in the set
568      * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored
569      * @exception java.lang.IllegalArgumentException if the pattern
570      * contains a syntax error.
571      * @stable ICU 2.0
572      */
applyPattern(String pattern, boolean ignoreWhitespace)573     public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) {
574         checkFrozen();
575         return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
576     }
577 
578     /**
579      * Modifies this set to represent the set specified by the given pattern,
580      * optionally ignoring whitespace.
581      * See the class description for the syntax of the pattern language.
582      * @param pattern a string specifying what characters are in the set
583      * @param options a bitmask indicating which options to apply.
584      * Valid options are IGNORE_SPACE and CASE.
585      * @exception java.lang.IllegalArgumentException if the pattern
586      * contains a syntax error.
587      * @stable ICU 3.8
588      */
applyPattern(String pattern, int options)589     public UnicodeSet applyPattern(String pattern, int options) {
590         checkFrozen();
591         return applyPattern(pattern, null, null, options);
592     }
593 
594     /**
595      * Return true if the given position, in the given pattern, appears
596      * to be the start of a UnicodeSet pattern.
597      * @stable ICU 2.0
598      */
resemblesPattern(String pattern, int pos)599     public static boolean resemblesPattern(String pattern, int pos) {
600         return ((pos+1) < pattern.length() &&
601                 pattern.charAt(pos) == '[') ||
602                 resemblesPropertyPattern(pattern, pos);
603     }
604 
605     /**
606      * TODO: create Appendable version of UTF16.append(buf, c),
607      * maybe in new class Appendables?
608      * @throws IOException
609      */
appendCodePoint(Appendable app, int c)610     private static void appendCodePoint(Appendable app, int c) {
611         assert 0 <= c && c <= 0x10ffff;
612         try {
613             if (c <= 0xffff) {
614                 app.append((char) c);
615             } else {
616                 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c));
617             }
618         } catch (IOException e) {
619             throw new ICUUncheckedIOException(e);
620         }
621     }
622 
623     /**
624      * TODO: create class Appendables?
625      * @throws IOException
626      */
append(Appendable app, CharSequence s)627     private static void append(Appendable app, CharSequence s) {
628         try {
629             app.append(s);
630         } catch (IOException e) {
631             throw new ICUUncheckedIOException(e);
632         }
633     }
634 
635     /**
636      * Append the <code>toPattern()</code> representation of a
637      * string to the given <code>Appendable</code>.
638      */
_appendToPat(T buf, String s, boolean escapeUnprintable)639     private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) {
640         int cp;
641         for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
642             cp = s.codePointAt(i);
643             _appendToPat(buf, cp, escapeUnprintable);
644         }
645         return buf;
646     }
647 
648     /**
649      * Append the <code>toPattern()</code> representation of a
650      * character to the given <code>Appendable</code>.
651      */
_appendToPat(T buf, int c, boolean escapeUnprintable)652     private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) {
653         try {
654             if (escapeUnprintable && Utility.isUnprintable(c)) {
655                 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
656                 // unprintable
657                 if (Utility.escapeUnprintable(buf, c)) {
658                     return buf;
659                 }
660             }
661             // Okay to let ':' pass through
662             switch (c) {
663             case '[': // SET_OPEN:
664             case ']': // SET_CLOSE:
665             case '-': // HYPHEN:
666             case '^': // COMPLEMENT:
667             case '&': // INTERSECTION:
668             case '\\': //BACKSLASH:
669             case '{':
670             case '}':
671             case '$':
672             case ':':
673                 buf.append('\\');
674                 break;
675             default:
676                 // Escape whitespace
677                 if (PatternProps.isWhiteSpace(c)) {
678                     buf.append('\\');
679                 }
680                 break;
681             }
682             appendCodePoint(buf, c);
683             return buf;
684         } catch (IOException e) {
685             throw new ICUUncheckedIOException(e);
686         }
687     }
688 
689     /**
690      * Returns a string representation of this set.  If the result of
691      * calling this function is passed to a UnicodeSet constructor, it
692      * will produce another set that is equal to this one.
693      * @stable ICU 2.0
694      */
695     @Override
toPattern(boolean escapeUnprintable)696     public String toPattern(boolean escapeUnprintable) {
697         if (pat != null && !escapeUnprintable) {
698             return pat;
699         }
700         StringBuilder result = new StringBuilder();
701         return _toPattern(result, escapeUnprintable).toString();
702     }
703 
704     /**
705      * Append a string representation of this set to result.  This will be
706      * a cleaned version of the string passed to applyPattern(), if there
707      * is one.  Otherwise it will be generated.
708      */
_toPattern(T result, boolean escapeUnprintable)709     private <T extends Appendable> T _toPattern(T result,
710             boolean escapeUnprintable) {
711         if (pat == null) {
712             return appendNewPattern(result, escapeUnprintable, true);
713         }
714         try {
715             if (!escapeUnprintable) {
716                 result.append(pat);
717                 return result;
718             }
719             boolean oddNumberOfBackslashes = false;
720             for (int i=0; i<pat.length(); ) {
721                 int c = pat.codePointAt(i);
722                 i += Character.charCount(c);
723                 if (Utility.isUnprintable(c)) {
724                     // If the unprintable character is preceded by an odd
725                     // number of backslashes, then it has been escaped
726                     // and we omit the last backslash.
727                     Utility.escapeUnprintable(result, c);
728                     oddNumberOfBackslashes = false;
729                 } else if (!oddNumberOfBackslashes && c == '\\') {
730                     // Temporarily withhold an odd-numbered backslash.
731                     oddNumberOfBackslashes = true;
732                 } else {
733                     if (oddNumberOfBackslashes) {
734                         result.append('\\');
735                     }
736                     appendCodePoint(result, c);
737                     oddNumberOfBackslashes = false;
738                 }
739             }
740             if (oddNumberOfBackslashes) {
741                 result.append('\\');
742             }
743             return result;
744         } catch (IOException e) {
745             throw new ICUUncheckedIOException(e);
746         }
747     }
748 
749     /**
750      * Generate and append a string representation of this set to result.
751      * This does not use this.pat, the cleaned up copy of the string
752      * passed to applyPattern().
753      * @param result the buffer into which to generate the pattern
754      * @param escapeUnprintable escape unprintable characters if true
755      * @stable ICU 2.0
756      */
_generatePattern(StringBuffer result, boolean escapeUnprintable)757     public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) {
758         return _generatePattern(result, escapeUnprintable, true);
759     }
760 
761     /**
762      * Generate and append a string representation of this set to result.
763      * This does not use this.pat, the cleaned up copy of the string
764      * passed to applyPattern().
765      * @param includeStrings if false, doesn't include the strings.
766      * @stable ICU 3.8
767      */
_generatePattern(StringBuffer result, boolean escapeUnprintable, boolean includeStrings)768     public StringBuffer _generatePattern(StringBuffer result,
769             boolean escapeUnprintable, boolean includeStrings) {
770         return appendNewPattern(result, escapeUnprintable, includeStrings);
771     }
772 
appendNewPattern( T result, boolean escapeUnprintable, boolean includeStrings)773     private <T extends Appendable> T appendNewPattern(
774             T result, boolean escapeUnprintable, boolean includeStrings) {
775         try {
776             result.append('[');
777 
778             int count = getRangeCount();
779 
780             // If the set contains at least 2 intervals and includes both
781             // MIN_VALUE and MAX_VALUE, then the inverse representation will
782             // be more economical.
783             if (count > 1 &&
784                     getRangeStart(0) == MIN_VALUE &&
785                     getRangeEnd(count-1) == MAX_VALUE) {
786 
787                 // Emit the inverse
788                 result.append('^');
789 
790                 for (int i = 1; i < count; ++i) {
791                     int start = getRangeEnd(i-1)+1;
792                     int end = getRangeStart(i)-1;
793                     _appendToPat(result, start, escapeUnprintable);
794                     if (start != end) {
795                         if ((start+1) != end) {
796                             result.append('-');
797                         }
798                         _appendToPat(result, end, escapeUnprintable);
799                     }
800                 }
801             }
802 
803             // Default; emit the ranges as pairs
804             else {
805                 for (int i = 0; i < count; ++i) {
806                     int start = getRangeStart(i);
807                     int end = getRangeEnd(i);
808                     _appendToPat(result, start, escapeUnprintable);
809                     if (start != end) {
810                         if ((start+1) != end) {
811                             result.append('-');
812                         }
813                         _appendToPat(result, end, escapeUnprintable);
814                     }
815                 }
816             }
817 
818             if (includeStrings && hasStrings()) {
819                 for (String s : strings) {
820                     result.append('{');
821                     _appendToPat(result, s, escapeUnprintable);
822                     result.append('}');
823                 }
824             }
825             result.append(']');
826             return result;
827         } catch (IOException e) {
828             throw new ICUUncheckedIOException(e);
829         }
830     }
831 
hasStrings()832     boolean hasStrings() {
833         return !strings.isEmpty();
834     }
835 
836     /**
837      * Returns the number of elements in this set (its cardinality)
838      * Note than the elements of a set may include both individual
839      * codepoints and strings.
840      *
841      * @return the number of elements in this set (its cardinality).
842      * @stable ICU 2.0
843      */
size()844     public int size() {
845         int n = 0;
846         int count = getRangeCount();
847         for (int i = 0; i < count; ++i) {
848             n += getRangeEnd(i) - getRangeStart(i) + 1;
849         }
850         return n + strings.size();
851     }
852 
853     /**
854      * Returns <tt>true</tt> if this set contains no elements.
855      *
856      * @return <tt>true</tt> if this set contains no elements.
857      * @stable ICU 2.0
858      */
isEmpty()859     public boolean isEmpty() {
860         return len == 1 && !hasStrings();
861     }
862 
863     /**
864      * Implementation of UnicodeMatcher API.  Returns <tt>true</tt> if
865      * this set contains any character whose low byte is the given
866      * value.  This is used by <tt>RuleBasedTransliterator</tt> for
867      * indexing.
868      * @stable ICU 2.0
869      */
870     @Override
matchesIndexValue(int v)871     public boolean matchesIndexValue(int v) {
872         /* The index value v, in the range [0,255], is contained in this set if
873          * it is contained in any pair of this set.  Pairs either have the high
874          * bytes equal, or unequal.  If the high bytes are equal, then we have
875          * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
876          * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
877          * Then v is contained if xx <= v || v <= yy.  (This is identical to the
878          * time zone month containment logic.)
879          */
880         for (int i=0; i<getRangeCount(); ++i) {
881             int low = getRangeStart(i);
882             int high = getRangeEnd(i);
883             if ((low & ~0xFF) == (high & ~0xFF)) {
884                 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
885                     return true;
886                 }
887             } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
888                 return true;
889             }
890         }
891         if (hasStrings()) {
892             for (String s : strings) {
893                 if (s.isEmpty()) {
894                     continue;  // skip the empty string
895                 }
896                 int c = UTF16.charAt(s, 0);
897                 if ((c & 0xFF) == v) {
898                     return true;
899                 }
900             }
901         }
902         return false;
903     }
904 
905     /**
906      * Implementation of UnicodeMatcher.matches().  Always matches the
907      * longest possible multichar string.
908      * @stable ICU 2.0
909      */
910     @Override
matches(Replaceable text, int[] offset, int limit, boolean incremental)911     public int matches(Replaceable text,
912             int[] offset,
913             int limit,
914             boolean incremental) {
915 
916         if (offset[0] == limit) {
917             if (contains(UnicodeMatcher.ETHER)) {
918                 return incremental ? U_PARTIAL_MATCH : U_MATCH;
919             } else {
920                 return U_MISMATCH;
921             }
922         } else {
923             if (hasStrings()) { // try strings first
924 
925                 // might separate forward and backward loops later
926                 // for now they are combined
927 
928                 // TODO Improve efficiency of this, at least in the forward
929                 // direction, if not in both.  In the forward direction we
930                 // can assume the strings are sorted.
931 
932                 boolean forward = offset[0] < limit;
933 
934                 // firstChar is the leftmost char to match in the
935                 // forward direction or the rightmost char to match in
936                 // the reverse direction.
937                 char firstChar = text.charAt(offset[0]);
938 
939                 // If there are multiple strings that can match we
940                 // return the longest match.
941                 int highWaterLength = 0;
942 
943                 for (String trial : strings) {
944                     if (trial.isEmpty()) {
945                         continue;  // skip the empty string
946                     }
947 
948                     char c = trial.charAt(forward ? 0 : trial.length() - 1);
949 
950                     // Strings are sorted, so we can optimize in the
951                     // forward direction.
952                     if (forward && c > firstChar) break;
953                     if (c != firstChar) continue;
954 
955                     int length = matchRest(text, offset[0], limit, trial);
956 
957                     if (incremental) {
958                         int maxLen = forward ? limit-offset[0] : offset[0]-limit;
959                         if (length == maxLen) {
960                             // We have successfully matched but only up to limit.
961                             return U_PARTIAL_MATCH;
962                         }
963                     }
964 
965                     if (length == trial.length()) {
966                         // We have successfully matched the whole string.
967                         if (length > highWaterLength) {
968                             highWaterLength = length;
969                         }
970                         // In the forward direction we know strings
971                         // are sorted so we can bail early.
972                         if (forward && length < highWaterLength) {
973                             break;
974                         }
975                         continue;
976                     }
977                 }
978 
979                 // We've checked all strings without a partial match.
980                 // If we have full matches, return the longest one.
981                 if (highWaterLength != 0) {
982                     offset[0] += forward ? highWaterLength : -highWaterLength;
983                     return U_MATCH;
984                 }
985             }
986             return super.matches(text, offset, limit, incremental);
987         }
988     }
989 
990     /**
991      * Returns the longest match for s in text at the given position.
992      * If limit > start then match forward from start+1 to limit
993      * matching all characters except s.charAt(0).  If limit < start,
994      * go backward starting from start-1 matching all characters
995      * except s.charAt(s.length()-1).  This method assumes that the
996      * first character, text.charAt(start), matches s, so it does not
997      * check it.
998      * @param text the text to match
999      * @param start the first character to match.  In the forward
1000      * direction, text.charAt(start) is matched against s.charAt(0).
1001      * In the reverse direction, it is matched against
1002      * s.charAt(s.length()-1).
1003      * @param limit the limit offset for matching, either last+1 in
1004      * the forward direction, or last-1 in the reverse direction,
1005      * where last is the index of the last character to match.
1006      * @return If part of s matches up to the limit, return |limit -
1007      * start|.  If all of s matches before reaching the limit, return
1008      * s.length().  If there is a mismatch between s and text, return
1009      * 0
1010      */
matchRest(Replaceable text, int start, int limit, String s)1011     private static int matchRest (Replaceable text, int start, int limit, String s) {
1012         int maxLen;
1013         int slen = s.length();
1014         if (start < limit) {
1015             maxLen = limit - start;
1016             if (maxLen > slen) maxLen = slen;
1017             for (int i = 1; i < maxLen; ++i) {
1018                 if (text.charAt(start + i) != s.charAt(i)) return 0;
1019             }
1020         } else {
1021             maxLen = start - limit;
1022             if (maxLen > slen) maxLen = slen;
1023             --slen; // <=> slen = s.length() - 1;
1024             for (int i = 1; i < maxLen; ++i) {
1025                 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
1026             }
1027         }
1028         return maxLen;
1029     }
1030 
1031     /**
1032      * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1.
1033      * @internal
1034      * @deprecated This API is ICU internal only.
1035      */
1036     @Deprecated
matchesAt(CharSequence text, int offset)1037     public int matchesAt(CharSequence text, int offset) {
1038         int lastLen = -1;
1039         strings:
1040             if (hasStrings()) {
1041                 char firstChar = text.charAt(offset);
1042                 String trial = null;
1043                 // find the first string starting with firstChar
1044                 Iterator<String> it = strings.iterator();
1045                 while (it.hasNext()) {
1046                     trial = it.next();
1047                     char firstStringChar = trial.charAt(0);
1048                     if (firstStringChar < firstChar) continue;
1049                     if (firstStringChar > firstChar) break strings;
1050                 }
1051 
1052                 // now keep checking string until we get the longest one
1053                 for (;;) {
1054                     int tempLen = matchesAt(text, offset, trial);
1055                     if (lastLen > tempLen) break strings;
1056                     lastLen = tempLen;
1057                     if (!it.hasNext()) break;
1058                     trial = it.next();
1059                 }
1060             }
1061 
1062         if (lastLen < 2) {
1063             int cp = UTF16.charAt(text, offset);
1064             if (contains(cp)) lastLen = UTF16.getCharCount(cp);
1065         }
1066 
1067         return offset+lastLen;
1068     }
1069 
1070     /**
1071      * Does one string contain another, starting at a specific offset?
1072      * @param text text to match
1073      * @param offsetInText offset within that text
1074      * @param substring substring to match at offset in text
1075      * @return -1 if match fails, otherwise other.length()
1076      */
1077     // Note: This method was moved from CollectionUtilities
matchesAt(CharSequence text, int offsetInText, CharSequence substring)1078     private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) {
1079         int len = substring.length();
1080         int textLength = text.length();
1081         if (textLength + offsetInText > len) {
1082             return -1;
1083         }
1084         int i = 0;
1085         for (int j = offsetInText; i < len; ++i, ++j) {
1086             char pc = substring.charAt(i);
1087             char tc = text.charAt(j);
1088             if (pc != tc) return -1;
1089         }
1090         return i;
1091     }
1092 
1093     /**
1094      * Implementation of UnicodeMatcher API.  Union the set of all
1095      * characters that may be matched by this object into the given
1096      * set.
1097      * @param toUnionTo the set into which to union the source characters
1098      * @stable ICU 2.2
1099      */
1100     @Override
addMatchSetTo(UnicodeSet toUnionTo)1101     public void addMatchSetTo(UnicodeSet toUnionTo) {
1102         toUnionTo.addAll(this);
1103     }
1104 
1105     /**
1106      * Returns the index of the given character within this set, where
1107      * the set is ordered by ascending code point.  If the character
1108      * is not in this set, return -1.  The inverse of this method is
1109      * <code>charAt()</code>.
1110      * @return an index from 0..size()-1, or -1
1111      * @stable ICU 2.0
1112      */
indexOf(int c)1113     public int indexOf(int c) {
1114         if (c < MIN_VALUE || c > MAX_VALUE) {
1115             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
1116         }
1117         int i = 0;
1118         int n = 0;
1119         for (;;) {
1120             int start = list[i++];
1121             if (c < start) {
1122                 return -1;
1123             }
1124             int limit = list[i++];
1125             if (c < limit) {
1126                 return n + c - start;
1127             }
1128             n += limit - start;
1129         }
1130     }
1131 
1132     /**
1133      * Returns the character at the given index within this set, where
1134      * the set is ordered by ascending code point.  If the index is
1135      * out of range, return -1.  The inverse of this method is
1136      * <code>indexOf()</code>.
1137      * @param index an index from 0..size()-1
1138      * @return the character at the given index, or -1.
1139      * @stable ICU 2.0
1140      */
charAt(int index)1141     public int charAt(int index) {
1142         if (index >= 0) {
1143             // len2 is the largest even integer <= len, that is, it is len
1144             // for even values and len-1 for odd values.  With odd values
1145             // the last entry is UNICODESET_HIGH.
1146             int len2 = len & ~1;
1147             for (int i=0; i < len2;) {
1148                 int start = list[i++];
1149                 int count = list[i++] - start;
1150                 if (index < count) {
1151                     return start + index;
1152                 }
1153                 index -= count;
1154             }
1155         }
1156         return -1;
1157     }
1158 
1159     /**
1160      * Adds the specified range to this set if it is not already
1161      * present.  If this set already contains the specified range,
1162      * the call leaves this set unchanged.  If <code>start &gt; end</code>
1163      * then an empty range is added, leaving the set unchanged.
1164      *
1165      * @param start first character, inclusive, of range to be added
1166      * to this set.
1167      * @param end last character, inclusive, of range to be added
1168      * to this set.
1169      * @stable ICU 2.0
1170      */
add(int start, int end)1171     public UnicodeSet add(int start, int end) {
1172         checkFrozen();
1173         return add_unchecked(start, end);
1174     }
1175 
1176     /**
1177      * Adds all characters in range (uses preferred naming convention).
1178      * @param start The index of where to start on adding all characters.
1179      * @param end The index of where to end on adding all characters.
1180      * @return a reference to this object
1181      * @stable ICU 4.4
1182      */
addAll(int start, int end)1183     public UnicodeSet addAll(int start, int end) {
1184         checkFrozen();
1185         return add_unchecked(start, end);
1186     }
1187 
1188     // for internal use, after checkFrozen has been called
add_unchecked(int start, int end)1189     private UnicodeSet add_unchecked(int start, int end) {
1190         if (start < MIN_VALUE || start > MAX_VALUE) {
1191             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1192         }
1193         if (end < MIN_VALUE || end > MAX_VALUE) {
1194             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1195         }
1196         if (start < end) {
1197             int limit = end + 1;
1198             // Fast path for adding a new range after the last one.
1199             // Odd list length: [..., lastStart, lastLimit, HIGH]
1200             if ((len & 1) != 0) {
1201                 // If the list is empty, set lastLimit low enough to not be adjacent to 0.
1202                 int lastLimit = len == 1 ? -2 : list[len - 2];
1203                 if (lastLimit <= start) {
1204                     checkFrozen();
1205                     if (lastLimit == start) {
1206                         // Extend the last range.
1207                         list[len - 2] = limit;
1208                         if (limit == HIGH) {
1209                             --len;
1210                         }
1211                     } else {
1212                         list[len - 1] = start;
1213                         if (limit < HIGH) {
1214                             ensureCapacity(len + 2);
1215                             list[len++] = limit;
1216                             list[len++] = HIGH;
1217                         } else {  // limit == HIGH
1218                             ensureCapacity(len + 1);
1219                             list[len++] = HIGH;
1220                         }
1221                     }
1222                     pat = null;
1223                     return this;
1224                 }
1225             }
1226             // This is slow. Could be much faster using findCodePoint(start)
1227             // and modifying the list, dealing with adjacent & overlapping ranges.
1228             add(range(start, end), 2, 0);
1229         } else if (start == end) {
1230             add(start);
1231         }
1232         return this;
1233     }
1234 
1235     //    /**
1236     //     * Format out the inversion list as a string, for debugging.  Uncomment when
1237     //     * needed.
1238     //     */
1239     //    public final String dump() {
1240     //        StringBuffer buf = new StringBuffer("[");
1241     //        for (int i=0; i<len; ++i) {
1242     //            if (i != 0) buf.append(", ");
1243     //            int c = list[i];
1244     //            //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
1245     //            //    buf.append((char) c);
1246     //            //} else {
1247     //                buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
1248     //            //}
1249     //        }
1250     //        buf.append("]");
1251     //        return buf.toString();
1252     //    }
1253 
1254     /**
1255      * Adds the specified character to this set if it is not already
1256      * present.  If this set already contains the specified character,
1257      * the call leaves this set unchanged.
1258      * @stable ICU 2.0
1259      */
add(int c)1260     public final UnicodeSet add(int c) {
1261         checkFrozen();
1262         return add_unchecked(c);
1263     }
1264 
1265     // for internal use only, after checkFrozen has been called
add_unchecked(int c)1266     private final UnicodeSet add_unchecked(int c) {
1267         if (c < MIN_VALUE || c > MAX_VALUE) {
1268             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
1269         }
1270 
1271         // find smallest i such that c < list[i]
1272         // if odd, then it is IN the set
1273         // if even, then it is OUT of the set
1274         int i = findCodePoint(c);
1275 
1276         // already in set?
1277         if ((i & 1) != 0) return this;
1278 
1279         // HIGH is 0x110000
1280         // assert(list[len-1] == HIGH);
1281 
1282         // empty = [HIGH]
1283         // [start_0, limit_0, start_1, limit_1, HIGH]
1284 
1285         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
1286         //                             ^
1287         //                             list[i]
1288 
1289         // i == 0 means c is before the first range
1290         // TODO: Is the "list[i]-1" a typo? Even if you pass MAX_VALUE into
1291         //      add_unchecked, the maximum value that "c" will be compared to
1292         //      is "MAX_VALUE-1" meaning that "if (c == MAX_VALUE)" will
1293         //      never be reached according to this logic.
1294         if (c == list[i]-1) {
1295             // c is before start of next range
1296             list[i] = c;
1297             // if we touched the HIGH mark, then add a new one
1298             if (c == MAX_VALUE) {
1299                 ensureCapacity(len+1);
1300                 list[len++] = HIGH;
1301             }
1302             if (i > 0 && c == list[i-1]) {
1303                 // collapse adjacent ranges
1304 
1305                 // [..., start_k-1, c, c, limit_k, ..., HIGH]
1306                 //                     ^
1307                 //                     list[i]
1308                 System.arraycopy(list, i+1, list, i-1, len-i-1);
1309                 len -= 2;
1310             }
1311         }
1312 
1313         else if (i > 0 && c == list[i-1]) {
1314             // c is after end of prior range
1315             list[i-1]++;
1316             // no need to check for collapse here
1317         }
1318 
1319         else {
1320             // At this point we know the new char is not adjacent to
1321             // any existing ranges, and it is not 10FFFF.
1322 
1323 
1324             // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
1325             //                             ^
1326             //                             list[i]
1327 
1328             // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
1329             //                             ^
1330             //                             list[i]
1331 
1332             // Don't use ensureCapacity() to save on copying.
1333             // NOTE: This has no measurable impact on performance,
1334             // but it might help in some usage patterns.
1335             if (len+2 > list.length) {
1336                 int[] temp = new int[nextCapacity(len + 2)];
1337                 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
1338                 System.arraycopy(list, i, temp, i+2, len-i);
1339                 list = temp;
1340             } else {
1341                 System.arraycopy(list, i, list, i+2, len-i);
1342             }
1343 
1344             list[i] = c;
1345             list[i+1] = c+1;
1346             len += 2;
1347         }
1348 
1349         pat = null;
1350         return this;
1351     }
1352 
1353     /**
1354      * Adds the specified multicharacter to this set if it is not already
1355      * present.  If this set already contains the multicharacter,
1356      * the call leaves this set unchanged.
1357      * Thus "ch" =&gt; {"ch"}
1358      *
1359      * @param s the source string
1360      * @return this object, for chaining
1361      * @stable ICU 2.0
1362      */
add(CharSequence s)1363     public final UnicodeSet add(CharSequence s) {
1364         checkFrozen();
1365         int cp = getSingleCP(s);
1366         if (cp < 0) {
1367             String str = s.toString();
1368             if (!strings.contains(str)) {
1369                 addString(str);
1370                 pat = null;
1371             }
1372         } else {
1373             add_unchecked(cp, cp);
1374         }
1375         return this;
1376     }
1377 
addString(CharSequence s)1378     private void addString(CharSequence s) {
1379         if (strings == EMPTY_STRINGS) {
1380             strings = new TreeSet<>();
1381         }
1382         strings.add(s.toString());
1383     }
1384 
1385     /**
1386      * Utility for getting code point from single code point CharSequence.
1387      * See the public UTF16.getSingleCodePoint() (which returns -1 for null rather than throwing NPE).
1388      *
1389      * @return a code point IF the string consists of a single one.
1390      * otherwise returns -1.
1391      * @param s to test
1392      */
getSingleCP(CharSequence s)1393     private static int getSingleCP(CharSequence s) {
1394         if (s.length() == 1) return s.charAt(0);
1395         if (s.length() == 2) {
1396             int cp = Character.codePointAt(s, 0);
1397             if (cp > 0xFFFF) { // is surrogate pair
1398                 return cp;
1399             }
1400         }
1401         return -1;
1402     }
1403 
1404     /**
1405      * Adds each of the characters in this string to the set. Thus "ch" =&gt; {"c", "h"}
1406      * If this set already any particular character, it has no effect on that character.
1407      * @param s the source string
1408      * @return this object, for chaining
1409      * @stable ICU 2.0
1410      */
addAll(CharSequence s)1411     public final UnicodeSet addAll(CharSequence s) {
1412         checkFrozen();
1413         int cp;
1414         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1415             cp = UTF16.charAt(s, i);
1416             add_unchecked(cp, cp);
1417         }
1418         return this;
1419     }
1420 
1421     /**
1422      * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1423      * If this set already any particular character, it has no effect on that character.
1424      * @param s the source string
1425      * @return this object, for chaining
1426      * @stable ICU 2.0
1427      */
retainAll(CharSequence s)1428     public final UnicodeSet retainAll(CharSequence s) {
1429         return retainAll(fromAll(s));
1430     }
1431 
1432     /**
1433      * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1434      * If this set already any particular character, it has no effect on that character.
1435      * @param s the source string
1436      * @return this object, for chaining
1437      * @stable ICU 2.0
1438      */
complementAll(CharSequence s)1439     public final UnicodeSet complementAll(CharSequence s) {
1440         return complementAll(fromAll(s));
1441     }
1442 
1443     /**
1444      * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1445      * If this set already any particular character, it has no effect on that character.
1446      * @param s the source string
1447      * @return this object, for chaining
1448      * @stable ICU 2.0
1449      */
removeAll(CharSequence s)1450     public final UnicodeSet removeAll(CharSequence s) {
1451         return removeAll(fromAll(s));
1452     }
1453 
1454     /**
1455      * Remove all strings from this UnicodeSet
1456      * @return this object, for chaining
1457      * @stable ICU 4.2
1458      */
removeAllStrings()1459     public final UnicodeSet removeAllStrings() {
1460         checkFrozen();
1461         if (hasStrings()) {
1462             strings.clear();
1463             pat = null;
1464         }
1465         return this;
1466     }
1467 
1468     /**
1469      * Makes a set from a multicharacter string. Thus "ch" =&gt; {"ch"}
1470      *
1471      * @param s the source string
1472      * @return a newly created set containing the given string
1473      * @stable ICU 2.0
1474      */
from(CharSequence s)1475     public static UnicodeSet from(CharSequence s) {
1476         return new UnicodeSet().add(s);
1477     }
1478 
1479 
1480     /**
1481      * Makes a set from each of the characters in the string. Thus "ch" =&gt; {"c", "h"}
1482      * @param s the source string
1483      * @return a newly created set containing the given characters
1484      * @stable ICU 2.0
1485      */
fromAll(CharSequence s)1486     public static UnicodeSet fromAll(CharSequence s) {
1487         return new UnicodeSet().addAll(s);
1488     }
1489 
1490 
1491     /**
1492      * Retain only the elements in this set that are contained in the
1493      * specified range.  If <code>start &gt; end</code> then an empty range is
1494      * retained, leaving the set empty.
1495      *
1496      * @param start first character, inclusive, of range
1497      * @param end last character, inclusive, of range
1498      * @stable ICU 2.0
1499      */
retain(int start, int end)1500     public UnicodeSet retain(int start, int end) {
1501         checkFrozen();
1502         if (start < MIN_VALUE || start > MAX_VALUE) {
1503             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1504         }
1505         if (end < MIN_VALUE || end > MAX_VALUE) {
1506             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1507         }
1508         if (start <= end) {
1509             retain(range(start, end), 2, 0);
1510         } else {
1511             clear();
1512         }
1513         return this;
1514     }
1515 
1516     /**
1517      * Retain the specified character from this set if it is present.
1518      * Upon return this set will be empty if it did not contain c, or
1519      * will only contain c if it did contain c.
1520      * @param c the character to be retained
1521      * @return this object, for chaining
1522      * @stable ICU 2.0
1523      */
retain(int c)1524     public final UnicodeSet retain(int c) {
1525         return retain(c, c);
1526     }
1527 
1528     /**
1529      * Retain the specified string in this set if it is present.
1530      * Upon return this set will be empty if it did not contain s, or
1531      * will only contain s if it did contain s.
1532      * @param cs the string to be retained
1533      * @return this object, for chaining
1534      * @stable ICU 2.0
1535      */
retain(CharSequence cs)1536     public final UnicodeSet retain(CharSequence cs) {
1537         int cp = getSingleCP(cs);
1538         if (cp < 0) {
1539             checkFrozen();
1540             String s = cs.toString();
1541             boolean isIn = strings.contains(s);
1542             // Check for getRangeCount() first to avoid somewhat-expensive size()
1543             // when there are single code points.
1544             if (isIn && getRangeCount() == 0 && size() == 1) {
1545                 return this;
1546             }
1547             clear();
1548             if (isIn) {
1549                 addString(s);
1550             }
1551             pat = null;
1552         } else {
1553             retain(cp, cp);
1554         }
1555         return this;
1556     }
1557 
1558     /**
1559      * Removes the specified range from this set if it is present.
1560      * The set will not contain the specified range once the call
1561      * returns.  If <code>start &gt; end</code> then an empty range is
1562      * removed, leaving the set unchanged.
1563      *
1564      * @param start first character, inclusive, of range to be removed
1565      * from this set.
1566      * @param end last character, inclusive, of range to be removed
1567      * from this set.
1568      * @stable ICU 2.0
1569      */
remove(int start, int end)1570     public UnicodeSet remove(int start, int end) {
1571         checkFrozen();
1572         if (start < MIN_VALUE || start > MAX_VALUE) {
1573             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1574         }
1575         if (end < MIN_VALUE || end > MAX_VALUE) {
1576             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1577         }
1578         if (start <= end) {
1579             retain(range(start, end), 2, 2);
1580         }
1581         return this;
1582     }
1583 
1584     /**
1585      * Removes the specified character from this set if it is present.
1586      * The set will not contain the specified character once the call
1587      * returns.
1588      * @param c the character to be removed
1589      * @return this object, for chaining
1590      * @stable ICU 2.0
1591      */
remove(int c)1592     public final UnicodeSet remove(int c) {
1593         return remove(c, c);
1594     }
1595 
1596     /**
1597      * Removes the specified string from this set if it is present.
1598      * The set will not contain the specified string once the call
1599      * returns.
1600      * @param s the string to be removed
1601      * @return this object, for chaining
1602      * @stable ICU 2.0
1603      */
remove(CharSequence s)1604     public final UnicodeSet remove(CharSequence s) {
1605         int cp = getSingleCP(s);
1606         if (cp < 0) {
1607             checkFrozen();
1608             String str = s.toString();
1609             if (strings.contains(str)) {
1610                 strings.remove(str);
1611                 pat = null;
1612             }
1613         } else {
1614             remove(cp, cp);
1615         }
1616         return this;
1617     }
1618 
1619     /**
1620      * Complements the specified range in this set.  Any character in
1621      * the range will be removed if it is in this set, or will be
1622      * added if it is not in this set.  If <code>start &gt; end</code>
1623      * then an empty range is complemented, leaving the set unchanged.
1624      *
1625      * @param start first character, inclusive, of range
1626      * @param end last character, inclusive, of range
1627      * @stable ICU 2.0
1628      */
complement(int start, int end)1629     public UnicodeSet complement(int start, int end) {
1630         checkFrozen();
1631         if (start < MIN_VALUE || start > MAX_VALUE) {
1632             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1633         }
1634         if (end < MIN_VALUE || end > MAX_VALUE) {
1635             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1636         }
1637         if (start <= end) {
1638             xor(range(start, end), 2, 0);
1639         }
1640         pat = null;
1641         return this;
1642     }
1643 
1644     /**
1645      * Complements the specified character in this set.  The character
1646      * will be removed if it is in this set, or will be added if it is
1647      * not in this set.
1648      * @stable ICU 2.0
1649      */
complement(int c)1650     public final UnicodeSet complement(int c) {
1651         return complement(c, c);
1652     }
1653 
1654     /**
1655      * This is equivalent to
1656      * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1657      * @stable ICU 2.0
1658      */
complement()1659     public UnicodeSet complement() {
1660         checkFrozen();
1661         if (list[0] == LOW) {
1662             System.arraycopy(list, 1, list, 0, len-1);
1663             --len;
1664         } else {
1665             ensureCapacity(len+1);
1666             System.arraycopy(list, 0, list, 1, len);
1667             list[0] = LOW;
1668             ++len;
1669         }
1670         pat = null;
1671         return this;
1672     }
1673 
1674     /**
1675      * Complement the specified string in this set.
1676      * The set will not contain the specified string once the call
1677      * returns.
1678      *
1679      * @param s the string to complement
1680      * @return this object, for chaining
1681      * @stable ICU 2.0
1682      */
complement(CharSequence s)1683     public final UnicodeSet complement(CharSequence s) {
1684         checkFrozen();
1685         int cp = getSingleCP(s);
1686         if (cp < 0) {
1687             String s2 = s.toString();
1688             if (strings.contains(s2)) {
1689                 strings.remove(s2);
1690             } else {
1691                 addString(s2);
1692             }
1693             pat = null;
1694         } else {
1695             complement(cp, cp);
1696         }
1697         return this;
1698     }
1699 
1700     /**
1701      * Returns true if this set contains the given character.
1702      * @param c character to be checked for containment
1703      * @return true if the test condition is met
1704      * @stable ICU 2.0
1705      */
1706     @Override
contains(int c)1707     public boolean contains(int c) {
1708         if (c < MIN_VALUE || c > MAX_VALUE) {
1709             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
1710         }
1711         if (bmpSet != null) {
1712             return bmpSet.contains(c);
1713         }
1714         if (stringSpan != null) {
1715             return stringSpan.contains(c);
1716         }
1717 
1718         /*
1719         // Set i to the index of the start item greater than ch
1720         // We know we will terminate without length test!
1721         int i = -1;
1722         while (true) {
1723             if (c < list[++i]) break;
1724         }
1725          */
1726 
1727         int i = findCodePoint(c);
1728 
1729         return ((i & 1) != 0); // return true if odd
1730     }
1731 
1732     /**
1733      * Returns the smallest value i such that c < list[i].  Caller
1734      * must ensure that c is a legal value or this method will enter
1735      * an infinite loop.  This method performs a binary search.
1736      * @param c a character in the range MIN_VALUE..MAX_VALUE
1737      * inclusive
1738      * @return the smallest integer i in the range 0..len-1,
1739      * inclusive, such that c < list[i]
1740      */
findCodePoint(int c)1741     private final int findCodePoint(int c) {
1742         /* Examples:
1743                                            findCodePoint(c)
1744            set              list[]         c=0 1 3 4 7 8
1745            ===              ==============   ===========
1746            []               [110000]         0 0 0 0 0 0
1747            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
1748            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
1749            [:all:]          [0, 110000]      1 1 1 1 1 1
1750          */
1751 
1752         // Return the smallest i such that c < list[i].  Assume
1753         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
1754         if (c < list[0]) return 0;
1755         // High runner test.  c is often after the last range, so an
1756         // initial check for this condition pays off.
1757         if (len >= 2 && c >= list[len-2]) return len-1;
1758         int lo = 0;
1759         int hi = len - 1;
1760         // invariant: c >= list[lo]
1761         // invariant: c < list[hi]
1762         for (;;) {
1763             int i = (lo + hi) >>> 1;
1764         if (i == lo) return hi;
1765         if (c < list[i]) {
1766             hi = i;
1767         } else {
1768             lo = i;
1769         }
1770         }
1771     }
1772 
1773     //    //----------------------------------------------------------------
1774     //    // Unrolled binary search
1775     //    //----------------------------------------------------------------
1776     //
1777     //    private int validLen = -1; // validated value of len
1778     //    private int topOfLow;
1779     //    private int topOfHigh;
1780     //    private int power;
1781     //    private int deltaStart;
1782     //
1783     //    private void validate() {
1784     //        if (len <= 1) {
1785     //            throw new IllegalArgumentException("list.len==" + len + "; must be >1");
1786     //        }
1787     //
1788     //        // find greatest power of 2 less than or equal to len
1789     //        for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
1790     //
1791     //        // assert(exp2[power] <= len);
1792     //
1793     //        // determine the starting points
1794     //        topOfLow = exp2[power] - 1;
1795     //        topOfHigh = len - 1;
1796     //        deltaStart = exp2[power-1];
1797     //        validLen = len;
1798     //    }
1799     //
1800     //    private static final int exp2[] = {
1801     //        0x1, 0x2, 0x4, 0x8,
1802     //        0x10, 0x20, 0x40, 0x80,
1803     //        0x100, 0x200, 0x400, 0x800,
1804     //        0x1000, 0x2000, 0x4000, 0x8000,
1805     //        0x10000, 0x20000, 0x40000, 0x80000,
1806     //        0x100000, 0x200000, 0x400000, 0x800000,
1807     //        0x1000000, 0x2000000, 0x4000000, 0x8000000,
1808     //        0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
1809     //    };
1810     //
1811     //    /**
1812     //     * Unrolled lowest index GT.
1813     //     */
1814     //    private final int leastIndexGT(int searchValue) {
1815     //
1816     //        if (len != validLen) {
1817     //            if (len == 1) return 0;
1818     //            validate();
1819     //        }
1820     //        int temp;
1821     //
1822     //        // set up initial range to search. Each subrange is a power of two in length
1823     //        int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
1824     //
1825     //        // Completely unrolled binary search, folhighing "Programming Pearls"
1826     //        // Each case deliberately falls through to the next
1827     //        // Logically, list[-1] < all_search_values && list[count] > all_search_values
1828     //        // although the values -1 and count are never actually touched.
1829     //
1830     //        // The bounds at each point are low & high,
1831     //        // where low == high - delta*2
1832     //        // so high - delta is the midpoint
1833     //
1834     //        // The invariant AFTER each line is that list[low] < searchValue <= list[high]
1835     //
1836     //        switch (power) {
1837     //        //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
1838     //        case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
1839     //        case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
1840     //
1841     //        case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
1842     //        case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
1843     //        case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
1844     //        case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
1845     //
1846     //        case 24: if (searchValue < list[temp = high-  0x800000]) high = temp;
1847     //        case 23: if (searchValue < list[temp = high-  0x400000]) high = temp;
1848     //        case 22: if (searchValue < list[temp = high-  0x200000]) high = temp;
1849     //        case 21: if (searchValue < list[temp = high-  0x100000]) high = temp;
1850     //
1851     //        case 20: if (searchValue < list[temp = high-   0x80000]) high = temp;
1852     //        case 19: if (searchValue < list[temp = high-   0x40000]) high = temp;
1853     //        case 18: if (searchValue < list[temp = high-   0x20000]) high = temp;
1854     //        case 17: if (searchValue < list[temp = high-   0x10000]) high = temp;
1855     //
1856     //        case 16: if (searchValue < list[temp = high-    0x8000]) high = temp;
1857     //        case 15: if (searchValue < list[temp = high-    0x4000]) high = temp;
1858     //        case 14: if (searchValue < list[temp = high-    0x2000]) high = temp;
1859     //        case 13: if (searchValue < list[temp = high-    0x1000]) high = temp;
1860     //
1861     //        case 12: if (searchValue < list[temp = high-     0x800]) high = temp;
1862     //        case 11: if (searchValue < list[temp = high-     0x400]) high = temp;
1863     //        case 10: if (searchValue < list[temp = high-     0x200]) high = temp;
1864     //        case  9: if (searchValue < list[temp = high-     0x100]) high = temp;
1865     //
1866     //        case  8: if (searchValue < list[temp = high-      0x80]) high = temp;
1867     //        case  7: if (searchValue < list[temp = high-      0x40]) high = temp;
1868     //        case  6: if (searchValue < list[temp = high-      0x20]) high = temp;
1869     //        case  5: if (searchValue < list[temp = high-      0x10]) high = temp;
1870     //
1871     //        case  4: if (searchValue < list[temp = high-       0x8]) high = temp;
1872     //        case  3: if (searchValue < list[temp = high-       0x4]) high = temp;
1873     //        case  2: if (searchValue < list[temp = high-       0x2]) high = temp;
1874     //        case  1: if (searchValue < list[temp = high-       0x1]) high = temp;
1875     //        }
1876     //
1877     //        return high;
1878     //    }
1879     //
1880     //    // For debugging only
1881     //    public int len() {
1882     //        return len;
1883     //    }
1884     //
1885     //    //----------------------------------------------------------------
1886     //    //----------------------------------------------------------------
1887 
1888     /**
1889      * Returns true if this set contains every character
1890      * of the given range.
1891      * @param start first character, inclusive, of the range
1892      * @param end last character, inclusive, of the range
1893      * @return true if the test condition is met
1894      * @stable ICU 2.0
1895      */
contains(int start, int end)1896     public boolean contains(int start, int end) {
1897         if (start < MIN_VALUE || start > MAX_VALUE) {
1898             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1899         }
1900         if (end < MIN_VALUE || end > MAX_VALUE) {
1901             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1902         }
1903         //int i = -1;
1904         //while (true) {
1905         //    if (start < list[++i]) break;
1906         //}
1907         int i = findCodePoint(start);
1908         return ((i & 1) != 0 && end < list[i]);
1909     }
1910 
1911     /**
1912      * Returns <tt>true</tt> if this set contains the given
1913      * multicharacter string.
1914      * @param s string to be checked for containment
1915      * @return <tt>true</tt> if this set contains the specified string
1916      * @stable ICU 2.0
1917      */
contains(CharSequence s)1918     public final boolean contains(CharSequence s) {
1919 
1920         int cp = getSingleCP(s);
1921         if (cp < 0) {
1922             return strings.contains(s.toString());
1923         } else {
1924             return contains(cp);
1925         }
1926     }
1927 
1928     /**
1929      * Returns true if this set contains all the characters and strings
1930      * of the given set.
1931      * @param b set to be checked for containment
1932      * @return true if the test condition is met
1933      * @stable ICU 2.0
1934      */
containsAll(UnicodeSet b)1935     public boolean containsAll(UnicodeSet b) {
1936         // The specified set is a subset if all of its pairs are contained in
1937         // this set. This implementation accesses the lists directly for speed.
1938         // TODO: this could be faster if size() were cached. But that would affect building speed
1939         // so it needs investigation.
1940         int[] listB = b.list;
1941         boolean needA = true;
1942         boolean needB = true;
1943         int aPtr = 0;
1944         int bPtr = 0;
1945         int aLen = len - 1;
1946         int bLen = b.len - 1;
1947         int startA = 0, startB = 0, limitA = 0, limitB = 0;
1948         while (true) {
1949             // double iterations are such a pain...
1950             if (needA) {
1951                 if (aPtr >= aLen) {
1952                     // ran out of A. If B is also exhausted, then break;
1953                     if (needB && bPtr >= bLen) {
1954                         break;
1955                     }
1956                     return false;
1957                 }
1958                 startA = list[aPtr++];
1959                 limitA = list[aPtr++];
1960             }
1961             if (needB) {
1962                 if (bPtr >= bLen) {
1963                     // ran out of B. Since we got this far, we have an A and we are ok so far
1964                     break;
1965                 }
1966                 startB = listB[bPtr++];
1967                 limitB = listB[bPtr++];
1968             }
1969             // if B doesn't overlap and is greater than A, get new A
1970             if (startB >= limitA) {
1971                 needA = true;
1972                 needB = false;
1973                 continue;
1974             }
1975             // if B is wholy contained in A, then get a new B
1976             if (startB >= startA && limitB <= limitA) {
1977                 needA = false;
1978                 needB = true;
1979                 continue;
1980             }
1981             // all other combinations mean we fail
1982             return false;
1983         }
1984 
1985         if (!strings.containsAll(b.strings)) return false;
1986         return true;
1987     }
1988 
1989     //    /**
1990     //     * Returns true if this set contains all the characters and strings
1991     //     * of the given set.
1992     //     * @param c set to be checked for containment
1993     //     * @return true if the test condition is met
1994     //     * @stable ICU 2.0
1995     //     */
1996     //    public boolean containsAllOld(UnicodeSet c) {
1997     //        // The specified set is a subset if all of its pairs are contained in
1998     //        // this set.  It's possible to code this more efficiently in terms of
1999     //        // direct manipulation of the inversion lists if the need arises.
2000     //        int n = c.getRangeCount();
2001     //        for (int i=0; i<n; ++i) {
2002     //            if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
2003     //                return false;
2004     //            }
2005     //        }
2006     //        if (!strings.containsAll(c.strings)) return false;
2007     //        return true;
2008     //    }
2009 
2010     /**
2011      * Returns true if there is a partition of the string such that this set contains each of the partitioned strings.
2012      * For example, for the Unicode set [a{bc}{cd}]<br>
2013      * containsAll is true for each of: "a", "bc", ""cdbca"<br>
2014      * containsAll is false for each of: "acb", "bcda", "bcx"<br>
2015      * @param s string containing characters to be checked for containment
2016      * @return true if the test condition is met
2017      * @stable ICU 2.0
2018      */
containsAll(String s)2019     public boolean containsAll(String s) {
2020         int cp;
2021         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
2022             cp = UTF16.charAt(s, i);
2023             if (!contains(cp))  {
2024                 if (!hasStrings()) {
2025                     return false;
2026                 }
2027                 return containsAll(s, 0);
2028             }
2029         }
2030         return true;
2031     }
2032 
2033     /**
2034      * Recursive routine called if we fail to find a match in containsAll, and there are strings
2035      * @param s source string
2036      * @param i point to match to the end on
2037      * @return true if ok
2038      */
containsAll(String s, int i)2039     private boolean containsAll(String s, int i) {
2040         if (i >= s.length()) {
2041             return true;
2042         }
2043         int  cp= UTF16.charAt(s, i);
2044         if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) {
2045             return true;
2046         }
2047         for (String setStr : strings) {
2048             if (!setStr.isEmpty() &&  // skip the empty string
2049                     s.startsWith(setStr, i) &&  containsAll(s, i+setStr.length())) {
2050                 return true;
2051             }
2052         }
2053         return false;
2054 
2055     }
2056 
2057     /**
2058      * Get the Regex equivalent for this UnicodeSet
2059      * @return regex pattern equivalent to this UnicodeSet
2060      * @internal
2061      * @deprecated This API is ICU internal only.
2062      */
2063     @Deprecated
getRegexEquivalent()2064     public String getRegexEquivalent() {
2065         if (!hasStrings()) {
2066             return toString();
2067         }
2068         StringBuilder result = new StringBuilder("(?:");
2069         appendNewPattern(result, true, false);
2070         for (String s : strings) {
2071             result.append('|');
2072             _appendToPat(result, s, true);
2073         }
2074         return result.append(")").toString();
2075     }
2076 
2077     /**
2078      * Returns true if this set contains none of the characters
2079      * of the given range.
2080      * @param start first character, inclusive, of the range
2081      * @param end last character, inclusive, of the range
2082      * @return true if the test condition is met
2083      * @stable ICU 2.0
2084      */
containsNone(int start, int end)2085     public boolean containsNone(int start, int end) {
2086         if (start < MIN_VALUE || start > MAX_VALUE) {
2087             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
2088         }
2089         if (end < MIN_VALUE || end > MAX_VALUE) {
2090             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
2091         }
2092         int i = -1;
2093         while (true) {
2094             if (start < list[++i]) break;
2095         }
2096         return ((i & 1) == 0 && end < list[i]);
2097     }
2098 
2099     /**
2100      * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
2101      * For example, for the Unicode set [a{bc}{cd}]<br>
2102      * containsNone is true for: "xy", "cb"<br>
2103      * containsNone is false for: "a", "bc", "bcd"<br>
2104      * @param b set to be checked for containment
2105      * @return true if the test condition is met
2106      * @stable ICU 2.0
2107      */
containsNone(UnicodeSet b)2108     public boolean containsNone(UnicodeSet b) {
2109         // The specified set is a subset if some of its pairs overlap with some of this set's pairs.
2110         // This implementation accesses the lists directly for speed.
2111         int[] listB = b.list;
2112         boolean needA = true;
2113         boolean needB = true;
2114         int aPtr = 0;
2115         int bPtr = 0;
2116         int aLen = len - 1;
2117         int bLen = b.len - 1;
2118         int startA = 0, startB = 0, limitA = 0, limitB = 0;
2119         while (true) {
2120             // double iterations are such a pain...
2121             if (needA) {
2122                 if (aPtr >= aLen) {
2123                     // ran out of A: break so we test strings
2124                     break;
2125                 }
2126                 startA = list[aPtr++];
2127                 limitA = list[aPtr++];
2128             }
2129             if (needB) {
2130                 if (bPtr >= bLen) {
2131                     // ran out of B: break so we test strings
2132                     break;
2133                 }
2134                 startB = listB[bPtr++];
2135                 limitB = listB[bPtr++];
2136             }
2137             // if B is higher than any part of A, get new A
2138             if (startB >= limitA) {
2139                 needA = true;
2140                 needB = false;
2141                 continue;
2142             }
2143             // if A is higher than any part of B, get new B
2144             if (startA >= limitB) {
2145                 needA = false;
2146                 needB = true;
2147                 continue;
2148             }
2149             // all other combinations mean we fail
2150             return false;
2151         }
2152 
2153         if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false;
2154         return true;
2155     }
2156 
2157     //    /**
2158     //     * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
2159     //     * For example, for the Unicode set [a{bc}{cd}]<br>
2160     //     * containsNone is true for: "xy", "cb"<br>
2161     //     * containsNone is false for: "a", "bc", "bcd"<br>
2162     //     * @param c set to be checked for containment
2163     //     * @return true if the test condition is met
2164     //     * @stable ICU 2.0
2165     //     */
2166     //    public boolean containsNoneOld(UnicodeSet c) {
2167     //        // The specified set is a subset if all of its pairs are contained in
2168     //        // this set.  It's possible to code this more efficiently in terms of
2169     //        // direct manipulation of the inversion lists if the need arises.
2170     //        int n = c.getRangeCount();
2171     //        for (int i=0; i<n; ++i) {
2172     //            if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
2173     //                return false;
2174     //            }
2175     //        }
2176     //        if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false;
2177     //        return true;
2178     //    }
2179 
2180     /**
2181      * Returns true if this set contains none of the characters
2182      * of the given string.
2183      * @param s string containing characters to be checked for containment
2184      * @return true if the test condition is met
2185      * @stable ICU 2.0
2186      */
containsNone(CharSequence s)2187     public boolean containsNone(CharSequence s) {
2188         return span(s, SpanCondition.NOT_CONTAINED) == s.length();
2189     }
2190 
2191     /**
2192      * Returns true if this set contains one or more of the characters
2193      * in the given range.
2194      * @param start first character, inclusive, of the range
2195      * @param end last character, inclusive, of the range
2196      * @return true if the condition is met
2197      * @stable ICU 2.0
2198      */
containsSome(int start, int end)2199     public final boolean containsSome(int start, int end) {
2200         return !containsNone(start, end);
2201     }
2202 
2203     /**
2204      * Returns true if this set contains one or more of the characters
2205      * and strings of the given set.
2206      * @param s set to be checked for containment
2207      * @return true if the condition is met
2208      * @stable ICU 2.0
2209      */
containsSome(UnicodeSet s)2210     public final boolean containsSome(UnicodeSet s) {
2211         return !containsNone(s);
2212     }
2213 
2214     /**
2215      * Returns true if this set contains one or more of the characters
2216      * of the given string.
2217      * @param s string containing characters to be checked for containment
2218      * @return true if the condition is met
2219      * @stable ICU 2.0
2220      */
containsSome(CharSequence s)2221     public final boolean containsSome(CharSequence s) {
2222         return !containsNone(s);
2223     }
2224 
2225 
2226     /**
2227      * Adds all of the elements in the specified set to this set if
2228      * they're not already present.  This operation effectively
2229      * modifies this set so that its value is the <i>union</i> of the two
2230      * sets.  The behavior of this operation is unspecified if the specified
2231      * collection is modified while the operation is in progress.
2232      *
2233      * @param c set whose elements are to be added to this set.
2234      * @stable ICU 2.0
2235      */
addAll(UnicodeSet c)2236     public UnicodeSet addAll(UnicodeSet c) {
2237         checkFrozen();
2238         add(c.list, c.len, 0);
2239         if (c.hasStrings()) {
2240             if (strings == EMPTY_STRINGS) {
2241                 strings = new TreeSet<>(c.strings);
2242             } else {
2243                 strings.addAll(c.strings);
2244             }
2245         }
2246         return this;
2247     }
2248 
2249     /**
2250      * Retains only the elements in this set that are contained in the
2251      * specified set.  In other words, removes from this set all of
2252      * its elements that are not contained in the specified set.  This
2253      * operation effectively modifies this set so that its value is
2254      * the <i>intersection</i> of the two sets.
2255      *
2256      * @param c set that defines which elements this set will retain.
2257      * @stable ICU 2.0
2258      */
retainAll(UnicodeSet c)2259     public UnicodeSet retainAll(UnicodeSet c) {
2260         checkFrozen();
2261         retain(c.list, c.len, 0);
2262         if (hasStrings()) {
2263             if (!c.hasStrings()) {
2264                 strings.clear();
2265             } else {
2266                 strings.retainAll(c.strings);
2267             }
2268         }
2269         return this;
2270     }
2271 
2272     /**
2273      * Removes from this set all of its elements that are contained in the
2274      * specified set.  This operation effectively modifies this
2275      * set so that its value is the <i>asymmetric set difference</i> of
2276      * the two sets.
2277      *
2278      * @param c set that defines which elements will be removed from
2279      *          this set.
2280      * @stable ICU 2.0
2281      */
removeAll(UnicodeSet c)2282     public UnicodeSet removeAll(UnicodeSet c) {
2283         checkFrozen();
2284         retain(c.list, c.len, 2);
2285         if (hasStrings() && c.hasStrings()) {
2286             strings.removeAll(c.strings);
2287         }
2288         return this;
2289     }
2290 
2291     /**
2292      * Complements in this set all elements contained in the specified
2293      * set.  Any character in the other set will be removed if it is
2294      * in this set, or will be added if it is not in this set.
2295      *
2296      * @param c set that defines which elements will be complemented from
2297      *          this set.
2298      * @stable ICU 2.0
2299      */
complementAll(UnicodeSet c)2300     public UnicodeSet complementAll(UnicodeSet c) {
2301         checkFrozen();
2302         xor(c.list, c.len, 0);
2303         if (c.hasStrings()) {
2304             if (strings == EMPTY_STRINGS) {
2305                 strings = new TreeSet<>(c.strings);
2306             } else {
2307                 SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings);
2308             }
2309         }
2310         return this;
2311     }
2312 
2313     /**
2314      * Removes all of the elements from this set.  This set will be
2315      * empty after this call returns.
2316      * @stable ICU 2.0
2317      */
clear()2318     public UnicodeSet clear() {
2319         checkFrozen();
2320         list[0] = HIGH;
2321         len = 1;
2322         pat = null;
2323         if (hasStrings()) {
2324             strings.clear();
2325         }
2326         return this;
2327     }
2328 
2329     /**
2330      * Iteration method that returns the number of ranges contained in
2331      * this set.
2332      * @see #getRangeStart
2333      * @see #getRangeEnd
2334      * @stable ICU 2.0
2335      */
getRangeCount()2336     public int getRangeCount() {
2337         return len/2;
2338     }
2339 
2340     /**
2341      * Iteration method that returns the first character in the
2342      * specified range of this set.
2343      * @exception ArrayIndexOutOfBoundsException if index is outside
2344      * the range <code>0..getRangeCount()-1</code>
2345      * @see #getRangeCount
2346      * @see #getRangeEnd
2347      * @stable ICU 2.0
2348      */
getRangeStart(int index)2349     public int getRangeStart(int index) {
2350         return list[index*2];
2351     }
2352 
2353     /**
2354      * Iteration method that returns the last character in the
2355      * specified range of this set.
2356      * @exception ArrayIndexOutOfBoundsException if index is outside
2357      * the range <code>0..getRangeCount()-1</code>
2358      * @see #getRangeStart
2359      * @see #getRangeEnd
2360      * @stable ICU 2.0
2361      */
getRangeEnd(int index)2362     public int getRangeEnd(int index) {
2363         return (list[index*2 + 1] - 1);
2364     }
2365 
2366     /**
2367      * Reallocate this objects internal structures to take up the least
2368      * possible space, without changing this object's value.
2369      * @stable ICU 2.0
2370      */
compact()2371     public UnicodeSet compact() {
2372         checkFrozen();
2373         if ((len + 7) < list.length) {
2374             // If we have more than a little unused capacity, shrink it to len.
2375             list = Arrays.copyOf(list, len);
2376         }
2377         rangeList = null;
2378         buffer = null;
2379         if (strings != EMPTY_STRINGS && strings.isEmpty()) {
2380             strings = EMPTY_STRINGS;
2381         }
2382         return this;
2383     }
2384 
2385     /**
2386      * Compares the specified object with this set for equality.  Returns
2387      * <tt>true</tt> if the specified object is also a set, the two sets
2388      * have the same size, and every member of the specified set is
2389      * contained in this set (or equivalently, every member of this set is
2390      * contained in the specified set).
2391      *
2392      * @param o Object to be compared for equality with this set.
2393      * @return <tt>true</tt> if the specified Object is equal to this set.
2394      * @stable ICU 2.0
2395      */
2396     @Override
equals(Object o)2397     public boolean equals(Object o) {
2398         if (o == null) {
2399             return false;
2400         }
2401         if (this == o) {
2402             return true;
2403         }
2404         try {
2405             UnicodeSet that = (UnicodeSet) o;
2406             if (len != that.len) return false;
2407             for (int i = 0; i < len; ++i) {
2408                 if (list[i] != that.list[i]) return false;
2409             }
2410             if (!strings.equals(that.strings)) return false;
2411         } catch (Exception e) {
2412             return false;
2413         }
2414         return true;
2415     }
2416 
2417     /**
2418      * Returns the hash code value for this set.
2419      *
2420      * @return the hash code value for this set.
2421      * @see java.lang.Object#hashCode()
2422      * @stable ICU 2.0
2423      */
2424     @Override
hashCode()2425     public int hashCode() {
2426         int result = len;
2427         for (int i = 0; i < len; ++i) {
2428             result *= 1000003;
2429             result += list[i];
2430         }
2431         return result;
2432     }
2433 
2434     /**
2435      * Return a programmer-readable string representation of this object.
2436      * @stable ICU 2.0
2437      */
2438     @Override
toString()2439     public String toString() {
2440         return toPattern(true);
2441     }
2442 
2443     //----------------------------------------------------------------
2444     // Implementation: Pattern parsing
2445     //----------------------------------------------------------------
2446 
2447     /**
2448      * Parses the given pattern, starting at the given position.  The character
2449      * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
2450      * Parsing continues until the corresponding closing ']'.  If a syntax error
2451      * is encountered between the opening and closing brace, the parse fails.
2452      * Upon return from a successful parse, the ParsePosition is updated to
2453      * point to the character following the closing ']', and an inversion
2454      * list for the parsed pattern is returned.  This method
2455      * calls itself recursively to parse embedded subpatterns.
2456      *
2457      * @param pattern the string containing the pattern to be parsed.  The
2458      * portion of the string from pos.getIndex(), which must be a '[', to the
2459      * corresponding closing ']', is parsed.
2460      * @param pos upon entry, the position at which to being parsing.  The
2461      * character at pattern.charAt(pos.getIndex()) must be a '['.  Upon return
2462      * from a successful parse, pos.getIndex() is either the character after the
2463      * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
2464      * is the last character of the pattern string.
2465      * @return an inversion list for the parsed substring
2466      * of <code>pattern</code>
2467      * @exception java.lang.IllegalArgumentException if the parse fails.
2468      * @internal
2469      * @deprecated This API is ICU internal only.
2470      */
2471     @Deprecated
applyPattern(String pattern, ParsePosition pos, SymbolTable symbols, int options)2472     public UnicodeSet applyPattern(String pattern,
2473             ParsePosition pos,
2474             SymbolTable symbols,
2475             int options) {
2476 
2477         // Need to build the pattern in a temporary string because
2478         // _applyPattern calls add() etc., which set pat to empty.
2479         boolean parsePositionWasNull = pos == null;
2480         if (parsePositionWasNull) {
2481             pos = new ParsePosition(0);
2482         }
2483 
2484         StringBuilder rebuiltPat = new StringBuilder();
2485         RuleCharacterIterator chars =
2486                 new RuleCharacterIterator(pattern, symbols, pos);
2487         applyPattern(chars, symbols, rebuiltPat, options, 0);
2488         if (chars.inVariable()) {
2489             syntaxError(chars, "Extra chars in variable value");
2490         }
2491         pat = rebuiltPat.toString();
2492         if (parsePositionWasNull) {
2493             int i = pos.getIndex();
2494 
2495             // Skip over trailing whitespace
2496             if ((options & IGNORE_SPACE) != 0) {
2497                 i = PatternProps.skipWhiteSpace(pattern, i);
2498             }
2499 
2500             if (i != pattern.length()) {
2501                 throw new IllegalArgumentException("Parse of \"" + pattern +
2502                         "\" failed at " + i);
2503             }
2504         }
2505         return this;
2506     }
2507 
2508     // Add constants to make the applyPattern() code easier to follow.
2509 
2510     private static final int LAST0_START = 0,
2511             LAST1_RANGE = 1,
2512             LAST2_SET = 2;
2513 
2514     private static final int MODE0_NONE = 0,
2515             MODE1_INBRACKET = 1,
2516             MODE2_OUTBRACKET = 2;
2517 
2518     private static final int SETMODE0_NONE = 0,
2519             SETMODE1_UNICODESET = 1,
2520             SETMODE2_PROPERTYPAT = 2,
2521             SETMODE3_PREPARSED = 3;
2522 
2523     private static final int MAX_DEPTH = 100;
2524 
2525     /**
2526      * Parse the pattern from the given RuleCharacterIterator.  The
2527      * iterator is advanced over the parsed pattern.
2528      * @param chars iterator over the pattern characters.  Upon return
2529      * it will be advanced to the first character after the parsed
2530      * pattern, or the end of the iteration if all characters are
2531      * parsed.
2532      * @param symbols symbol table to use to parse and dereference
2533      * variables, or null if none.
2534      * @param rebuiltPat the pattern that was parsed, rebuilt or
2535      * copied from the input pattern, as appropriate.
2536      * @param options a bit mask of zero or more of the following:
2537      * IGNORE_SPACE, CASE.
2538      */
applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options, int depth)2539     private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
2540             Appendable rebuiltPat, int options, int depth) {
2541         if (depth > MAX_DEPTH) {
2542             syntaxError(chars, "Pattern nested too deeply");
2543         }
2544 
2545         // Syntax characters: [ ] ^ - & { }
2546 
2547         // Recognized special forms for chars, sets: c-c s-s s&s
2548 
2549         int opts = RuleCharacterIterator.PARSE_VARIABLES |
2550                 RuleCharacterIterator.PARSE_ESCAPES;
2551         if ((options & IGNORE_SPACE) != 0) {
2552             opts |= RuleCharacterIterator.SKIP_WHITESPACE;
2553         }
2554 
2555         StringBuilder patBuf = new StringBuilder(), buf = null;
2556         boolean usePat = false;
2557         UnicodeSet scratch = null;
2558         Object backup = null;
2559 
2560         // mode: 0=before [, 1=between [...], 2=after ]
2561         // lastItem: 0=none, 1=char, 2=set
2562         int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE;
2563         char op = 0;
2564 
2565         boolean invert = false;
2566 
2567         clear();
2568         String lastString = null;
2569 
2570         while (mode != MODE2_OUTBRACKET && !chars.atEnd()) {
2571             //Eclipse stated the following is "dead code"
2572             /*
2573             if (false) {
2574                 // Debugging assertion
2575                 if (!((lastItem == 0 && op == 0) ||
2576                         (lastItem == 1 && (op == 0 || op == '-')) ||
2577                         (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
2578                     throw new IllegalArgumentException();
2579                 }
2580             }*/
2581 
2582             int c = 0;
2583             boolean literal = false;
2584             UnicodeSet nested = null;
2585 
2586             // -------- Check for property pattern
2587 
2588             // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
2589             int setMode = SETMODE0_NONE;
2590             if (resemblesPropertyPattern(chars, opts)) {
2591                 setMode = SETMODE2_PROPERTYPAT;
2592             }
2593 
2594             // -------- Parse '[' of opening delimiter OR nested set.
2595             // If there is a nested set, use `setMode' to define how
2596             // the set should be parsed.  If the '[' is part of the
2597             // opening delimiter for this pattern, parse special
2598             // strings "[", "[^", "[-", and "[^-".  Check for stand-in
2599             // characters representing a nested set in the symbol
2600             // table.
2601 
2602             else {
2603                 // Prepare to backup if necessary
2604                 backup = chars.getPos(backup);
2605                 c = chars.next(opts);
2606                 literal = chars.isEscaped();
2607 
2608                 if (c == '[' && !literal) {
2609                     if (mode == MODE1_INBRACKET) {
2610                         chars.setPos(backup); // backup
2611                         setMode = SETMODE1_UNICODESET;
2612                     } else {
2613                         // Handle opening '[' delimiter
2614                         mode = MODE1_INBRACKET;
2615                         patBuf.append('[');
2616                         backup = chars.getPos(backup); // prepare to backup
2617                         c = chars.next(opts);
2618                         literal = chars.isEscaped();
2619                         if (c == '^' && !literal) {
2620                             invert = true;
2621                             patBuf.append('^');
2622                             backup = chars.getPos(backup); // prepare to backup
2623                             c = chars.next(opts);
2624                             literal = chars.isEscaped();
2625                         }
2626                         // Fall through to handle special leading '-';
2627                         // otherwise restart loop for nested [], \p{}, etc.
2628                         if (c == '-') {
2629                             literal = true;
2630                             // Fall through to handle literal '-' below
2631                         } else {
2632                             chars.setPos(backup); // backup
2633                             continue;
2634                         }
2635                     }
2636                 } else if (symbols != null) {
2637                     UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
2638                     if (m != null) {
2639                         try {
2640                             nested = (UnicodeSet) m;
2641                             setMode = SETMODE3_PREPARSED;
2642                         } catch (ClassCastException e) {
2643                             syntaxError(chars, "Syntax error");
2644                         }
2645                     }
2646                 }
2647             }
2648 
2649             // -------- Handle a nested set.  This either is inline in
2650             // the pattern or represented by a stand-in that has
2651             // previously been parsed and was looked up in the symbol
2652             // table.
2653 
2654             if (setMode != SETMODE0_NONE) {
2655                 if (lastItem == LAST1_RANGE) {
2656                     if (op != 0) {
2657                         syntaxError(chars, "Char expected after operator");
2658                     }
2659                     add_unchecked(lastChar, lastChar);
2660                     _appendToPat(patBuf, lastChar, false);
2661                     lastItem = LAST0_START;
2662                     op = 0;
2663                 }
2664 
2665                 if (op == '-' || op == '&') {
2666                     patBuf.append(op);
2667                 }
2668 
2669                 if (nested == null) {
2670                     if (scratch == null) scratch = new UnicodeSet();
2671                     nested = scratch;
2672                 }
2673                 switch (setMode) {
2674                 case SETMODE1_UNICODESET:
2675                     nested.applyPattern(chars, symbols, patBuf, options, depth + 1);
2676                     break;
2677                 case SETMODE2_PROPERTYPAT:
2678                     chars.skipIgnored(opts);
2679                     nested.applyPropertyPattern(chars, patBuf, symbols);
2680                     break;
2681                 case SETMODE3_PREPARSED: // `nested' already parsed
2682                     nested._toPattern(patBuf, false);
2683                     break;
2684                 }
2685 
2686                 usePat = true;
2687 
2688                 if (mode == MODE0_NONE) {
2689                     // Entire pattern is a category; leave parse loop
2690                     set(nested);
2691                     mode = MODE2_OUTBRACKET;
2692                     break;
2693                 }
2694 
2695                 switch (op) {
2696                 case '-':
2697                     removeAll(nested);
2698                     break;
2699                 case '&':
2700                     retainAll(nested);
2701                     break;
2702                 case 0:
2703                     addAll(nested);
2704                     break;
2705                 }
2706 
2707                 op = 0;
2708                 lastItem = LAST2_SET;
2709 
2710                 continue;
2711             }
2712 
2713             if (mode == MODE0_NONE) {
2714                 syntaxError(chars, "Missing '['");
2715             }
2716 
2717             // -------- Parse special (syntax) characters.  If the
2718             // current character is not special, or if it is escaped,
2719             // then fall through and handle it below.
2720 
2721             if (!literal) {
2722                 switch (c) {
2723                 case ']':
2724                     if (lastItem == LAST1_RANGE) {
2725                         add_unchecked(lastChar, lastChar);
2726                         _appendToPat(patBuf, lastChar, false);
2727                     }
2728                     // Treat final trailing '-' as a literal
2729                     if (op == '-') {
2730                         add_unchecked(op, op);
2731                         patBuf.append(op);
2732                     } else if (op == '&') {
2733                         syntaxError(chars, "Trailing '&'");
2734                     }
2735                     patBuf.append(']');
2736                     mode = MODE2_OUTBRACKET;
2737                     continue;
2738                 case '-':
2739                     if (op == 0) {
2740                         if (lastItem != LAST0_START) {
2741                             op = (char) c;
2742                             continue;
2743                         } else if (lastString != null) {
2744                             op = (char) c;
2745                             continue;
2746                         } else {
2747                             // Treat final trailing '-' as a literal
2748                             add_unchecked(c, c);
2749                             c = chars.next(opts);
2750                             literal = chars.isEscaped();
2751                             if (c == ']' && !literal) {
2752                                 patBuf.append("-]");
2753                                 mode = MODE2_OUTBRACKET;
2754                                 continue;
2755                             }
2756                         }
2757                     }
2758                     syntaxError(chars, "'-' not after char, string, or set");
2759                     break;
2760                 case '&':
2761                     if (lastItem == LAST2_SET && op == 0) {
2762                         op = (char) c;
2763                         continue;
2764                     }
2765                     syntaxError(chars, "'&' not after set");
2766                     break;
2767                 case '^':
2768                     syntaxError(chars, "'^' not after '['");
2769                     break;
2770                 case '{':
2771                     if (op != 0 && op != '-') {
2772                         syntaxError(chars, "Missing operand after operator");
2773                     }
2774                     if (lastItem == LAST1_RANGE) {
2775                         add_unchecked(lastChar, lastChar);
2776                         _appendToPat(patBuf, lastChar, false);
2777                     }
2778                     lastItem = LAST0_START;
2779                     if (buf == null) {
2780                         buf = new StringBuilder();
2781                     } else {
2782                         buf.setLength(0);
2783                     }
2784                     boolean ok = false;
2785                     while (!chars.atEnd()) {
2786                         c = chars.next(opts);
2787                         literal = chars.isEscaped();
2788                         if (c == '}' && !literal) {
2789                             ok = true;
2790                             break;
2791                         }
2792                         appendCodePoint(buf, c);
2793                     }
2794                     if (!ok) {
2795                         syntaxError(chars, "Invalid multicharacter string");
2796                     }
2797                     // We have new string. Add it to set and continue;
2798                     // we don't need to drop through to the further
2799                     // processing
2800                     String curString = buf.toString();
2801                     if (op == '-') {
2802                         int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString);
2803                         int curSingle = CharSequences.getSingleCodePoint(curString);
2804                         if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) {
2805                             add(lastSingle,curSingle);
2806                         } else {
2807                             if (strings == EMPTY_STRINGS) {
2808                                 strings = new TreeSet<>();
2809                             }
2810                             try {
2811                                 StringRange.expand(lastString, curString, true, strings);
2812                             } catch (Exception e) {
2813                                 syntaxError(chars, e.getMessage());
2814                             }
2815                         }
2816                         lastString = null;
2817                         op = 0;
2818                     } else {
2819                         add(curString);
2820                         lastString = curString;
2821                     }
2822                     patBuf.append('{');
2823                     _appendToPat(patBuf, curString, false);
2824                     patBuf.append('}');
2825                     continue;
2826                 case SymbolTable.SYMBOL_REF:
2827                     //         symbols  nosymbols
2828                     // [a-$]   error    error (ambiguous)
2829                     // [a$]    anchor   anchor
2830                     // [a-$x]  var "x"* literal '$'
2831                     // [a-$.]  error    literal '$'
2832                     // *We won't get here in the case of var "x"
2833                     backup = chars.getPos(backup);
2834                     c = chars.next(opts);
2835                     literal = chars.isEscaped();
2836                     boolean anchor = (c == ']' && !literal);
2837                     if (symbols == null && !anchor) {
2838                         c = SymbolTable.SYMBOL_REF;
2839                         chars.setPos(backup);
2840                         break; // literal '$'
2841                     }
2842                     if (anchor && op == 0) {
2843                         if (lastItem == LAST1_RANGE) {
2844                             add_unchecked(lastChar, lastChar);
2845                             _appendToPat(patBuf, lastChar, false);
2846                         }
2847                         add_unchecked(UnicodeMatcher.ETHER);
2848                         usePat = true;
2849                         patBuf.append(SymbolTable.SYMBOL_REF).append(']');
2850                         mode = MODE2_OUTBRACKET;
2851                         continue;
2852                     }
2853                     syntaxError(chars, "Unquoted '$'");
2854                     break;
2855                 default:
2856                     break;
2857                 }
2858             }
2859 
2860             // -------- Parse literal characters.  This includes both
2861             // escaped chars ("\u4E01") and non-syntax characters
2862             // ("a").
2863 
2864             switch (lastItem) {
2865             case LAST0_START:
2866                 if (op == '-' && lastString != null) {
2867                     syntaxError(chars, "Invalid range");
2868                 }
2869                 lastItem = LAST1_RANGE;
2870                 lastChar = c;
2871                 lastString = null;
2872                 break;
2873             case LAST1_RANGE:
2874                 if (op == '-') {
2875                     if (lastString != null) {
2876                         syntaxError(chars, "Invalid range");
2877                     }
2878                     if (lastChar >= c) {
2879                         // Don't allow redundant (a-a) or empty (b-a) ranges;
2880                         // these are most likely typos.
2881                         syntaxError(chars, "Invalid range");
2882                     }
2883                     add_unchecked(lastChar, c);
2884                     _appendToPat(patBuf, lastChar, false);
2885                     patBuf.append(op);
2886                     _appendToPat(patBuf, c, false);
2887                     lastItem = LAST0_START;
2888                     op = 0;
2889                 } else {
2890                     add_unchecked(lastChar, lastChar);
2891                     _appendToPat(patBuf, lastChar, false);
2892                     lastChar = c;
2893                 }
2894                 break;
2895             case LAST2_SET:
2896                 if (op != 0) {
2897                     syntaxError(chars, "Set expected after operator");
2898                 }
2899                 lastChar = c;
2900                 lastItem = LAST1_RANGE;
2901                 break;
2902             }
2903         }
2904 
2905         if (mode != MODE2_OUTBRACKET) {
2906             syntaxError(chars, "Missing ']'");
2907         }
2908 
2909         chars.skipIgnored(opts);
2910 
2911         /**
2912          * Handle global flags (invert, case insensitivity).  If this
2913          * pattern should be compiled case-insensitive, then we need
2914          * to close over case BEFORE COMPLEMENTING.  This makes
2915          * patterns like /[^abc]/i work.
2916          */
2917         if ((options & CASE) != 0) {
2918             closeOver(CASE);
2919         }
2920         if (invert) {
2921             complement();
2922         }
2923 
2924         // Use the rebuilt pattern (pat) only if necessary.  Prefer the
2925         // generated pattern.
2926         if (usePat) {
2927             append(rebuiltPat, patBuf.toString());
2928         } else {
2929             appendNewPattern(rebuiltPat, false, true);
2930         }
2931     }
2932 
syntaxError(RuleCharacterIterator chars, String msg)2933     private static void syntaxError(RuleCharacterIterator chars, String msg) {
2934         throw new IllegalArgumentException("Error: " + msg + " at \"" +
2935                 Utility.escape(chars.toString()) +
2936                 '"');
2937     }
2938 
2939     /**
2940      * Add the contents of the UnicodeSet (as strings) into a collection.
2941      * @param target collection to add into
2942      * @stable ICU 4.4
2943      */
addAllTo(T target)2944     public <T extends Collection<String>> T addAllTo(T target) {
2945         return addAllTo(this, target);
2946     }
2947 
2948 
2949     /**
2950      * Add the contents of the UnicodeSet (as strings) into a collection.
2951      * @param target collection to add into
2952      * @stable ICU 4.4
2953      */
addAllTo(String[] target)2954     public String[] addAllTo(String[] target) {
2955         return addAllTo(this, target);
2956     }
2957 
2958     /**
2959      * Add the contents of the UnicodeSet (as strings) into an array.
2960      * @stable ICU 4.4
2961      */
toArray(UnicodeSet set)2962     public static String[] toArray(UnicodeSet set) {
2963         return addAllTo(set, new String[set.size()]);
2964     }
2965 
2966     /**
2967      * Add the contents of the collection (as strings) into this UnicodeSet.
2968      * The collection must not contain null.
2969      * @param source the collection to add
2970      * @return a reference to this object
2971      * @stable ICU 4.4
2972      */
add(Iterable<?> source)2973     public UnicodeSet add(Iterable<?> source) {
2974         return addAll(source);
2975     }
2976 
2977     /**
2978      * Add a collection (as strings) into this UnicodeSet.
2979      * Uses standard naming convention.
2980      * @param source collection to add into
2981      * @return a reference to this object
2982      * @stable ICU 4.4
2983      */
addAll(Iterable<?> source)2984     public UnicodeSet addAll(Iterable<?> source) {
2985         checkFrozen();
2986         for (Object o : source) {
2987             add(o.toString());
2988         }
2989         return this;
2990     }
2991 
2992     //----------------------------------------------------------------
2993     // Implementation: Utility methods
2994     //----------------------------------------------------------------
2995 
nextCapacity(int minCapacity)2996     private int nextCapacity(int minCapacity) {
2997         // Grow exponentially to reduce the frequency of allocations.
2998         if (minCapacity < INITIAL_CAPACITY) {
2999             return minCapacity + INITIAL_CAPACITY;
3000         } else if (minCapacity <= 2500) {
3001             return 5 * minCapacity;
3002         } else {
3003             int newCapacity = 2 * minCapacity;
3004             if (newCapacity > MAX_LENGTH) {
3005                 newCapacity = MAX_LENGTH;
3006             }
3007             return newCapacity;
3008         }
3009     }
3010 
ensureCapacity(int newLen)3011     private void ensureCapacity(int newLen) {
3012         if (newLen > MAX_LENGTH) {
3013             newLen = MAX_LENGTH;
3014         }
3015         if (newLen <= list.length) return;
3016         int newCapacity = nextCapacity(newLen);
3017         int[] temp = new int[newCapacity];
3018         // Copy only the actual contents.
3019         System.arraycopy(list, 0, temp, 0, len);
3020         list = temp;
3021     }
3022 
ensureBufferCapacity(int newLen)3023     private void ensureBufferCapacity(int newLen) {
3024         if (newLen > MAX_LENGTH) {
3025             newLen = MAX_LENGTH;
3026         }
3027         if (buffer != null && newLen <= buffer.length) return;
3028         int newCapacity = nextCapacity(newLen);
3029         buffer = new int[newCapacity];
3030         // The buffer has no contents to be copied.
3031         // It is always filled from scratch after this call.
3032     }
3033 
3034     /**
3035      * Assumes start <= end.
3036      */
range(int start, int end)3037     private int[] range(int start, int end) {
3038         if (rangeList == null) {
3039             rangeList = new int[] { start, end+1, HIGH };
3040         } else {
3041             rangeList[0] = start;
3042             rangeList[1] = end+1;
3043         }
3044         return rangeList;
3045     }
3046 
3047     //----------------------------------------------------------------
3048     // Implementation: Fundamental operations
3049     //----------------------------------------------------------------
3050 
3051     // polarity = 0, 3 is normal: x xor y
3052     // polarity = 1, 2: x xor ~y == x === y
3053 
xor(int[] other, int otherLen, int polarity)3054     private UnicodeSet xor(int[] other, int otherLen, int polarity) {
3055         ensureBufferCapacity(len + otherLen);
3056         int i = 0, j = 0, k = 0;
3057         int a = list[i++];
3058         int b;
3059         // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used
3060         //      so the following if statement will not be called.
3061         ///CLOVER:OFF
3062         if (polarity == 1 || polarity == 2) {
3063             b = LOW;
3064             if (other[j] == LOW) { // skip base if already LOW
3065                 ++j;
3066                 b = other[j];
3067             }
3068             ///CLOVER:ON
3069         } else {
3070             b = other[j++];
3071         }
3072         // simplest of all the routines
3073         // sort the values, discarding identicals!
3074         while (true) {
3075             if (a < b) {
3076                 buffer[k++] = a;
3077                 a = list[i++];
3078             } else if (b < a) {
3079                 buffer[k++] = b;
3080                 b = other[j++];
3081             } else if (a != HIGH) { // at this point, a == b
3082                 // discard both values!
3083                 a = list[i++];
3084                 b = other[j++];
3085             } else { // DONE!
3086                 buffer[k++] = HIGH;
3087                 len = k;
3088                 break;
3089             }
3090         }
3091         // swap list and buffer
3092         int[] temp = list;
3093         list = buffer;
3094         buffer = temp;
3095         pat = null;
3096         return this;
3097     }
3098 
3099     // polarity = 0 is normal: x union y
3100     // polarity = 2: x union ~y
3101     // polarity = 1: ~x union y
3102     // polarity = 3: ~x union ~y
3103 
add(int[] other, int otherLen, int polarity)3104     private UnicodeSet add(int[] other, int otherLen, int polarity) {
3105         ensureBufferCapacity(len + otherLen);
3106         int i = 0, j = 0, k = 0;
3107         int a = list[i++];
3108         int b = other[j++];
3109         // change from xor is that we have to check overlapping pairs
3110         // polarity bit 1 means a is second, bit 2 means b is.
3111         main:
3112             while (true) {
3113                 switch (polarity) {
3114                 case 0: // both first; take lower if unequal
3115                     if (a < b) { // take a
3116                         // Back up over overlapping ranges in buffer[]
3117                         if (k > 0 && a <= buffer[k-1]) {
3118                             // Pick latter end value in buffer[] vs. list[]
3119                             a = max(list[i], buffer[--k]);
3120                         } else {
3121                             // No overlap
3122                             buffer[k++] = a;
3123                             a = list[i];
3124                         }
3125                         i++; // Common if/else code factored out
3126                         polarity ^= 1;
3127                     } else if (b < a) { // take b
3128                         if (k > 0 && b <= buffer[k-1]) {
3129                             b = max(other[j], buffer[--k]);
3130                         } else {
3131                             buffer[k++] = b;
3132                             b = other[j];
3133                         }
3134                         j++;
3135                         polarity ^= 2;
3136                     } else { // a == b, take a, drop b
3137                         if (a == HIGH) break main;
3138                         // This is symmetrical; it doesn't matter if
3139                         // we backtrack with a or b. - liu
3140                         if (k > 0 && a <= buffer[k-1]) {
3141                             a = max(list[i], buffer[--k]);
3142                         } else {
3143                             // No overlap
3144                             buffer[k++] = a;
3145                             a = list[i];
3146                         }
3147                         i++;
3148                         polarity ^= 1;
3149                         b = other[j++]; polarity ^= 2;
3150                     }
3151                     break;
3152                 case 3: // both second; take higher if unequal, and drop other
3153                     if (b <= a) { // take a
3154                         if (a == HIGH) break main;
3155                         buffer[k++] = a;
3156                     } else { // take b
3157                         if (b == HIGH) break main;
3158                         buffer[k++] = b;
3159                     }
3160                     a = list[i++]; polarity ^= 1;   // factored common code
3161                     b = other[j++]; polarity ^= 2;
3162                     break;
3163                 case 1: // a second, b first; if b < a, overlap
3164                     if (a < b) { // no overlap, take a
3165                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3166                     } else if (b < a) { // OVERLAP, drop b
3167                         b = other[j++]; polarity ^= 2;
3168                     } else { // a == b, drop both!
3169                         if (a == HIGH) break main;
3170                         a = list[i++]; polarity ^= 1;
3171                         b = other[j++]; polarity ^= 2;
3172                     }
3173                     break;
3174                 case 2: // a first, b second; if a < b, overlap
3175                     if (b < a) { // no overlap, take b
3176                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
3177                     } else  if (a < b) { // OVERLAP, drop a
3178                         a = list[i++]; polarity ^= 1;
3179                     } else { // a == b, drop both!
3180                         if (a == HIGH) break main;
3181                         a = list[i++]; polarity ^= 1;
3182                         b = other[j++]; polarity ^= 2;
3183                     }
3184                     break;
3185                 }
3186             }
3187         buffer[k++] = HIGH;    // terminate
3188         len = k;
3189         // swap list and buffer
3190         int[] temp = list;
3191         list = buffer;
3192         buffer = temp;
3193         pat = null;
3194         return this;
3195     }
3196 
3197     // polarity = 0 is normal: x intersect y
3198     // polarity = 2: x intersect ~y == set-minus
3199     // polarity = 1: ~x intersect y
3200     // polarity = 3: ~x intersect ~y
3201 
retain(int[] other, int otherLen, int polarity)3202     private UnicodeSet retain(int[] other, int otherLen, int polarity) {
3203         ensureBufferCapacity(len + otherLen);
3204         int i = 0, j = 0, k = 0;
3205         int a = list[i++];
3206         int b = other[j++];
3207         // change from xor is that we have to check overlapping pairs
3208         // polarity bit 1 means a is second, bit 2 means b is.
3209         main:
3210             while (true) {
3211                 switch (polarity) {
3212                 case 0: // both first; drop the smaller
3213                     if (a < b) { // drop a
3214                         a = list[i++]; polarity ^= 1;
3215                     } else if (b < a) { // drop b
3216                         b = other[j++]; polarity ^= 2;
3217                     } else { // a == b, take one, drop other
3218                         if (a == HIGH) break main;
3219                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3220                         b = other[j++]; polarity ^= 2;
3221                     }
3222                     break;
3223                 case 3: // both second; take lower if unequal
3224                     if (a < b) { // take a
3225                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3226                     } else if (b < a) { // take b
3227                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
3228                     } else { // a == b, take one, drop other
3229                         if (a == HIGH) break main;
3230                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3231                         b = other[j++]; polarity ^= 2;
3232                     }
3233                     break;
3234                 case 1: // a second, b first;
3235                     if (a < b) { // NO OVERLAP, drop a
3236                         a = list[i++]; polarity ^= 1;
3237                     } else if (b < a) { // OVERLAP, take b
3238                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
3239                     } else { // a == b, drop both!
3240                         if (a == HIGH) break main;
3241                         a = list[i++]; polarity ^= 1;
3242                         b = other[j++]; polarity ^= 2;
3243                     }
3244                     break;
3245                 case 2: // a first, b second; if a < b, overlap
3246                     if (b < a) { // no overlap, drop b
3247                         b = other[j++]; polarity ^= 2;
3248                     } else  if (a < b) { // OVERLAP, take a
3249                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3250                     } else { // a == b, drop both!
3251                         if (a == HIGH) break main;
3252                         a = list[i++]; polarity ^= 1;
3253                         b = other[j++]; polarity ^= 2;
3254                     }
3255                     break;
3256                 }
3257             }
3258         buffer[k++] = HIGH;    // terminate
3259         len = k;
3260         // swap list and buffer
3261         int[] temp = list;
3262         list = buffer;
3263         buffer = temp;
3264         pat = null;
3265         return this;
3266     }
3267 
max(int a, int b)3268     private static final int max(int a, int b) {
3269         return (a > b) ? a : b;
3270     }
3271 
3272     //----------------------------------------------------------------
3273     // Generic filter-based scanning code
3274     //----------------------------------------------------------------
3275 
3276     private static interface Filter {
contains(int codePoint)3277         boolean contains(int codePoint);
3278     }
3279 
3280     private static final class NumericValueFilter implements Filter {
3281         double value;
NumericValueFilter(double value)3282         NumericValueFilter(double value) { this.value = value; }
3283         @Override
contains(int ch)3284         public boolean contains(int ch) {
3285             return UCharacter.getUnicodeNumericValue(ch) == value;
3286         }
3287     }
3288 
3289     private static final class GeneralCategoryMaskFilter implements Filter {
3290         int mask;
GeneralCategoryMaskFilter(int mask)3291         GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
3292         @Override
contains(int ch)3293         public boolean contains(int ch) {
3294             return ((1 << UCharacter.getType(ch)) & mask) != 0;
3295         }
3296     }
3297 
3298     private static final class IntPropertyFilter implements Filter {
3299         int prop;
3300         int value;
IntPropertyFilter(int prop, int value)3301         IntPropertyFilter(int prop, int value) {
3302             this.prop = prop;
3303             this.value = value;
3304         }
3305         @Override
contains(int ch)3306         public boolean contains(int ch) {
3307             return UCharacter.getIntPropertyValue(ch, prop) == value;
3308         }
3309     }
3310 
3311     private static final class ScriptExtensionsFilter implements Filter {
3312         int script;
ScriptExtensionsFilter(int script)3313         ScriptExtensionsFilter(int script) { this.script = script; }
3314         @Override
contains(int c)3315         public boolean contains(int c) {
3316             return UScript.hasScript(c, script);
3317         }
3318     }
3319 
3320     // VersionInfo for unassigned characters
3321     private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
3322 
3323     private static final class VersionFilter implements Filter {
3324         VersionInfo version;
VersionFilter(VersionInfo version)3325         VersionFilter(VersionInfo version) { this.version = version; }
3326         @Override
contains(int ch)3327         public boolean contains(int ch) {
3328             VersionInfo v = UCharacter.getAge(ch);
3329             // Reference comparison ok; VersionInfo caches and reuses
3330             // unique objects.
3331             return !Utility.sameObjects(v, NO_VERSION) &&
3332                     v.compareTo(version) <= 0;
3333         }
3334     }
3335 
3336     /**
3337      * Generic filter-based scanning code for UCD property UnicodeSets.
3338      */
applyFilter(Filter filter, UnicodeSet inclusions)3339     private void applyFilter(Filter filter, UnicodeSet inclusions) {
3340         // Logically, walk through all Unicode characters, noting the start
3341         // and end of each range for which filter.contain(c) is
3342         // true.  Add each range to a set.
3343         //
3344         // To improve performance, use an inclusions set which
3345         // encodes information about character ranges that are known
3346         // to have identical properties.
3347         // inclusions contains the first characters of
3348         // same-value ranges for the given property.
3349 
3350         clear();
3351 
3352         int startHasProperty = -1;
3353         int limitRange = inclusions.getRangeCount();
3354 
3355         for (int j=0; j<limitRange; ++j) {
3356             // get current range
3357             int start = inclusions.getRangeStart(j);
3358             int end = inclusions.getRangeEnd(j);
3359 
3360             // for all the code points in the range, process
3361             for (int ch = start; ch <= end; ++ch) {
3362                 // only add to the unicodeset on inflection points --
3363                 // where the hasProperty value changes to false
3364                 if (filter.contains(ch)) {
3365                     if (startHasProperty < 0) {
3366                         startHasProperty = ch;
3367                     }
3368                 } else if (startHasProperty >= 0) {
3369                     add_unchecked(startHasProperty, ch-1);
3370                     startHasProperty = -1;
3371                 }
3372             }
3373         }
3374         if (startHasProperty >= 0) {
3375             add_unchecked(startHasProperty, 0x10FFFF);
3376         }
3377     }
3378 
3379     /**
3380      * Remove leading and trailing Pattern_White_Space and compress
3381      * internal Pattern_White_Space to a single space character.
3382      */
mungeCharName(String source)3383     private static String mungeCharName(String source) {
3384         source = PatternProps.trimWhiteSpace(source);
3385         StringBuilder buf = null;
3386         for (int i=0; i<source.length(); ++i) {
3387             char ch = source.charAt(i);
3388             if (PatternProps.isWhiteSpace(ch)) {
3389                 if (buf == null) {
3390                     buf = new StringBuilder().append(source, 0, i);
3391                 } else if (buf.charAt(buf.length() - 1) == ' ') {
3392                     continue;
3393                 }
3394                 ch = ' '; // convert to ' '
3395             }
3396             if (buf != null) {
3397                 buf.append(ch);
3398             }
3399         }
3400         return buf == null ? source : buf.toString();
3401     }
3402 
3403     //----------------------------------------------------------------
3404     // Property set API
3405     //----------------------------------------------------------------
3406 
3407     /**
3408      * Modifies this set to contain those code points which have the
3409      * given value for the given binary or enumerated property, as
3410      * returned by UCharacter.getIntPropertyValue.  Prior contents of
3411      * this set are lost.
3412      *
3413      * @param prop a property in the range
3414      * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or
3415      * UProperty.INT_START..UProperty.INT_LIMIT-1 or.
3416      * UProperty.MASK_START..UProperty.MASK_LIMIT-1.
3417      *
3418      * @param value a value in the range
3419      * UCharacter.getIntPropertyMinValue(prop)..
3420      * UCharacter.getIntPropertyMaxValue(prop), with one exception.
3421      * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be
3422      * a UCharacter.getType() result, but rather a mask value produced
3423      * by logically ORing (1 &lt;&lt; UCharacter.getType()) values together.
3424      * This allows grouped categories such as [:L:] to be represented.
3425      *
3426      * @return a reference to this set
3427      *
3428      * @stable ICU 2.4
3429      */
applyIntPropertyValue(int prop, int value)3430     public UnicodeSet applyIntPropertyValue(int prop, int value) {
3431         // All of the following include checkFrozen() before modifying this set.
3432         if (prop == UProperty.GENERAL_CATEGORY_MASK) {
3433             UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
3434             applyFilter(new GeneralCategoryMaskFilter(value), inclusions);
3435         } else if (prop == UProperty.SCRIPT_EXTENSIONS) {
3436             UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
3437             applyFilter(new ScriptExtensionsFilter(value), inclusions);
3438         } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) {
3439             if (value == 0 || value == 1) {
3440                 set(CharacterProperties.getBinaryPropertySet(prop));
3441                 if (value == 0) {
3442                     complement();
3443                 }
3444             } else {
3445                 clear();
3446             }
3447         } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) {
3448             UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
3449             applyFilter(new IntPropertyFilter(prop, value), inclusions);
3450         } else {
3451             throw new IllegalArgumentException("unsupported property " + prop);
3452         }
3453         return this;
3454     }
3455 
3456 
3457 
3458     /**
3459      * Modifies this set to contain those code points which have the
3460      * given value for the given property.  Prior contents of this
3461      * set are lost.
3462      *
3463      * @param propertyAlias a property alias, either short or long.
3464      * The name is matched loosely.  See PropertyAliases.txt for names
3465      * and a description of loose matching.  If the value string is
3466      * empty, then this string is interpreted as either a
3467      * General_Category value alias, a Script value alias, a binary
3468      * property alias, or a special ID.  Special IDs are matched
3469      * loosely and correspond to the following sets:
3470      *
3471      * "ANY" = [\\u0000-\\U0010FFFF],
3472      * "ASCII" = [\\u0000-\\u007F].
3473      *
3474      * @param valueAlias a value alias, either short or long.  The
3475      * name is matched loosely.  See PropertyValueAliases.txt for
3476      * names and a description of loose matching.  In addition to
3477      * aliases listed, numeric values and canonical combining classes
3478      * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc",
3479      * "220").  The value string may also be empty.
3480      *
3481      * @return a reference to this set
3482      *
3483      * @stable ICU 2.4
3484      */
applyPropertyAlias(String propertyAlias, String valueAlias)3485     public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) {
3486         return applyPropertyAlias(propertyAlias, valueAlias, null);
3487     }
3488 
3489     /**
3490      * Modifies this set to contain those code points which have the
3491      * given value for the given property.  Prior contents of this
3492      * set are lost.
3493      * @param propertyAlias A string of the property alias.
3494      * @param valueAlias A string of the value alias.
3495      * @param symbols if not null, then symbols are first called to see if a property
3496      * is available. If true, then everything else is skipped.
3497      * @return this set
3498      * @stable ICU 3.2
3499      */
applyPropertyAlias(String propertyAlias, String valueAlias, SymbolTable symbols)3500     public UnicodeSet applyPropertyAlias(String propertyAlias,
3501             String valueAlias, SymbolTable symbols) {
3502         checkFrozen();
3503         int p;
3504         int v;
3505         boolean invert = false;
3506 
3507         if (symbols != null
3508                 && (symbols instanceof XSymbolTable)
3509                 && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
3510             return this;
3511         }
3512 
3513         if (XSYMBOL_TABLE != null) {
3514             if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) {
3515                 return this;
3516             }
3517         }
3518 
3519         if (valueAlias.length() > 0) {
3520             p = UCharacter.getPropertyEnum(propertyAlias);
3521 
3522             // Treat gc as gcm
3523             if (p == UProperty.GENERAL_CATEGORY) {
3524                 p = UProperty.GENERAL_CATEGORY_MASK;
3525             }
3526 
3527             if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) ||
3528                     (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) ||
3529                     (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
3530                 try {
3531                     v = UCharacter.getPropertyValueEnum(p, valueAlias);
3532                 } catch (IllegalArgumentException e) {
3533                     // Handle numeric CCC
3534                     if (p == UProperty.CANONICAL_COMBINING_CLASS ||
3535                             p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
3536                             p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
3537                         v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias));
3538                         // Anything between 0 and 255 is valid even if unused.
3539                         if (v < 0 || v > 255) throw e;
3540                     } else {
3541                         throw e;
3542                     }
3543                 }
3544             }
3545 
3546             else {
3547                 switch (p) {
3548                 case UProperty.NUMERIC_VALUE:
3549                 {
3550                     double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
3551                     applyFilter(new NumericValueFilter(value),
3552                             CharacterPropertiesImpl.getInclusionsForProperty(p));
3553                     return this;
3554                 }
3555                 case UProperty.NAME:
3556                 {
3557                     // Must munge name, since
3558                     // UCharacter.charFromName() does not do
3559                     // 'loose' matching.
3560                     String buf = mungeCharName(valueAlias);
3561                     int ch = UCharacter.getCharFromExtendedName(buf);
3562                     if (ch == -1) {
3563                         throw new IllegalArgumentException("Invalid character name");
3564                     }
3565                     clear();
3566                     add_unchecked(ch);
3567                     return this;
3568                 }
3569                 case UProperty.UNICODE_1_NAME:
3570                     // ICU 49 deprecates the Unicode_1_Name property APIs.
3571                     throw new IllegalArgumentException("Unicode_1_Name (na1) not supported");
3572                 case UProperty.AGE:
3573                 {
3574                     // Must munge name, since
3575                     // VersionInfo.getInstance() does not do
3576                     // 'loose' matching.
3577                     VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
3578                     applyFilter(new VersionFilter(version),
3579                             CharacterPropertiesImpl.getInclusionsForProperty(p));
3580                     return this;
3581                 }
3582                 case UProperty.SCRIPT_EXTENSIONS:
3583                     v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias);
3584                     // fall through to calling applyIntPropertyValue()
3585                     break;
3586                 default:
3587                     // p is a non-binary, non-enumerated property that we
3588                     // don't support (yet).
3589                     throw new IllegalArgumentException("Unsupported property");
3590                 }
3591             }
3592         }
3593 
3594         else {
3595             // valueAlias is empty.  Interpret as General Category, Script,
3596             // Binary property, or ANY or ASCII.  Upon success, p and v will
3597             // be set.
3598             UPropertyAliases pnames = UPropertyAliases.INSTANCE;
3599             p = UProperty.GENERAL_CATEGORY_MASK;
3600             v = pnames.getPropertyValueEnum(p, propertyAlias);
3601             if (v == UProperty.UNDEFINED) {
3602                 p = UProperty.SCRIPT;
3603                 v = pnames.getPropertyValueEnum(p, propertyAlias);
3604                 if (v == UProperty.UNDEFINED) {
3605                     p = pnames.getPropertyEnum(propertyAlias);
3606                     if (p == UProperty.UNDEFINED) {
3607                         p = -1;
3608                     }
3609                     if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) {
3610                         v = 1;
3611                     } else if (p == -1) {
3612                         if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) {
3613                             set(MIN_VALUE, MAX_VALUE);
3614                             return this;
3615                         } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) {
3616                             set(0, 0x7F);
3617                             return this;
3618                         } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) {
3619                             // [:Assigned:]=[:^Cn:]
3620                             p = UProperty.GENERAL_CATEGORY_MASK;
3621                             v = (1<<UCharacter.UNASSIGNED);
3622                             invert = true;
3623                         } else {
3624                             // Property name was never matched.
3625                             throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias);
3626                         }
3627                     } else {
3628                         // Valid propery name, but it isn't binary, so the value
3629                         // must be supplied.
3630                         throw new IllegalArgumentException("Missing property value");
3631                     }
3632                 }
3633             }
3634         }
3635 
3636         applyIntPropertyValue(p, v);
3637         if(invert) {
3638             complement();
3639         }
3640 
3641         return this;
3642     }
3643 
3644     //----------------------------------------------------------------
3645     // Property set patterns
3646     //----------------------------------------------------------------
3647 
3648     /**
3649      * Return true if the given position, in the given pattern, appears
3650      * to be the start of a property set pattern.
3651      */
resemblesPropertyPattern(String pattern, int pos)3652     private static boolean resemblesPropertyPattern(String pattern, int pos) {
3653         // Patterns are at least 5 characters long
3654         if ((pos+5) > pattern.length()) {
3655             return false;
3656         }
3657 
3658         // Look for an opening [:, [:^, \p, or \P
3659         return pattern.regionMatches(pos, "[:", 0, 2) ||
3660                 pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3661                 pattern.regionMatches(pos, "\\N", 0, 2);
3662     }
3663 
3664     /**
3665      * Return true if the given iterator appears to point at a
3666      * property pattern.  Regardless of the result, return with the
3667      * iterator unchanged.
3668      * @param chars iterator over the pattern characters.  Upon return
3669      * it will be unchanged.
3670      * @param iterOpts RuleCharacterIterator options
3671      */
resemblesPropertyPattern(RuleCharacterIterator chars, int iterOpts)3672     private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
3673             int iterOpts) {
3674         boolean result = false;
3675         iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
3676         Object pos = chars.getPos(null);
3677         int c = chars.next(iterOpts);
3678         if (c == '[' || c == '\\') {
3679             int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
3680             result = (c == '[') ? (d == ':') :
3681                 (d == 'N' || d == 'p' || d == 'P');
3682         }
3683         chars.setPos(pos);
3684         return result;
3685     }
3686 
3687     /**
3688      * Parse the given property pattern at the given parse position.
3689      * @param symbols TODO
3690      */
applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols)3691     private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
3692         int pos = ppos.getIndex();
3693 
3694         // On entry, ppos should point to one of the following locations:
3695 
3696         // Minimum length is 5 characters, e.g. \p{L}
3697         if ((pos+5) > pattern.length()) {
3698             return null;
3699         }
3700 
3701         boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
3702         boolean isName = false; // true for \N{pat}, o/w false
3703         boolean invert = false;
3704 
3705         // Look for an opening [:, [:^, \p, or \P
3706         if (pattern.regionMatches(pos, "[:", 0, 2)) {
3707             posix = true;
3708             pos = PatternProps.skipWhiteSpace(pattern, (pos+2));
3709             if (pos < pattern.length() && pattern.charAt(pos) == '^') {
3710                 ++pos;
3711                 invert = true;
3712             }
3713         } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3714                 pattern.regionMatches(pos, "\\N", 0, 2)) {
3715             char c = pattern.charAt(pos+1);
3716             invert = (c == 'P');
3717             isName = (c == 'N');
3718             pos = PatternProps.skipWhiteSpace(pattern, (pos+2));
3719             if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
3720                 // Syntax error; "\p" or "\P" not followed by "{"
3721                 return null;
3722             }
3723         } else {
3724             // Open delimiter not seen
3725             return null;
3726         }
3727 
3728         // Look for the matching close delimiter, either :] or }
3729         int close = pattern.indexOf(posix ? ":]" : "}", pos);
3730         if (close < 0) {
3731             // Syntax error; close delimiter missing
3732             return null;
3733         }
3734 
3735         // Look for an '=' sign.  If this is present, we will parse a
3736         // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
3737         // pattern.
3738         int equals = pattern.indexOf('=', pos);
3739         String propName, valueName;
3740         if (equals >= 0 && equals < close && !isName) {
3741             // Equals seen; parse medium/long pattern
3742             propName = pattern.substring(pos, equals);
3743             valueName = pattern.substring(equals+1, close);
3744         }
3745 
3746         else {
3747             // Handle case where no '=' is seen, and \N{}
3748             propName = pattern.substring(pos, close);
3749             valueName = "";
3750 
3751             // Handle \N{name}
3752             if (isName) {
3753                 // This is a little inefficient since it means we have to
3754                 // parse "na" back to UProperty.NAME even though we already
3755                 // know it's UProperty.NAME.  If we refactor the API to
3756                 // support args of (int, String) then we can remove
3757                 // "na" and make this a little more efficient.
3758                 valueName = propName;
3759                 propName = "na";
3760             }
3761         }
3762 
3763         applyPropertyAlias(propName, valueName, symbols);
3764 
3765         if (invert) {
3766             complement();
3767         }
3768 
3769         // Move to the limit position after the close delimiter
3770         ppos.setIndex(close + (posix ? 2 : 1));
3771 
3772         return this;
3773     }
3774 
3775     /**
3776      * Parse a property pattern.
3777      * @param chars iterator over the pattern characters.  Upon return
3778      * it will be advanced to the first character after the parsed
3779      * pattern, or the end of the iteration if all characters are
3780      * parsed.
3781      * @param rebuiltPat the pattern that was parsed, rebuilt or
3782      * copied from the input pattern, as appropriate.
3783      * @param symbols TODO
3784      */
applyPropertyPattern(RuleCharacterIterator chars, Appendable rebuiltPat, SymbolTable symbols)3785     private void applyPropertyPattern(RuleCharacterIterator chars,
3786             Appendable rebuiltPat, SymbolTable symbols) {
3787         String patStr = chars.lookahead();
3788         ParsePosition pos = new ParsePosition(0);
3789         applyPropertyPattern(patStr, pos, symbols);
3790         if (pos.getIndex() == 0) {
3791             syntaxError(chars, "Invalid property pattern");
3792         }
3793         chars.jumpahead(pos.getIndex());
3794         append(rebuiltPat, patStr.substring(0, pos.getIndex()));
3795     }
3796 
3797     //----------------------------------------------------------------
3798     // Case folding API
3799     //----------------------------------------------------------------
3800 
3801     /**
3802      * Bitmask for constructor and applyPattern() indicating that
3803      * white space should be ignored.  If set, ignore Unicode Pattern_White_Space characters,
3804      * unless they are quoted or escaped.  This may be ORed together
3805      * with other selectors.
3806      * @stable ICU 3.8
3807      */
3808     public static final int IGNORE_SPACE = 1;
3809 
3810     /**
3811      * Bitmask for constructor, applyPattern(), and closeOver()
3812      * indicating letter case.  This may be ORed together with other
3813      * selectors.
3814      *
3815      * Enable case insensitive matching.  E.g., "[ab]" with this flag
3816      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
3817      * match all except 'a', 'A', 'b', and 'B'. This performs a full
3818      * closure over case mappings, e.g. U+017F for s.
3819      *
3820      * The resulting set is a superset of the input for the code points but
3821      * not for the strings.
3822      * It performs a case mapping closure of the code points and adds
3823      * full case folding strings for the code points, and reduces strings of
3824      * the original set to their full case folding equivalents.
3825      *
3826      * This is designed for case-insensitive matches, for example
3827      * in regular expressions. The full code point case closure allows checking of
3828      * an input character directly against the closure set.
3829      * Strings are matched by comparing the case-folded form from the closure
3830      * set with an incremental case folding of the string in question.
3831      *
3832      * The closure set will also contain single code points if the original
3833      * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
3834      * This is not necessary (that is, redundant) for the above matching method
3835      * but results in the same closure sets regardless of whether the original
3836      * set contained the code point or a string.
3837      * @stable ICU 3.8
3838      */
3839     public static final int CASE = 2;
3840 
3841     /**
3842      * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
3843      * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
3844      * @see #CASE
3845      * @stable ICU 3.4
3846      */
3847     public static final int CASE_INSENSITIVE = 2;
3848 
3849     /**
3850      * Bitmask for constructor, applyPattern(), and closeOver()
3851      * indicating letter case.  This may be ORed together with other
3852      * selectors.
3853      *
3854      * Enable case insensitive matching.  E.g., "[ab]" with this flag
3855      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
3856      * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
3857      * title-, and uppercase mappings as well as the case folding
3858      * of each existing element in the set.
3859      * @stable ICU 3.4
3860      */
3861     public static final int ADD_CASE_MAPPINGS = 4;
3862 
3863     //  add the result of a full case mapping to the set
3864     //  use str as a temporary string to avoid constructing one
addCaseMapping(UnicodeSet set, int result, StringBuilder full)3865     private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
3866         if(result >= 0) {
3867             if(result > UCaseProps.MAX_STRING_LENGTH) {
3868                 // add a single-code point case mapping
3869                 set.add(result);
3870             } else {
3871                 // add a string case mapping from full with length result
3872                 set.add(full.toString());
3873                 full.setLength(0);
3874             }
3875         }
3876         // result < 0: the code point mapped to itself, no need to add it
3877         // see UCaseProps
3878     }
3879 
3880     /**
3881      * Close this set over the given attribute.  For the attribute
3882      * CASE, the result is to modify this set so that:
3883      *
3884      * 1. For each character or string 'a' in this set, all strings
3885      * 'b' such that foldCase(a) == foldCase(b) are added to this set.
3886      * (For most 'a' that are single characters, 'b' will have
3887      * b.length() == 1.)
3888      *
3889      * 2. For each string 'e' in the resulting set, if e !=
3890      * foldCase(e), 'e' will be removed.
3891      *
3892      * Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
3893      *
3894      * (Here foldCase(x) refers to the operation
3895      * UCharacter.foldCase(x, true), and a == b actually denotes
3896      * a.equals(b), not pointer comparison.)
3897      *
3898      * @param attribute bitmask for attributes to close over.
3899      * Currently only the CASE bit is supported.  Any undefined bits
3900      * are ignored.
3901      * @return a reference to this set.
3902      * @stable ICU 3.8
3903      */
closeOver(int attribute)3904     public UnicodeSet closeOver(int attribute) {
3905         checkFrozen();
3906         if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
3907             UCaseProps csp = UCaseProps.INSTANCE;
3908             UnicodeSet foldSet = new UnicodeSet(this);
3909             ULocale root = ULocale.ROOT;
3910 
3911             // start with input set to guarantee inclusion
3912             // CASE: remove strings because the strings will actually be reduced (folded);
3913             //       therefore, start with no strings and add only those needed
3914             if((attribute & CASE) != 0 && foldSet.hasStrings()) {
3915                 foldSet.strings.clear();
3916             }
3917 
3918             int n = getRangeCount();
3919             int result;
3920             StringBuilder full = new StringBuilder();
3921 
3922             for (int i=0; i<n; ++i) {
3923                 int start = getRangeStart(i);
3924                 int end   = getRangeEnd(i);
3925 
3926                 if((attribute & CASE) != 0) {
3927                     // full case closure
3928                     for (int cp=start; cp<=end; ++cp) {
3929                         csp.addCaseClosure(cp, foldSet);
3930                     }
3931                 } else {
3932                     // add case mappings
3933                     // (does not add long s for regular s, or Kelvin for k, for example)
3934                     for (int cp=start; cp<=end; ++cp) {
3935                         result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
3936                         addCaseMapping(foldSet, result, full);
3937 
3938                         result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
3939                         addCaseMapping(foldSet, result, full);
3940 
3941                         result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
3942                         addCaseMapping(foldSet, result, full);
3943 
3944                         result = csp.toFullFolding(cp, full, 0);
3945                         addCaseMapping(foldSet, result, full);
3946                     }
3947                 }
3948             }
3949             if (hasStrings()) {
3950                 if ((attribute & CASE) != 0) {
3951                     for (String s : strings) {
3952                         String str = UCharacter.foldCase(s, 0);
3953                         if(!csp.addStringCaseClosure(str, foldSet)) {
3954                             foldSet.add(str); // does not map to code points: add the folded string itself
3955                         }
3956                     }
3957                 } else {
3958                     BreakIterator bi = BreakIterator.getWordInstance(root);
3959                     for (String str : strings) {
3960                         // TODO: call lower-level functions
3961                         foldSet.add(UCharacter.toLowerCase(root, str));
3962                         foldSet.add(UCharacter.toTitleCase(root, str, bi));
3963                         foldSet.add(UCharacter.toUpperCase(root, str));
3964                         foldSet.add(UCharacter.foldCase(str, 0));
3965                     }
3966                 }
3967             }
3968             set(foldSet);
3969         }
3970         return this;
3971     }
3972 
3973     /**
3974      * Internal class for customizing UnicodeSet parsing of properties.
3975      * TODO: extend to allow customizing of codepoint ranges
3976      * @draft ICU3.8 (retain)
3977      * @author medavis
3978      */
3979     abstract public static class XSymbolTable implements SymbolTable {
3980         /**
3981          * Default constructor
3982          * @draft ICU3.8 (retain)
3983          */
XSymbolTable()3984         public XSymbolTable(){}
3985         /**
3986          * Supplies default implementation for SymbolTable (no action).
3987          * @draft ICU3.8 (retain)
3988          */
3989         @Override
lookupMatcher(int i)3990         public UnicodeMatcher lookupMatcher(int i) {
3991             return null;
3992         }
3993 
3994         /**
3995          * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style
3996          * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be.
3997          * <p>
3998          * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a
3999          * propertyName+propertyValue combination.
4000          *
4001          * @param propertyName
4002          *            the name of the property
4003          * @param propertyValue
4004          *            the name of the property value
4005          * @param result UnicodeSet value to change
4006          *            a set to which the characters having the propertyName+propertyValue are to be added.
4007          * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters
4008          *         with that property have been added to the UnicodeSet, and returns false if the
4009          *         propertyName+propertyValue combination is not recognized (in which case result is unaltered).
4010          * @draft ICU3.8 (retain)
4011          */
applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result)4012         public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) {
4013             return false;
4014         }
4015         /**
4016          * Supplies default implementation for SymbolTable (no action).
4017          * @draft ICU3.8 (retain)
4018          */
4019         @Override
lookup(String s)4020         public char[] lookup(String s) {
4021             return null;
4022         }
4023         /**
4024          * Supplies default implementation for SymbolTable (no action).
4025          * @draft ICU3.8 (retain)
4026          */
4027         @Override
parseReference(String text, ParsePosition pos, int limit)4028         public String parseReference(String text, ParsePosition pos, int limit) {
4029             return null;
4030         }
4031     }
4032 
4033     /**
4034      * Is this frozen, according to the Freezable interface?
4035      *
4036      * @return value
4037      * @stable ICU 3.8
4038      */
4039     @Override
isFrozen()4040     public boolean isFrozen() {
4041         return (bmpSet != null || stringSpan != null);
4042     }
4043 
4044     /**
4045      * Freeze this class, according to the Freezable interface.
4046      *
4047      * @return this
4048      * @stable ICU 4.4
4049      */
4050     @Override
freeze()4051     public UnicodeSet freeze() {
4052         if (!isFrozen()) {
4053             compact();
4054 
4055             // Optimize contains() and span() and similar functions.
4056             if (hasStrings()) {
4057                 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), UnicodeSetStringSpan.ALL);
4058             }
4059             if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
4060                 // Optimize for code point spans.
4061                 // There are no strings, or
4062                 // all strings are irrelevant for span() etc. because
4063                 // all of each string's code points are contained in this set.
4064                 // However, fully contained strings are relevant for spanAndCount(),
4065                 // so we create both objects.
4066                 bmpSet = new BMPSet(list, len);
4067             }
4068         }
4069         return this;
4070     }
4071 
4072     /**
4073      * Span a string using this UnicodeSet.
4074      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4075      * @param s The string to be spanned
4076      * @param spanCondition The span condition
4077      * @return the length of the span
4078      * @stable ICU 4.4
4079      */
span(CharSequence s, SpanCondition spanCondition)4080     public int span(CharSequence s, SpanCondition spanCondition) {
4081         return span(s, 0, spanCondition);
4082     }
4083 
4084     /**
4085      * Span a string using this UnicodeSet.
4086      *   If the start index is less than 0, span will start from 0.
4087      *   If the start index is greater than the string length, span returns the string length.
4088      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4089      * @param s The string to be spanned
4090      * @param start The start index that the span begins
4091      * @param spanCondition The span condition
4092      * @return the string index which ends the span (i.e. exclusive)
4093      * @stable ICU 4.4
4094      */
span(CharSequence s, int start, SpanCondition spanCondition)4095     public int span(CharSequence s, int start, SpanCondition spanCondition) {
4096         int end = s.length();
4097         if (start < 0) {
4098             start = 0;
4099         } else if (start >= end) {
4100             return end;
4101         }
4102         if (bmpSet != null) {
4103             // Frozen set without strings, or no string is relevant for span().
4104             return bmpSet.span(s, start, spanCondition, null);
4105         }
4106         if (stringSpan != null) {
4107             return stringSpan.span(s, start, spanCondition);
4108         } else if (hasStrings()) {
4109             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
4110                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
4111             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which);
4112             if (strSpan.needsStringSpanUTF16()) {
4113                 return strSpan.span(s, start, spanCondition);
4114             }
4115         }
4116 
4117         return spanCodePointsAndCount(s, start, spanCondition, null);
4118     }
4119 
4120     /**
4121      * Same as span() but also counts the smallest number of set elements on any path across the span.
4122      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4123      * @param outCount An output-only object (must not be null) for returning the count.
4124      * @return the limit (exclusive end) of the span
4125      * @internal
4126      * @deprecated This API is ICU internal only.
4127      */
4128     @Deprecated
spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4129     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
4130         if (outCount == null) {
4131             throw new IllegalArgumentException("outCount must not be null");
4132         }
4133         int end = s.length();
4134         if (start < 0) {
4135             start = 0;
4136         } else if (start >= end) {
4137             return end;
4138         }
4139         if (stringSpan != null) {
4140             // We might also have bmpSet != null,
4141             // but fully-contained strings are relevant for counting elements.
4142             return stringSpan.spanAndCount(s, start, spanCondition, outCount);
4143         } else if (bmpSet != null) {
4144             return bmpSet.span(s, start, spanCondition, outCount);
4145         } else if (hasStrings()) {
4146             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
4147                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
4148             which |= UnicodeSetStringSpan.WITH_COUNT;
4149             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which);
4150             return strSpan.spanAndCount(s, start, spanCondition, outCount);
4151         }
4152 
4153         return spanCodePointsAndCount(s, start, spanCondition, outCount);
4154     }
4155 
spanCodePointsAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4156     private int spanCodePointsAndCount(CharSequence s, int start,
4157             SpanCondition spanCondition, OutputInt outCount) {
4158         // Pin to 0/1 values.
4159         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
4160 
4161         int c;
4162         int next = start;
4163         int length = s.length();
4164         int count = 0;
4165         do {
4166             c = Character.codePointAt(s, next);
4167             if (spanContained != contains(c)) {
4168                 break;
4169             }
4170             ++count;
4171             next += Character.charCount(c);
4172         } while (next < length);
4173         if (outCount != null) { outCount.value = count; }
4174         return next;
4175     }
4176 
4177     /**
4178      * Span a string backwards (from the end) using this UnicodeSet.
4179      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4180      * @param s The string to be spanned
4181      * @param spanCondition The span condition
4182      * @return The string index which starts the span (i.e. inclusive).
4183      * @stable ICU 4.4
4184      */
spanBack(CharSequence s, SpanCondition spanCondition)4185     public int spanBack(CharSequence s, SpanCondition spanCondition) {
4186         return spanBack(s, s.length(), spanCondition);
4187     }
4188 
4189     /**
4190      * Span a string backwards (from the fromIndex) using this UnicodeSet.
4191      * If the fromIndex is less than 0, spanBack will return 0.
4192      * If fromIndex is greater than the string length, spanBack will start from the string length.
4193      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4194      * @param s The string to be spanned
4195      * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards
4196      * @param spanCondition The span condition
4197      * @return The string index which starts the span (i.e. inclusive).
4198      * @stable ICU 4.4
4199      */
spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition)4200     public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
4201         if (fromIndex <= 0) {
4202             return 0;
4203         }
4204         if (fromIndex > s.length()) {
4205             fromIndex = s.length();
4206         }
4207         if (bmpSet != null) {
4208             // Frozen set without strings, or no string is relevant for spanBack().
4209             return bmpSet.spanBack(s, fromIndex, spanCondition);
4210         }
4211         if (stringSpan != null) {
4212             return stringSpan.spanBack(s, fromIndex, spanCondition);
4213         } else if (hasStrings()) {
4214             int which = (spanCondition == SpanCondition.NOT_CONTAINED)
4215                     ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
4216                             : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
4217             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which);
4218             if (strSpan.needsStringSpanUTF16()) {
4219                 return strSpan.spanBack(s, fromIndex, spanCondition);
4220             }
4221         }
4222 
4223         // Pin to 0/1 values.
4224         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
4225 
4226         int c;
4227         int prev = fromIndex;
4228         do {
4229             c = Character.codePointBefore(s, prev);
4230             if (spanContained != contains(c)) {
4231                 break;
4232             }
4233             prev -= Character.charCount(c);
4234         } while (prev > 0);
4235         return prev;
4236     }
4237 
4238     /**
4239      * Clone a thawed version of this class, according to the Freezable interface.
4240      * @return the clone, not frozen
4241      * @stable ICU 4.4
4242      */
4243     @Override
cloneAsThawed()4244     public UnicodeSet cloneAsThawed() {
4245         UnicodeSet result = new UnicodeSet(this);
4246         assert !result.isFrozen();
4247         return result;
4248     }
4249 
4250     // internal function
checkFrozen()4251     private void checkFrozen() {
4252         if (isFrozen()) {
4253             throw new UnsupportedOperationException("Attempt to modify frozen object");
4254         }
4255     }
4256 
4257     // ************************
4258     // Additional methods for integration with Generics and Collections
4259     // ************************
4260 
4261     /**
4262      * A struct-like class used for iteration through ranges, for faster iteration than by String.
4263      * Read about the restrictions on usage in {@link UnicodeSet#ranges()}.
4264      *
4265      * @stable ICU 54
4266      */
4267     public static class EntryRange {
4268         /**
4269          * The starting code point of the range.
4270          *
4271          * @stable ICU 54
4272          */
4273         public int codepoint;
4274         /**
4275          * The ending code point of the range
4276          *
4277          * @stable ICU 54
4278          */
4279         public int codepointEnd;
4280 
EntryRange()4281         EntryRange() {
4282         }
4283 
4284         /**
4285          * {@inheritDoc}
4286          *
4287          * @stable ICU 54
4288          */
4289         @Override
toString()4290         public String toString() {
4291             StringBuilder b = new StringBuilder();
4292             return (
4293                     codepoint == codepointEnd ? _appendToPat(b, codepoint, false)
4294                             : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false))
4295                             .toString();
4296         }
4297     }
4298 
4299     /**
4300      * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points.
4301      * The UnicodeSet must not be altered during the iteration.
4302      * The EntryRange instance is the same each time; the contents are just reset.
4303      *
4304      * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings.
4305      *
4306      * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
4307      * Do not alter the UnicodeSet while iterating.
4308      *
4309      * <pre>
4310      * // Sample code
4311      * for (EntryRange range : us1.ranges()) {
4312      *     // do something with code points between range.codepoint and range.codepointEnd;
4313      * }
4314      * for (String s : us1.strings()) {
4315      *     // do something with each string;
4316      * }
4317      * </pre>
4318      *
4319      * @stable ICU 54
4320      */
ranges()4321     public Iterable<EntryRange> ranges() {
4322         return new EntryRangeIterable();
4323     }
4324 
4325     private class EntryRangeIterable implements Iterable<EntryRange> {
4326         @Override
iterator()4327         public Iterator<EntryRange> iterator() {
4328             return new EntryRangeIterator();
4329         }
4330     }
4331 
4332     private class EntryRangeIterator implements Iterator<EntryRange> {
4333         int pos;
4334         EntryRange result = new EntryRange();
4335 
4336         @Override
hasNext()4337         public boolean hasNext() {
4338             return pos < len-1;
4339         }
4340         @Override
next()4341         public EntryRange next() {
4342             if (pos < len-1) {
4343                 result.codepoint = list[pos++];
4344                 result.codepointEnd = list[pos++]-1;
4345             } else {
4346                 throw new NoSuchElementException();
4347             }
4348             return result;
4349         }
4350         @Override
remove()4351         public void remove() {
4352             throw new UnsupportedOperationException();
4353         }
4354     }
4355 
4356 
4357     /**
4358      * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}.
4359      * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
4360      * Do not alter the UnicodeSet while iterating.
4361      * @see java.util.Set#iterator()
4362      * @stable ICU 4.4
4363      */
4364     @Override
iterator()4365     public Iterator<String> iterator() {
4366         return new UnicodeSetIterator2(this);
4367     }
4368 
4369     // Cover for string iteration.
4370     private static class UnicodeSetIterator2 implements Iterator<String> {
4371         // Invariants:
4372         // sourceList != null then sourceList[item] is a valid character
4373         // sourceList == null then delegates to stringIterator
4374         private int[] sourceList;
4375         private int len;
4376         private int item;
4377         private int current;
4378         private int limit;
4379         private SortedSet<String> sourceStrings;
4380         private Iterator<String> stringIterator;
4381         private char[] buffer;
4382 
UnicodeSetIterator2(UnicodeSet source)4383         UnicodeSetIterator2(UnicodeSet source) {
4384             // set according to invariants
4385             len = source.len - 1;
4386             if (len > 0) {
4387                 sourceStrings = source.strings;
4388                 sourceList = source.list;
4389                 current = sourceList[item++];
4390                 limit = sourceList[item++];
4391             } else {
4392                 stringIterator = source.strings.iterator();
4393                 sourceList = null;
4394             }
4395         }
4396 
4397         /* (non-Javadoc)
4398          * @see java.util.Iterator#hasNext()
4399          */
4400         @Override
hasNext()4401         public boolean hasNext() {
4402             return sourceList != null || stringIterator.hasNext();
4403         }
4404 
4405         /* (non-Javadoc)
4406          * @see java.util.Iterator#next()
4407          */
4408         @Override
next()4409         public String next() {
4410             if (sourceList == null) {
4411                 return stringIterator.next();
4412             }
4413             int codepoint = current++;
4414             // we have the codepoint we need, but we may need to adjust the state
4415             if (current >= limit) {
4416                 if (item >= len) {
4417                     stringIterator = sourceStrings.iterator();
4418                     sourceList = null;
4419                 } else {
4420                     current = sourceList[item++];
4421                     limit = sourceList[item++];
4422                 }
4423             }
4424             // Now return. Single code point is easy
4425             if (codepoint <= 0xFFFF) {
4426                 return String.valueOf((char)codepoint);
4427             }
4428             // But Java lacks a valueOfCodePoint, so we handle ourselves for speed
4429             // allocate a buffer the first time, to make conversion faster.
4430             if (buffer == null) {
4431                 buffer = new char[2];
4432             }
4433             // compute ourselves, to save tests and calls
4434             int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT;
4435             buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE);
4436             buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE);
4437             return String.valueOf(buffer);
4438         }
4439 
4440         /* (non-Javadoc)
4441          * @see java.util.Iterator#remove()
4442          */
4443         @Override
remove()4444         public void remove() {
4445             throw new UnsupportedOperationException();
4446         }
4447     }
4448 
4449     /**
4450      * @see #containsAll(com.ibm.icu.text.UnicodeSet)
4451      * @stable ICU 4.4
4452      */
containsAll(Iterable<T> collection)4453     public <T extends CharSequence> boolean containsAll(Iterable<T> collection) {
4454         for (T o : collection) {
4455             if (!contains(o)) {
4456                 return false;
4457             }
4458         }
4459         return true;
4460     }
4461 
4462     /**
4463      * @see #containsNone(com.ibm.icu.text.UnicodeSet)
4464      * @stable ICU 4.4
4465      */
containsNone(Iterable<T> collection)4466     public <T extends CharSequence> boolean containsNone(Iterable<T> collection) {
4467         for (T o : collection) {
4468             if (contains(o)) {
4469                 return false;
4470             }
4471         }
4472         return true;
4473     }
4474 
4475     /**
4476      * @see #containsAll(com.ibm.icu.text.UnicodeSet)
4477      * @stable ICU 4.4
4478      */
containsSome(Iterable<T> collection)4479     public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) {
4480         return !containsNone(collection);
4481     }
4482 
4483     /**
4484      * @see #addAll(com.ibm.icu.text.UnicodeSet)
4485      * @stable ICU 4.4
4486      */
4487     @SuppressWarnings("unchecked")  // See ticket #11395, this is safe.
addAll(T... collection)4488     public <T extends CharSequence> UnicodeSet addAll(T... collection) {
4489         checkFrozen();
4490         for (T str : collection) {
4491             add(str);
4492         }
4493         return this;
4494     }
4495 
4496 
4497     /**
4498      * @see #removeAll(com.ibm.icu.text.UnicodeSet)
4499      * @stable ICU 4.4
4500      */
removeAll(Iterable<T> collection)4501     public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) {
4502         checkFrozen();
4503         for (T o : collection) {
4504             remove(o);
4505         }
4506         return this;
4507     }
4508 
4509     /**
4510      * @see #retainAll(com.ibm.icu.text.UnicodeSet)
4511      * @stable ICU 4.4
4512      */
retainAll(Iterable<T> collection)4513     public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) {
4514         checkFrozen();
4515         // TODO optimize
4516         UnicodeSet toRetain = new UnicodeSet();
4517         toRetain.addAll(collection);
4518         retainAll(toRetain);
4519         return this;
4520     }
4521 
4522     /**
4523      * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}.
4524      * @stable ICU 4.4
4525      */
4526     public enum ComparisonStyle {
4527         /**
4528          * @stable ICU 4.4
4529          */
4530         SHORTER_FIRST,
4531         /**
4532          * @stable ICU 4.4
4533          */
4534         LEXICOGRAPHIC,
4535         /**
4536          * @stable ICU 4.4
4537          */
4538         LONGER_FIRST
4539     }
4540 
4541     /**
4542      * Compares UnicodeSets, where shorter come first, and otherwise lexigraphically
4543      * (according to the comparison of the first characters that differ).
4544      * @see java.lang.Comparable#compareTo(java.lang.Object)
4545      * @stable ICU 4.4
4546      */
4547     @Override
compareTo(UnicodeSet o)4548     public int compareTo(UnicodeSet o) {
4549         return compareTo(o, ComparisonStyle.SHORTER_FIRST);
4550     }
4551     /**
4552      * Compares UnicodeSets, in three different ways.
4553      * @see java.lang.Comparable#compareTo(java.lang.Object)
4554      * @stable ICU 4.4
4555      */
compareTo(UnicodeSet o, ComparisonStyle style)4556     public int compareTo(UnicodeSet o, ComparisonStyle style) {
4557         if (style != ComparisonStyle.LEXICOGRAPHIC) {
4558             int diff = size() - o.size();
4559             if (diff != 0) {
4560                 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
4561             }
4562         }
4563         int result;
4564         for (int i = 0; ; ++i) {
4565             if (0 != (result = list[i] - o.list[i])) {
4566                 // if either list ran out, compare to the last string
4567                 if (list[i] == HIGH) {
4568                     if (!hasStrings()) return 1;
4569                     String item = strings.first();
4570                     return compare(item, o.list[i]);
4571                 }
4572                 if (o.list[i] == HIGH) {
4573                     if (!o.hasStrings()) return -1;
4574                     String item = o.strings.first();
4575                     int compareResult = compare(item, list[i]);
4576                     return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order.
4577                 }
4578                 // otherwise return the result if even index, or the reversal if not
4579                 return (i & 1) == 0 ? result : -result;
4580             }
4581             if (list[i] == HIGH) {
4582                 break;
4583             }
4584         }
4585         return compare(strings, o.strings);
4586     }
4587 
4588     /**
4589      * @stable ICU 4.4
4590      */
compareTo(Iterable<String> other)4591     public int compareTo(Iterable<String> other) {
4592         return compare(this, other);
4593     }
4594 
4595     /**
4596      * Utility to compare a string to a code point.
4597      * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString())
4598      * and comparing, but much faster (no object creation).
4599      * Actually, there is one difference; a null compares as less.
4600      * Note that this (=String) order is UTF-16 order -- *not* code point order.
4601      * @stable ICU 4.4
4602      */
4603 
compare(CharSequence string, int codePoint)4604     public static int compare(CharSequence string, int codePoint) {
4605         return CharSequences.compare(string, codePoint);
4606     }
4607 
4608     /**
4609      * Utility to compare a string to a code point.
4610      * Same results as turning the code point into a string and comparing, but much faster (no object creation).
4611      * Actually, there is one difference; a null compares as less.
4612      * Note that this (=String) order is UTF-16 order -- *not* code point order.
4613      * @stable ICU 4.4
4614      */
compare(int codePoint, CharSequence string)4615     public static int compare(int codePoint, CharSequence string) {
4616         return -CharSequences.compare(string, codePoint);
4617     }
4618 
4619 
4620     /**
4621      * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered,
4622      * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
4623      * That means that sets can't be compared directly with this method, unless they are TreeSets without
4624      * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
4625      * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
4626      * @stable ICU 4.4
4627      */
compare(Iterable<T> collection1, Iterable<T> collection2)4628     public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) {
4629         return compare(collection1.iterator(), collection2.iterator());
4630     }
4631 
4632     /**
4633      * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered,
4634      * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
4635      * That means that sets can't be compared directly with this method, unless they are TreeSets without
4636      * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
4637      * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
4638      * @internal
4639      * @deprecated This API is ICU internal only.
4640      */
4641     @Deprecated
compare(Iterator<T> first, Iterator<T> other)4642     public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) {
4643         while (true) {
4644             if (!first.hasNext()) {
4645                 return other.hasNext() ? -1 : 0;
4646             } else if (!other.hasNext()) {
4647                 return 1;
4648             }
4649             T item1 = first.next();
4650             T item2 = other.next();
4651             int result = item1.compareTo(item2);
4652             if (result != 0) {
4653                 return result;
4654             }
4655         }
4656     }
4657 
4658 
4659     /**
4660      * Utility to compare two collections, optionally by size, and then lexicographically.
4661      * @stable ICU 4.4
4662      */
compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style)4663     public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) {
4664         if (style != ComparisonStyle.LEXICOGRAPHIC) {
4665             int diff = collection1.size() - collection2.size();
4666             if (diff != 0) {
4667                 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
4668             }
4669         }
4670         return compare(collection1, collection2);
4671     }
4672 
4673     /**
4674      * Utility for adding the contents of an iterable to a collection.
4675      * @stable ICU 4.4
4676      */
addAllTo(Iterable<T> source, U target)4677     public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) {
4678         for (T item : source) {
4679             target.add(item);
4680         }
4681         return target;
4682     }
4683 
4684     /**
4685      * Utility for adding the contents of an iterable to a collection.
4686      * @stable ICU 4.4
4687      */
addAllTo(Iterable<T> source, T[] target)4688     public static <T> T[] addAllTo(Iterable<T> source, T[] target) {
4689         int i = 0;
4690         for (T item : source) {
4691             target[i++] = item;
4692         }
4693         return target;
4694     }
4695 
4696     /**
4697      * For iterating through the strings in the set. Example:
4698      * <pre>
4699      * for (String key : myUnicodeSet.strings()) {
4700      *   doSomethingWith(key);
4701      * }
4702      * </pre>
4703      * @stable ICU 4.4
4704      */
strings()4705     public Collection<String> strings() {
4706         if (hasStrings()) {
4707             return Collections.unmodifiableSortedSet(strings);
4708         } else {
4709             return EMPTY_STRINGS;
4710         }
4711     }
4712 
4713     /**
4714      * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE.
4715      * @internal
4716      * @deprecated This API is ICU internal only.
4717      */
4718     @Deprecated
getSingleCodePoint(CharSequence s)4719     public static int getSingleCodePoint(CharSequence s) {
4720         return CharSequences.getSingleCodePoint(s);
4721     }
4722 
4723     /**
4724      * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set.
4725      * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E
4726      * if the dontCare set includes unassigned characters (for a particular version of Unicode).
4727      * @param dontCare Set with the don't-care characters for spanning
4728      * @return the input set, modified
4729      * @internal
4730      * @deprecated This API is ICU internal only.
4731      */
4732     @Deprecated
addBridges(UnicodeSet dontCare)4733     public UnicodeSet addBridges(UnicodeSet dontCare) {
4734         UnicodeSet notInInput = new UnicodeSet(this).complement();
4735         for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) {
4736             if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) {
4737                 add(it.codepoint,it.codepointEnd);
4738             }
4739         }
4740         return this;
4741     }
4742 
4743     /**
4744      * Find the first index at or after fromIndex where the UnicodeSet matches at that index.
4745      * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match.
4746      * If there is no match, length is returned.
4747      * @internal
4748      * @deprecated This API is ICU internal only. Use span instead.
4749      */
4750     @Deprecated
findIn(CharSequence value, int fromIndex, boolean findNot)4751     public int findIn(CharSequence value, int fromIndex, boolean findNot) {
4752         //TODO add strings, optimize, using ICU4C algorithms
4753         int cp;
4754         for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) {
4755             cp = UTF16.charAt(value, fromIndex);
4756             if (contains(cp) != findNot) {
4757                 break;
4758             }
4759         }
4760         return fromIndex;
4761     }
4762 
4763     /**
4764      * Find the last index before fromIndex where the UnicodeSet matches at that index.
4765      * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match.
4766      * If there is no match, -1 is returned.
4767      * BEFORE index is not in the UnicodeSet.
4768      * @internal
4769      * @deprecated This API is ICU internal only. Use spanBack instead.
4770      */
4771     @Deprecated
findLastIn(CharSequence value, int fromIndex, boolean findNot)4772     public int findLastIn(CharSequence value, int fromIndex, boolean findNot) {
4773         //TODO add strings, optimize, using ICU4C algorithms
4774         int cp;
4775         fromIndex -= 1;
4776         for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) {
4777             cp = UTF16.charAt(value, fromIndex);
4778             if (contains(cp) != findNot) {
4779                 break;
4780             }
4781         }
4782         return fromIndex < 0 ? -1 : fromIndex;
4783     }
4784 
4785     /**
4786      * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match.
4787      * @param source The source of the CharSequence to strip from.
4788      * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object.
4789      * @return The string after it has been stripped.
4790      * @internal
4791      * @deprecated This API is ICU internal only. Use replaceFrom.
4792      */
4793     @Deprecated
stripFrom(CharSequence source, boolean matches)4794     public String stripFrom(CharSequence source, boolean matches) {
4795         StringBuilder result = new StringBuilder();
4796         for (int pos = 0; pos < source.length();) {
4797             int inside = findIn(source, pos, !matches);
4798             result.append(source.subSequence(pos, inside));
4799             pos = findIn(source, inside, matches); // get next start
4800         }
4801         return result.toString();
4802     }
4803 
4804     /**
4805      * Argument values for whether span() and similar functions continue while the current character is contained vs.
4806      * not contained in the set.
4807      * <p>
4808      * The functionality is straightforward for sets with only single code points, without strings (which is the common
4809      * case):
4810      * <ul>
4811      * <li>CONTAINED and SIMPLE work the same.
4812      * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
4813      * <li>span() and spanBack() partition any string the
4814      * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
4815      * <li>Using a
4816      * complemented (inverted) set and the opposite span conditions yields the same results.
4817      * </ul>
4818      * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
4819      * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
4820      * strings:
4821      * <ul>
4822      * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
4823      * Therefore, complementing both the set and the span conditions may yield different results.
4824      * <li>When starting spans
4825      * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
4826      * because a set string may start before the later position.
4827      * <li>span(SIMPLE) may be shorter than
4828      * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
4829      * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
4830      * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
4831      * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
4832      * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
4833      * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
4834      * </ul>
4835      * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
4836      * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
4837      * be used.
4838      * <p>
4839      * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
4840      * boundaries, never in the middle of a surrogate pair.
4841      *
4842      * @stable ICU 4.4
4843      */
4844     public enum SpanCondition {
4845         /**
4846          * Continues a span() while there is no set element at the current position.
4847          * Increments by one code point at a time.
4848          * Stops before the first set element (character or string).
4849          * (For code points only, this is like while contains(current)==false).
4850          * <p>
4851          * When span() returns, the substring between where it started and the position it returned consists only of
4852          * characters that are not in the set, and none of its strings overlap with the span.
4853          *
4854          * @stable ICU 4.4
4855          */
4856         NOT_CONTAINED,
4857 
4858         /**
4859          * Spans the longest substring that is a concatenation of set elements (characters or strings).
4860          * (For characters only, this is like while contains(current)==true).
4861          * <p>
4862          * When span() returns, the substring between where it started and the position it returned consists only of set
4863          * elements (characters or strings) that are in the set.
4864          * <p>
4865          * If a set contains strings, then the span will be the longest substring for which there
4866          * exists at least one non-overlapping concatenation of set elements (characters or strings).
4867          * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
4868          * (Java/ICU/Perl regex stops at the first match of an OR.)
4869          *
4870          * @stable ICU 4.4
4871          */
4872         CONTAINED,
4873 
4874         /**
4875          * Continues a span() while there is a set element at the current position.
4876          * Increments by the longest matching element at each position.
4877          * (For characters only, this is like while contains(current)==true).
4878          * <p>
4879          * When span() returns, the substring between where it started and the position it returned consists only of set
4880          * elements (characters or strings) that are in the set.
4881          * <p>
4882          * If a set only contains single characters, then this is the same as CONTAINED.
4883          * <p>
4884          * If a set contains strings, then the span will be the longest substring with a match at each position with the
4885          * longest single set element (character or string).
4886          * <p>
4887          * Use this span condition together with other longest-match algorithms, such as ICU converters
4888          * (ucnv_getUnicodeSet()).
4889          *
4890          * @stable ICU 4.4
4891          */
4892         SIMPLE,
4893 
4894         /**
4895          * One more than the last span condition.
4896          *
4897          * @stable ICU 4.4
4898          */
4899         CONDITION_COUNT
4900     }
4901 
4902     /**
4903      * Get the default symbol table. Null means ordinary processing. For internal use only.
4904      * @return the symbol table
4905      * @internal
4906      * @deprecated This API is ICU internal only.
4907      */
4908     @Deprecated
getDefaultXSymbolTable()4909     public static XSymbolTable getDefaultXSymbolTable() {
4910         return XSYMBOL_TABLE;
4911     }
4912 
4913     /**
4914      * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing
4915      * of UnicodeSets.
4916      * <p>
4917      * WARNING: If this function is used with a UnicodeProperty, and the
4918      * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
4919      * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
4920      * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
4921      *
4922      * @param xSymbolTable the new default symbol table.
4923      * @internal
4924      * @deprecated This API is ICU internal only.
4925      */
4926     @Deprecated
setDefaultXSymbolTable(XSymbolTable xSymbolTable)4927     public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
4928         // If the properties override inclusions, these have to be regenerated.
4929         // TODO: Check if the Unicode Tools or Unicode Utilities really need this.
4930         CharacterPropertiesImpl.clear();
4931         XSYMBOL_TABLE = xSymbolTable;
4932     }
4933 }
4934 //eof
4935