• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 1996-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.text;
10 
11 import java.io.IOException;
12 import java.text.ParsePosition;
13 import java.util.ArrayList;
14 import java.util.Collection;
15 import java.util.Collections;
16 import java.util.Iterator;
17 import java.util.NoSuchElementException;
18 import java.util.TreeSet;
19 
20 import com.ibm.icu.impl.BMPSet;
21 import com.ibm.icu.impl.Norm2AllModes;
22 import com.ibm.icu.impl.PatternProps;
23 import com.ibm.icu.impl.RuleCharacterIterator;
24 import com.ibm.icu.impl.SortedSetRelation;
25 import com.ibm.icu.impl.StringRange;
26 import com.ibm.icu.impl.UBiDiProps;
27 import com.ibm.icu.impl.UCaseProps;
28 import com.ibm.icu.impl.UCharacterProperty;
29 import com.ibm.icu.impl.UPropertyAliases;
30 import com.ibm.icu.impl.UnicodeSetStringSpan;
31 import com.ibm.icu.impl.Utility;
32 import com.ibm.icu.lang.CharSequences;
33 import com.ibm.icu.lang.UCharacter;
34 import com.ibm.icu.lang.UProperty;
35 import com.ibm.icu.lang.UScript;
36 import com.ibm.icu.util.Freezable;
37 import com.ibm.icu.util.ICUUncheckedIOException;
38 import com.ibm.icu.util.OutputInt;
39 import com.ibm.icu.util.ULocale;
40 import com.ibm.icu.util.VersionInfo;
41 
42 /**
43  * A mutable set of Unicode characters and multicharacter strings.
44  * Objects of this class represent <em>character classes</em> used
45  * in regular expressions. A character class specifies a subset of Unicode
46  * code points.  Legal code points are U+0000 to U+10FFFF, inclusive.
47  *
48  * Note: method freeze() will not only make the set immutable, but
49  * also makes important methods much higher performance:
50  * contains(c), containsNone(...), span(...), spanBack(...) etc.
51  * After the object is frozen, any subsequent call that wants to change
52  * the object will throw UnsupportedOperationException.
53  *
54  * <p>The UnicodeSet class is not designed to be subclassed.
55  *
56  * <p><code>UnicodeSet</code> supports two APIs. The first is the
57  * <em>operand</em> API that allows the caller to modify the value of
58  * a <code>UnicodeSet</code> object. It conforms to Java 2's
59  * <code>java.util.Set</code> interface, although
60  * <code>UnicodeSet</code> does not actually implement that
61  * interface. All methods of <code>Set</code> are supported, with the
62  * modification that they take a character range or single character
63  * instead of an <code>Object</code>, and they take a
64  * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
65  * operand API may be thought of in terms of boolean logic: a boolean
66  * OR is implemented by <code>add</code>, a boolean AND is implemented
67  * by <code>retain</code>, a boolean XOR is implemented by
68  * <code>complement</code> taking an argument, and a boolean NOT is
69  * implemented by <code>complement</code> with no argument.  In terms
70  * of traditional set theory function names, <code>add</code> is a
71  * union, <code>retain</code> is an intersection, <code>remove</code>
72  * is an asymmetric difference, and <code>complement</code> with no
73  * argument is a set complement with respect to the superset range
74  * <code>MIN_VALUE-MAX_VALUE</code>
75  *
76  * <p>The second API is the
77  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
78  * <code>java.text.Format</code>-derived classes.  Unlike the
79  * methods that add characters, add categories, and control the logic
80  * of the set, the method <code>applyPattern()</code> sets all
81  * attributes of a <code>UnicodeSet</code> at once, based on a
82  * string pattern.
83  *
84  * <p><b>Pattern syntax</b></p>
85  *
86  * Patterns are accepted by the constructors and the
87  * <code>applyPattern()</code> methods and returned by the
88  * <code>toPattern()</code> method.  These patterns follow a syntax
89  * similar to that employed by version 8 regular expression character
90  * classes.  Here are some simple examples:
91  *
92  * <blockquote>
93  *   <table>
94  *     <tr style="vertical-align: top">
95  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td>
96  *       <td style="vertical-align: top;">No characters</td>
97  *     </tr><tr style="vertical-align: top">
98  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td>
99  *       <td style="vertical-align: top;">The character 'a'</td>
100  *     </tr><tr style="vertical-align: top">
101  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td>
102  *       <td style="vertical-align: top;">The characters 'a' and 'e'</td>
103  *     </tr>
104  *     <tr>
105  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td>
106  *       <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code
107  *       point order</td>
108  *     </tr>
109  *     <tr>
110  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td>
111  *       <td style="vertical-align: top;">The character U+4E01</td>
112  *     </tr>
113  *     <tr>
114  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td>
115  *       <td style="vertical-align: top;">The character 'a' and the multicharacter strings &quot;ab&quot; and
116  *       &quot;ac&quot;</td>
117  *     </tr>
118  *     <tr>
119  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td>
120  *       <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td>
121  *     </tr>
122  *   </table>
123  * </blockquote>
124  *
125  * Any character may be preceded by a backslash in order to remove any special
126  * meaning.  White space characters, as defined by the Unicode Pattern_White_Space property, are
127  * ignored, unless they are escaped.
128  *
129  * <p>Property patterns specify a set of characters having a certain
130  * property as defined by the Unicode standard.  Both the POSIX-like
131  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
132  * complete list of supported property patterns, see the User's Guide
133  * for UnicodeSet at
134  * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
135  * http://www.icu-project.org/userguide/unicodeSet.html</a>.
136  * Actual determination of property data is defined by the underlying
137  * Unicode database as implemented by UCharacter.
138  *
139  * <p>Patterns specify individual characters, ranges of characters, and
140  * Unicode property sets.  When elements are concatenated, they
141  * specify their union.  To complement a set, place a '^' immediately
142  * after the opening '['.  Property patterns are inverted by modifying
143  * their delimiters; "[:^foo:]" and "\P{foo}".  In any other location,
144  * '^' has no special meaning.
145  *
146  * <p>Ranges are indicated by placing a '-' between two
147  * characters, as in "a-z".  This specifies the range of all
148  * characters from the left to the right, in Unicode order.  If the
149  * left character is greater than or equal to the
150  * right character it is a syntax error.  If a '-' occurs as the first
151  * character after the opening '[' or '[^', or if it occurs as the
152  * last character before the closing ']', then it is taken as a
153  * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
154  * set of three characters, 'a', 'b', and '-'.
155  *
156  * <p>Sets may be intersected using the '&amp;' operator or the asymmetric
157  * set difference may be taken using the '-' operator, for example,
158  * "[[:L:]&amp;[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
159  * with values less than 4096.  Operators ('&amp;' and '-') have equal
160  * precedence and bind left-to-right.  Thus
161  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
162  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
163  * difference; intersection is commutative.
164  *
165  * <table>
166  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a'
167  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a'
168  * through 'z' and all letters in between, in Unicode order
169  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing
170  * all characters but 'a' through 'z',
171  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
172  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
173  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
174  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&amp;[<em>pat2</em>]]</code>
175  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
176  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
177  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
178  * <em>pat2</em>
179  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code>
180  * <td>The set of characters having the specified
181  * Unicode property; in
182  * this case, Unicode uppercase letters
183  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code>
184  * <td>The set of characters <em>not</em> having the given
185  * Unicode property
186  * </table>
187  *
188  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
189  *
190  * <p><b>Formal syntax</b></p>
191  *
192  * <blockquote>
193  *   <table>
194  *     <tr style="vertical-align: top">
195  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern :=&nbsp; </code></td>
196  *       <td style="vertical-align: top;"><code>('[' '^'? item* ']') |
197  *       property</code></td>
198  *     </tr>
199  *     <tr style="vertical-align: top">
200  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item :=&nbsp; </code></td>
201  *       <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br>
202  *       </code></td>
203  *     </tr>
204  *     <tr style="vertical-align: top">
205  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr :=&nbsp; </code></td>
206  *       <td style="vertical-align: top;"><code>pattern | pattern-expr pattern |
207  *       pattern-expr op pattern<br>
208  *       </code></td>
209  *     </tr>
210  *     <tr style="vertical-align: top">
211  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op :=&nbsp; </code></td>
212  *       <td style="vertical-align: top;"><code>'&amp;' | '-'<br>
213  *       </code></td>
214  *     </tr>
215  *     <tr style="vertical-align: top">
216  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special :=&nbsp; </code></td>
217  *       <td style="vertical-align: top;"><code>'[' | ']' | '-'<br>
218  *       </code></td>
219  *     </tr>
220  *     <tr style="vertical-align: top">
221  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char :=&nbsp; </code></td>
222  *       <td style="vertical-align: top;"><em>any character that is not</em><code> special<br>
223  *       | ('\\' </code><em>any character</em><code>)<br>
224  *       | ('&#92;u' hex hex hex hex)<br>
225  *       </code></td>
226  *     </tr>
227  *     <tr style="vertical-align: top">
228  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex :=&nbsp; </code></td>
229  *       <td style="vertical-align: top;"><em>any character for which
230  *       </em><code>Character.digit(c, 16)</code><em>
231  *       returns a non-negative result</em></td>
232  *     </tr>
233  *     <tr>
234  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property :=&nbsp; </code></td>
235  *       <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td>
236  *     </tr>
237  *   </table>
238  *   <br>
239  *   <table border="1">
240  *     <tr>
241  *       <td>Legend: <table>
242  *         <tr>
243  *           <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td>
244  *           <td style="width: 20; vertical-align: top;">&nbsp; </td>
245  *           <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td>
246  *         </tr>
247  *         <tr>
248  *           <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td>
249  *           <td style="vertical-align: top;"></td>
250  *           <td style="vertical-align: top;">zero or one instance of <code>a</code><br>
251  *           </td>
252  *         </tr>
253  *         <tr>
254  *           <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td>
255  *           <td style="vertical-align: top;"></td>
256  *           <td style="vertical-align: top;">zero or more instances of <code>a</code><br>
257  *           </td>
258  *         </tr>
259  *         <tr>
260  *           <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td>
261  *           <td style="vertical-align: top;"></td>
262  *           <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br>
263  *           </td>
264  *         </tr>
265  *         <tr>
266  *           <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td>
267  *           <td style="vertical-align: top;"></td>
268  *           <td style="vertical-align: top;">the literal string between the quotes </td>
269  *         </tr>
270  *       </table>
271  *       </td>
272  *     </tr>
273  *   </table>
274  * </blockquote>
275  * <p>To iterate over contents of UnicodeSet, the following are available:
276  * <ul><li>{@link #ranges()} to iterate through the ranges</li>
277  * <li>{@link #strings()} to iterate through the strings</li>
278  * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
279  * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
280  * </ul>
281  * All of the above can be used in <b>for</b> loops.
282  * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
283  * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
284  *
285  * @author Alan Liu
286  * @stable ICU 2.0
287  * @see UnicodeSetIterator
288  * @see UnicodeSetSpanner
289  */
290 public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> {
291 
292     /**
293      * Constant for the empty set.
294      * @stable ICU 4.8
295      */
296     public static final UnicodeSet EMPTY = new UnicodeSet().freeze();
297     /**
298      * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.)
299      * @stable ICU 4.8
300      */
301     public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze();
302 
303     private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the function processing
304 
305     private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
306     private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
307     // 110000 for codepoints
308 
309     /**
310      * Minimum value that can be stored in a UnicodeSet.
311      * @stable ICU 2.0
312      */
313     public static final int MIN_VALUE = LOW;
314 
315     /**
316      * Maximum value that can be stored in a UnicodeSet.
317      * @stable ICU 2.0
318      */
319     public static final int MAX_VALUE = HIGH - 1;
320 
321     private int len;      // length used; list may be longer to minimize reallocs
322     private int[] list;   // MUST be terminated with HIGH
323     private int[] rangeList; // internal buffer
324     private int[] buffer; // internal buffer
325 
326     // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
327     // is not private so that UnicodeSetIterator can get access
328     TreeSet<String> strings = new TreeSet<String>();
329 
330     /**
331      * The pattern representation of this set.  This may not be the
332      * most economical pattern.  It is the pattern supplied to
333      * applyPattern(), with variables substituted and whitespace
334      * removed.  For sets constructed without applyPattern(), or
335      * modified using the non-pattern API, this string will be null,
336      * indicating that toPattern() must generate a pattern
337      * representation from the inversion list.
338      */
339     private String pat = null;
340 
341     private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
342     private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
343 
344     // Special property set IDs
345     private static final String ANY_ID   = "ANY";   // [\u0000-\U0010FFFF]
346     private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
347     private static final String ASSIGNED = "Assigned"; // [:^Cn:]
348 
349     /**
350      * A set of all characters _except_ the second through last characters of
351      * certain ranges.  These ranges are ranges of characters whose
352      * properties are all exactly alike, e.g. CJK Ideographs from
353      * U+4E00 to U+9FA5.
354      */
355     private static UnicodeSet INCLUSIONS[] = null;
356 
357     private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
358     private volatile UnicodeSetStringSpan stringSpan;
359     //----------------------------------------------------------------
360     // Public API
361     //----------------------------------------------------------------
362 
363     /**
364      * Constructs an empty set.
365      * @stable ICU 2.0
366      */
public UnicodeSet() {
    // Room for the terminator plus the initial growth slack.
    list = new int[START_EXTRA + 1];
    // The empty set is represented by the HIGH terminator alone.
    list[0] = HIGH;
    len = 1;
}
371 
372     /**
373      * Constructs a copy of an existing set.
374      * @stable ICU 2.0
375      */
public UnicodeSet(UnicodeSet other) {
    // set() copies the other set's list, length, pattern and strings into this one.
    this.set(other);
}
379 
380     /**
381      * Constructs a set containing the given range. If <code>start &gt;
382      * end</code> then an empty set is created.
383      *
384      * @param start first character, inclusive, of range
385      * @param end last character, inclusive, of range
386      * @stable ICU 2.0
387      */
public UnicodeSet(int start, int end) {
    this();
    // Starting from the empty set, complementing the range is what populates it.
    this.complement(start, end);
}
392 
393     /**
394      * Quickly constructs a set from a set of ranges &lt;s0, e0, s1, e1, s2, e2, ..., sn, en&gt;.
395      * There must be an even number of integers, and they must be all greater than or equal to zero,
396      * all less than or equal to Character.MAX_CODE_POINT.
397      * In each pair (..., si, ei, ...) it must be true that si &lt;= ei
398      * Between adjacent pairs (...ei, sj...), it must be true that ei+1 &lt; sj
399      * @param pairs pairs of character representing ranges
400      * @stable ICU 4.4
401      */
UnicodeSet(int... pairs)402     public UnicodeSet(int... pairs) {
403         if ((pairs.length & 1) != 0) {
404             throw new IllegalArgumentException("Must have even number of integers");
405         }
406         list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set.
407         len = list.length;
408         int last = -1; // used to ensure that the results are monotonically increasing.
409         int i = 0;
410         while (i < pairs.length) {
411             // start of pair
412             int start = pairs[i];
413             if (last >= start) {
414                 throw new IllegalArgumentException("Must be monotonically increasing.");
415             }
416             list[i++] = last = start;
417             // end of pair
418             int end = pairs[i] + 1;
419             if (last >= end) {
420                 throw new IllegalArgumentException("Must be monotonically increasing.");
421             }
422             list[i++] = last = end;
423         }
424         list[i] = HIGH; // terminate
425     }
426 
427     /**
428      * Constructs a set from the given pattern.  See the class description
429      * for the syntax of the pattern language.  Whitespace is ignored.
430      * @param pattern a string specifying what characters are in the set
431      * @exception java.lang.IllegalArgumentException if the pattern contains
432      * a syntax error.
433      * @stable ICU 2.0
434      */
public UnicodeSet(String pattern) {
    this();
    // No parse position and no symbol table; whitespace in the pattern is ignored.
    this.applyPattern(pattern, null, null, IGNORE_SPACE);
}
439 
440     /**
441      * Constructs a set from the given pattern.  See the class description
442      * for the syntax of the pattern language.
443      * @param pattern a string specifying what characters are in the set
444      * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters
445      * @exception java.lang.IllegalArgumentException if the pattern contains
446      * a syntax error.
447      * @stable ICU 2.0
448      */
UnicodeSet(String pattern, boolean ignoreWhitespace)449     public UnicodeSet(String pattern, boolean ignoreWhitespace) {
450         this();
451         applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
452     }
453 
454     /**
455      * Constructs a set from the given pattern.  See the class description
456      * for the syntax of the pattern language.
457      * @param pattern a string specifying what characters are in the set
458      * @param options a bitmask indicating which options to apply.
459      * Valid options are IGNORE_SPACE and CASE.
460      * @exception java.lang.IllegalArgumentException if the pattern contains
461      * a syntax error.
462      * @stable ICU 3.8
463      */
public UnicodeSet(String pattern, int options) {
    this();
    // No parse position and no symbol table; the caller's option bitmask is passed through.
    this.applyPattern(pattern, null, null, options);
}
468 
469     /**
470      * Constructs a set from the given pattern.  See the class description
471      * for the syntax of the pattern language.
472      * @param pattern a string specifying what characters are in the set
473      * @param pos on input, the position in pattern at which to start parsing.
474      * On output, the position after the last character parsed.
475      * @param symbols a symbol table mapping variables to char[] arrays
476      * and chars to UnicodeSets
477      * @exception java.lang.IllegalArgumentException if the pattern
478      * contains a syntax error.
479      * @stable ICU 2.0
480      */
public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) {
    this();
    // This overload takes no options argument; whitespace is always ignored.
    this.applyPattern(pattern, pos, symbols, IGNORE_SPACE);
}
485 
486     /**
487      * Constructs a set from the given pattern.  See the class description
488      * for the syntax of the pattern language.
489      * @param pattern a string specifying what characters are in the set
490      * @param pos on input, the position in pattern at which to start parsing.
491      * On output, the position after the last character parsed.
492      * @param symbols a symbol table mapping variables to char[] arrays
493      * and chars to UnicodeSets
494      * @param options a bitmask indicating which options to apply.
495      * Valid options are IGNORE_SPACE and CASE.
496      * @exception java.lang.IllegalArgumentException if the pattern
497      * contains a syntax error.
498      * @stable ICU 3.2
499      */
public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) {
    this();
    // Most general pattern constructor: every argument is forwarded unchanged.
    this.applyPattern(pattern, pos, symbols, options);
}
504 
505 
506     /**
507      * Return a new set that is equivalent to this one.
508      * @stable ICU 2.0
509      */
510     @Override
clone()511     public Object clone() {
512         if (isFrozen()) {
513             return this;
514         }
515         UnicodeSet result = new UnicodeSet(this);
516         result.bmpSet = this.bmpSet;
517         result.stringSpan = this.stringSpan;
518         return result;
519     }
520 
521     /**
522      * Make this object represent the range <code>start - end</code>.
523      * If <code>start &gt; end</code> then this object is set to an
524      * empty range.
525      *
526      * @param start first character in the set, inclusive
527      * @param end last character in the set, inclusive
528      * @stable ICU 2.0
529      */
set(int start, int end)530     public UnicodeSet set(int start, int end) {
531         checkFrozen();
532         clear();
533         complement(start, end);
534         return this;
535     }
536 
537     /**
538      * Make this object represent the same set as <code>other</code>.
539      * @param other a <code>UnicodeSet</code> whose value will be
540      * copied to this object
541      * @stable ICU 2.0
542      */
set(UnicodeSet other)543     public UnicodeSet set(UnicodeSet other) {
544         checkFrozen();
545         list = other.list.clone();
546         len = other.len;
547         pat = other.pat;
548         strings = new TreeSet<String>(other.strings);
549         return this;
550     }
551 
552     /**
553      * Modifies this set to represent the set specified by the given pattern.
554      * See the class description for the syntax of the pattern language.
555      * Whitespace is ignored.
556      * @param pattern a string specifying what characters are in the set
557      * @exception java.lang.IllegalArgumentException if the pattern
558      * contains a syntax error.
559      * @stable ICU 2.0
560      */
applyPattern(String pattern)561     public final UnicodeSet applyPattern(String pattern) {
562         checkFrozen();
563         return applyPattern(pattern, null, null, IGNORE_SPACE);
564     }
565 
566     /**
567      * Modifies this set to represent the set specified by the given pattern,
568      * optionally ignoring whitespace.
569      * See the class description for the syntax of the pattern language.
570      * @param pattern a string specifying what characters are in the set
571      * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored
572      * @exception java.lang.IllegalArgumentException if the pattern
573      * contains a syntax error.
574      * @stable ICU 2.0
575      */
applyPattern(String pattern, boolean ignoreWhitespace)576     public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) {
577         checkFrozen();
578         return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
579     }
580 
581     /**
582      * Modifies this set to represent the set specified by the given pattern,
583      * optionally ignoring whitespace.
584      * See the class description for the syntax of the pattern language.
585      * @param pattern a string specifying what characters are in the set
586      * @param options a bitmask indicating which options to apply.
587      * Valid options are IGNORE_SPACE and CASE.
588      * @exception java.lang.IllegalArgumentException if the pattern
589      * contains a syntax error.
590      * @stable ICU 3.8
591      */
applyPattern(String pattern, int options)592     public UnicodeSet applyPattern(String pattern, int options) {
593         checkFrozen();
594         return applyPattern(pattern, null, null, options);
595     }
596 
597     /**
598      * Return true if the given position, in the given pattern, appears
599      * to be the start of a UnicodeSet pattern.
600      * @stable ICU 2.0
601      */
resemblesPattern(String pattern, int pos)602     public static boolean resemblesPattern(String pattern, int pos) {
603         return ((pos+1) < pattern.length() &&
604                 pattern.charAt(pos) == '[') ||
605                 resemblesPropertyPattern(pattern, pos);
606     }
607 
608     /**
609      * TODO: create Appendable version of UTF16.append(buf, c),
610      * maybe in new class Appendables?
611      * @throws ICUUncheckedIOException if the Appendable throws an IOException
612      */
appendCodePoint(Appendable app, int c)613     private static void appendCodePoint(Appendable app, int c) {
614         assert 0 <= c && c <= 0x10ffff;
615         try {
616             if (c <= 0xffff) {
617                 app.append((char) c);
618             } else {
619                 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c));
620             }
621         } catch (IOException e) {
622             throw new ICUUncheckedIOException(e);
623         }
624     }
625 
626     /**
627      * TODO: create class Appendables?
628      * @throws ICUUncheckedIOException if the Appendable throws an IOException
629      */
append(Appendable app, CharSequence s)630     private static void append(Appendable app, CharSequence s) {
631         try {
632             app.append(s);
633         } catch (IOException e) {
634             throw new ICUUncheckedIOException(e);
635         }
636     }
637 
638     /**
639      * Append the <code>toPattern()</code> representation of a
640      * string to the given <code>Appendable</code>.
641      */
_appendToPat(T buf, String s, boolean escapeUnprintable)642     private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) {
643         int cp;
644         for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
645             cp = s.codePointAt(i);
646             _appendToPat(buf, cp, escapeUnprintable);
647         }
648         return buf;
649     }
650 
651     /**
652      * Append the <code>toPattern()</code> representation of a
653      * character to the given <code>Appendable</code>.
654      */
_appendToPat(T buf, int c, boolean escapeUnprintable)655     private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) {
656         try {
657             if (escapeUnprintable && Utility.isUnprintable(c)) {
658                 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
659                 // unprintable
660                 if (Utility.escapeUnprintable(buf, c)) {
661                     return buf;
662                 }
663             }
664             // Okay to let ':' pass through
665             switch (c) {
666             case '[': // SET_OPEN:
667             case ']': // SET_CLOSE:
668             case '-': // HYPHEN:
669             case '^': // COMPLEMENT:
670             case '&': // INTERSECTION:
671             case '\\': //BACKSLASH:
672             case '{':
673             case '}':
674             case '$':
675             case ':':
676                 buf.append('\\');
677                 break;
678             default:
679                 // Escape whitespace
680                 if (PatternProps.isWhiteSpace(c)) {
681                     buf.append('\\');
682                 }
683                 break;
684             }
685             appendCodePoint(buf, c);
686             return buf;
687         } catch (IOException e) {
688             throw new ICUUncheckedIOException(e);
689         }
690     }
691 
692     /**
693      * Returns a string representation of this set.  If the result of
694      * calling this function is passed to a UnicodeSet constructor, it
695      * will produce another set that is equal to this one.
696      * @stable ICU 2.0
697      */
698     @Override
toPattern(boolean escapeUnprintable)699     public String toPattern(boolean escapeUnprintable) {
700         if (pat != null && !escapeUnprintable) {
701             return pat;
702         }
703         StringBuilder result = new StringBuilder();
704         return _toPattern(result, escapeUnprintable).toString();
705     }
706 
707     /**
708      * Append a string representation of this set to result.  This will be
709      * a cleaned version of the string passed to applyPattern(), if there
710      * is one.  Otherwise it will be generated.
711      */
_toPattern(T result, boolean escapeUnprintable)712     private <T extends Appendable> T _toPattern(T result,
713             boolean escapeUnprintable) {
714         if (pat == null) {
715             return appendNewPattern(result, escapeUnprintable, true);
716         }
717         try {
718             if (!escapeUnprintable) {
719                 result.append(pat);
720                 return result;
721             }
722             boolean oddNumberOfBackslashes = false;
723             for (int i=0; i<pat.length(); ) {
724                 int c = pat.codePointAt(i);
725                 i += Character.charCount(c);
726                 if (Utility.isUnprintable(c)) {
727                     // If the unprintable character is preceded by an odd
728                     // number of backslashes, then it has been escaped
729                     // and we omit the last backslash.
730                     Utility.escapeUnprintable(result, c);
731                     oddNumberOfBackslashes = false;
732                 } else if (!oddNumberOfBackslashes && c == '\\') {
733                     // Temporarily withhold an odd-numbered backslash.
734                     oddNumberOfBackslashes = true;
735                 } else {
736                     if (oddNumberOfBackslashes) {
737                         result.append('\\');
738                     }
739                     appendCodePoint(result, c);
740                     oddNumberOfBackslashes = false;
741                 }
742             }
743             if (oddNumberOfBackslashes) {
744                 result.append('\\');
745             }
746             return result;
747         } catch (IOException e) {
748             throw new ICUUncheckedIOException(e);
749         }
750     }
751 
752     /**
753      * Generate and append a string representation of this set to result.
754      * This does not use this.pat, the cleaned up copy of the string
755      * passed to applyPattern().
756      * @param result the buffer into which to generate the pattern
757      * @param escapeUnprintable escape unprintable characters if true
758      * @stable ICU 2.0
759      */
_generatePattern(StringBuffer result, boolean escapeUnprintable)760     public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) {
761         return _generatePattern(result, escapeUnprintable, true);
762     }
763 
764     /**
765      * Generate and append a string representation of this set to result.
766      * This does not use this.pat, the cleaned up copy of the string
767      * passed to applyPattern().
768      * @param includeStrings if false, doesn't include the strings.
769      * @stable ICU 3.8
770      */
_generatePattern(StringBuffer result, boolean escapeUnprintable, boolean includeStrings)771     public StringBuffer _generatePattern(StringBuffer result,
772             boolean escapeUnprintable, boolean includeStrings) {
773         return appendNewPattern(result, escapeUnprintable, includeStrings);
774     }
775 
appendNewPattern( T result, boolean escapeUnprintable, boolean includeStrings)776     private <T extends Appendable> T appendNewPattern(
777             T result, boolean escapeUnprintable, boolean includeStrings) {
778         try {
779             result.append('[');
780 
781             int count = getRangeCount();
782 
783             // If the set contains at least 2 intervals and includes both
784             // MIN_VALUE and MAX_VALUE, then the inverse representation will
785             // be more economical.
786             if (count > 1 &&
787                     getRangeStart(0) == MIN_VALUE &&
788                     getRangeEnd(count-1) == MAX_VALUE) {
789 
790                 // Emit the inverse
791                 result.append('^');
792 
793                 for (int i = 1; i < count; ++i) {
794                     int start = getRangeEnd(i-1)+1;
795                     int end = getRangeStart(i)-1;
796                     _appendToPat(result, start, escapeUnprintable);
797                     if (start != end) {
798                         if ((start+1) != end) {
799                             result.append('-');
800                         }
801                         _appendToPat(result, end, escapeUnprintable);
802                     }
803                 }
804             }
805 
806             // Default; emit the ranges as pairs
807             else {
808                 for (int i = 0; i < count; ++i) {
809                     int start = getRangeStart(i);
810                     int end = getRangeEnd(i);
811                     _appendToPat(result, start, escapeUnprintable);
812                     if (start != end) {
813                         if ((start+1) != end) {
814                             result.append('-');
815                         }
816                         _appendToPat(result, end, escapeUnprintable);
817                     }
818                 }
819             }
820 
821             if (includeStrings && strings.size() > 0) {
822                 for (String s : strings) {
823                     result.append('{');
824                     _appendToPat(result, s, escapeUnprintable);
825                     result.append('}');
826                 }
827             }
828             result.append(']');
829             return result;
830         } catch (IOException e) {
831             throw new ICUUncheckedIOException(e);
832         }
833     }
834 
835     /**
836      * Returns the number of elements in this set (its cardinality)
837      * Note than the elements of a set may include both individual
838      * codepoints and strings.
839      *
840      * @return the number of elements in this set (its cardinality).
841      * @stable ICU 2.0
842      */
size()843     public int size() {
844         int n = 0;
845         int count = getRangeCount();
846         for (int i = 0; i < count; ++i) {
847             n += getRangeEnd(i) - getRangeStart(i) + 1;
848         }
849         return n + strings.size();
850     }
851 
852     /**
853      * Returns <tt>true</tt> if this set contains no elements.
854      *
855      * @return <tt>true</tt> if this set contains no elements.
856      * @stable ICU 2.0
857      */
isEmpty()858     public boolean isEmpty() {
859         return len == 1 && strings.size() == 0;
860     }
861 
862     /**
863      * Implementation of UnicodeMatcher API.  Returns <tt>true</tt> if
864      * this set contains any character whose low byte is the given
865      * value.  This is used by <tt>RuleBasedTransliterator</tt> for
866      * indexing.
867      * @stable ICU 2.0
868      */
869     @Override
matchesIndexValue(int v)870     public boolean matchesIndexValue(int v) {
871         /* The index value v, in the range [0,255], is contained in this set if
872          * it is contained in any pair of this set.  Pairs either have the high
873          * bytes equal, or unequal.  If the high bytes are equal, then we have
874          * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
875          * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
876          * Then v is contained if xx <= v || v <= yy.  (This is identical to the
877          * time zone month containment logic.)
878          */
879         for (int i=0; i<getRangeCount(); ++i) {
880             int low = getRangeStart(i);
881             int high = getRangeEnd(i);
882             if ((low & ~0xFF) == (high & ~0xFF)) {
883                 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
884                     return true;
885                 }
886             } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
887                 return true;
888             }
889         }
890         if (strings.size() != 0) {
891             for (String s : strings) {
892                 //if (s.length() == 0) {
893                 //    // Empty strings match everything
894                 //    return true;
895                 //}
896                 // assert(s.length() != 0); // We enforce this elsewhere
897                 int c = UTF16.charAt(s, 0);
898                 if ((c & 0xFF) == v) {
899                     return true;
900                 }
901             }
902         }
903         return false;
904     }
905 
906     /**
907      * Implementation of UnicodeMatcher.matches().  Always matches the
908      * longest possible multichar string.
909      * @stable ICU 2.0
910      */
911     @Override
matches(Replaceable text, int[] offset, int limit, boolean incremental)912     public int matches(Replaceable text,
913             int[] offset,
914             int limit,
915             boolean incremental) {
916 
917         if (offset[0] == limit) {
918             // Strings, if any, have length != 0, so we don't worry
919             // about them here.  If we ever allow zero-length strings
920             // we much check for them here.
921             if (contains(UnicodeMatcher.ETHER)) {
922                 return incremental ? U_PARTIAL_MATCH : U_MATCH;
923             } else {
924                 return U_MISMATCH;
925             }
926         } else {
927             if (strings.size() != 0) { // try strings first
928 
929                 // might separate forward and backward loops later
930                 // for now they are combined
931 
932                 // TODO Improve efficiency of this, at least in the forward
933                 // direction, if not in both.  In the forward direction we
934                 // can assume the strings are sorted.
935 
936                 boolean forward = offset[0] < limit;
937 
938                 // firstChar is the leftmost char to match in the
939                 // forward direction or the rightmost char to match in
940                 // the reverse direction.
941                 char firstChar = text.charAt(offset[0]);
942 
943                 // If there are multiple strings that can match we
944                 // return the longest match.
945                 int highWaterLength = 0;
946 
947                 for (String trial : strings) {
948                     //if (trial.length() == 0) {
949                     //    return U_MATCH; // null-string always matches
950                     //}
951                     // assert(trial.length() != 0); // We ensure this elsewhere
952 
953                     char c = trial.charAt(forward ? 0 : trial.length() - 1);
954 
955                     // Strings are sorted, so we can optimize in the
956                     // forward direction.
957                     if (forward && c > firstChar) break;
958                     if (c != firstChar) continue;
959 
960                     int length = matchRest(text, offset[0], limit, trial);
961 
962                     if (incremental) {
963                         int maxLen = forward ? limit-offset[0] : offset[0]-limit;
964                         if (length == maxLen) {
965                             // We have successfully matched but only up to limit.
966                             return U_PARTIAL_MATCH;
967                         }
968                     }
969 
970                     if (length == trial.length()) {
971                         // We have successfully matched the whole string.
972                         if (length > highWaterLength) {
973                             highWaterLength = length;
974                         }
975                         // In the forward direction we know strings
976                         // are sorted so we can bail early.
977                         if (forward && length < highWaterLength) {
978                             break;
979                         }
980                         continue;
981                     }
982                 }
983 
984                 // We've checked all strings without a partial match.
985                 // If we have full matches, return the longest one.
986                 if (highWaterLength != 0) {
987                     offset[0] += forward ? highWaterLength : -highWaterLength;
988                     return U_MATCH;
989                 }
990             }
991             return super.matches(text, offset, limit, incremental);
992         }
993     }
994 
995     /**
996      * Returns the longest match for s in text at the given position.
997      * If limit > start then match forward from start+1 to limit
998      * matching all characters except s.charAt(0).  If limit < start,
999      * go backward starting from start-1 matching all characters
1000      * except s.charAt(s.length()-1).  This method assumes that the
1001      * first character, text.charAt(start), matches s, so it does not
1002      * check it.
1003      * @param text the text to match
1004      * @param start the first character to match.  In the forward
1005      * direction, text.charAt(start) is matched against s.charAt(0).
1006      * In the reverse direction, it is matched against
1007      * s.charAt(s.length()-1).
1008      * @param limit the limit offset for matching, either last+1 in
1009      * the forward direction, or last-1 in the reverse direction,
1010      * where last is the index of the last character to match.
1011      * @return If part of s matches up to the limit, return |limit -
1012      * start|.  If all of s matches before reaching the limit, return
1013      * s.length().  If there is a mismatch between s and text, return
1014      * 0
1015      */
matchRest(Replaceable text, int start, int limit, String s)1016     private static int matchRest (Replaceable text, int start, int limit, String s) {
1017         int maxLen;
1018         int slen = s.length();
1019         if (start < limit) {
1020             maxLen = limit - start;
1021             if (maxLen > slen) maxLen = slen;
1022             for (int i = 1; i < maxLen; ++i) {
1023                 if (text.charAt(start + i) != s.charAt(i)) return 0;
1024             }
1025         } else {
1026             maxLen = start - limit;
1027             if (maxLen > slen) maxLen = slen;
1028             --slen; // <=> slen = s.length() - 1;
1029             for (int i = 1; i < maxLen; ++i) {
1030                 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
1031             }
1032         }
1033         return maxLen;
1034     }
1035 
1036     /**
1037      * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1.
1038      * @internal
1039      * @deprecated This API is ICU internal only.
1040      */
1041     @Deprecated
matchesAt(CharSequence text, int offset)1042     public int matchesAt(CharSequence text, int offset) {
1043         int lastLen = -1;
1044         strings:
1045             if (strings.size() != 0) {
1046                 char firstChar = text.charAt(offset);
1047                 String trial = null;
1048                 // find the first string starting with firstChar
1049                 Iterator<String> it = strings.iterator();
1050                 while (it.hasNext()) {
1051                     trial = it.next();
1052                     char firstStringChar = trial.charAt(0);
1053                     if (firstStringChar < firstChar) continue;
1054                     if (firstStringChar > firstChar) break strings;
1055                 }
1056 
1057                 // now keep checking string until we get the longest one
1058                 for (;;) {
1059                     int tempLen = matchesAt(text, offset, trial);
1060                     if (lastLen > tempLen) break strings;
1061                     lastLen = tempLen;
1062                     if (!it.hasNext()) break;
1063                     trial = it.next();
1064                 }
1065             }
1066 
1067         if (lastLen < 2) {
1068             int cp = UTF16.charAt(text, offset);
1069             if (contains(cp)) lastLen = UTF16.getCharCount(cp);
1070         }
1071 
1072         return offset+lastLen;
1073     }
1074 
1075     /**
1076      * Does one string contain another, starting at a specific offset?
1077      * @param text text to match
1078      * @param offsetInText offset within that text
1079      * @param substring substring to match at offset in text
1080      * @return -1 if match fails, otherwise other.length()
1081      */
1082     // Note: This method was moved from CollectionUtilities
matchesAt(CharSequence text, int offsetInText, CharSequence substring)1083     private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) {
1084         int len = substring.length();
1085         int textLength = text.length();
1086         if (textLength + offsetInText > len) {
1087             return -1;
1088         }
1089         int i = 0;
1090         for (int j = offsetInText; i < len; ++i, ++j) {
1091             char pc = substring.charAt(i);
1092             char tc = text.charAt(j);
1093             if (pc != tc) return -1;
1094         }
1095         return i;
1096     }
1097 
1098     /**
1099      * Implementation of UnicodeMatcher API.  Union the set of all
1100      * characters that may be matched by this object into the given
1101      * set.
1102      * @param toUnionTo the set into which to union the source characters
1103      * @stable ICU 2.2
1104      */
1105     @Override
addMatchSetTo(UnicodeSet toUnionTo)1106     public void addMatchSetTo(UnicodeSet toUnionTo) {
1107         toUnionTo.addAll(this);
1108     }
1109 
1110     /**
1111      * Returns the index of the given character within this set, where
1112      * the set is ordered by ascending code point.  If the character
1113      * is not in this set, return -1.  The inverse of this method is
1114      * <code>charAt()</code>.
1115      * @return an index from 0..size()-1, or -1
1116      * @stable ICU 2.0
1117      */
indexOf(int c)1118     public int indexOf(int c) {
1119         if (c < MIN_VALUE || c > MAX_VALUE) {
1120             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
1121         }
1122         int i = 0;
1123         int n = 0;
1124         for (;;) {
1125             int start = list[i++];
1126             if (c < start) {
1127                 return -1;
1128             }
1129             int limit = list[i++];
1130             if (c < limit) {
1131                 return n + c - start;
1132             }
1133             n += limit - start;
1134         }
1135     }
1136 
1137     /**
1138      * Returns the character at the given index within this set, where
1139      * the set is ordered by ascending code point.  If the index is
1140      * out of range, return -1.  The inverse of this method is
1141      * <code>indexOf()</code>.
1142      * @param index an index from 0..size()-1
1143      * @return the character at the given index, or -1.
1144      * @stable ICU 2.0
1145      */
charAt(int index)1146     public int charAt(int index) {
1147         if (index >= 0) {
1148             // len2 is the largest even integer <= len, that is, it is len
1149             // for even values and len-1 for odd values.  With odd values
1150             // the last entry is UNICODESET_HIGH.
1151             int len2 = len & ~1;
1152             for (int i=0; i < len2;) {
1153                 int start = list[i++];
1154                 int count = list[i++] - start;
1155                 if (index < count) {
1156                     return start + index;
1157                 }
1158                 index -= count;
1159             }
1160         }
1161         return -1;
1162     }
1163 
1164     /**
1165      * Adds the specified range to this set if it is not already
1166      * present.  If this set already contains the specified range,
1167      * the call leaves this set unchanged.  If <code>end &gt; start</code>
1168      * then an empty range is added, leaving the set unchanged.
1169      *
1170      * @param start first character, inclusive, of range to be added
1171      * to this set.
1172      * @param end last character, inclusive, of range to be added
1173      * to this set.
1174      * @stable ICU 2.0
1175      */
add(int start, int end)1176     public UnicodeSet add(int start, int end) {
1177         checkFrozen();
1178         return add_unchecked(start, end);
1179     }
1180 
1181     /**
1182      * Adds all characters in range (uses preferred naming convention).
1183      * @param start The index of where to start on adding all characters.
1184      * @param end The index of where to end on adding all characters.
1185      * @return a reference to this object
1186      * @stable ICU 4.4
1187      */
addAll(int start, int end)1188     public UnicodeSet addAll(int start, int end) {
1189         checkFrozen();
1190         return add_unchecked(start, end);
1191     }
1192 
1193     // for internal use, after checkFrozen has been called
add_unchecked(int start, int end)1194     private UnicodeSet add_unchecked(int start, int end) {
1195         if (start < MIN_VALUE || start > MAX_VALUE) {
1196             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1197         }
1198         if (end < MIN_VALUE || end > MAX_VALUE) {
1199             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1200         }
1201         if (start < end) {
1202             add(range(start, end), 2, 0);
1203         } else if (start == end) {
1204             add(start);
1205         }
1206         return this;
1207     }
1208 
1209     //    /**
1210     //     * Format out the inversion list as a string, for debugging.  Uncomment when
1211     //     * needed.
1212     //     */
1213     //    public final String dump() {
1214     //        StringBuffer buf = new StringBuffer("[");
1215     //        for (int i=0; i<len; ++i) {
1216     //            if (i != 0) buf.append(", ");
1217     //            int c = list[i];
1218     //            //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
1219     //            //    buf.append((char) c);
1220     //            //} else {
1221     //                buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
1222     //            //}
1223     //        }
1224     //        buf.append("]");
1225     //        return buf.toString();
1226     //    }
1227 
1228     /**
1229      * Adds the specified character to this set if it is not already
1230      * present.  If this set already contains the specified character,
1231      * the call leaves this set unchanged.
1232      * @stable ICU 2.0
1233      */
add(int c)1234     public final UnicodeSet add(int c) {
1235         checkFrozen();
1236         return add_unchecked(c);
1237     }
1238 
1239     // for internal use only, after checkFrozen has been called
add_unchecked(int c)1240     private final UnicodeSet add_unchecked(int c) {
1241         if (c < MIN_VALUE || c > MAX_VALUE) {
1242             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
1243         }
1244 
1245         // find smallest i such that c < list[i]
1246         // if odd, then it is IN the set
1247         // if even, then it is OUT of the set
1248         int i = findCodePoint(c);
1249 
1250         // already in set?
1251         if ((i & 1) != 0) return this;
1252 
1253         // HIGH is 0x110000
1254         // assert(list[len-1] == HIGH);
1255 
1256         // empty = [HIGH]
1257         // [start_0, limit_0, start_1, limit_1, HIGH]
1258 
1259         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
1260         //                             ^
1261         //                             list[i]
1262 
1263         // i == 0 means c is before the first range
1264         // TODO: Is the "list[i]-1" a typo? Even if you pass MAX_VALUE into
1265         //      add_unchecked, the maximum value that "c" will be compared to
1266         //      is "MAX_VALUE-1" meaning that "if (c == MAX_VALUE)" will
1267         //      never be reached according to this logic.
1268         if (c == list[i]-1) {
1269             // c is before start of next range
1270             list[i] = c;
1271             // if we touched the HIGH mark, then add a new one
1272             if (c == MAX_VALUE) {
1273                 ensureCapacity(len+1);
1274                 list[len++] = HIGH;
1275             }
1276             if (i > 0 && c == list[i-1]) {
1277                 // collapse adjacent ranges
1278 
1279                 // [..., start_k-1, c, c, limit_k, ..., HIGH]
1280                 //                     ^
1281                 //                     list[i]
1282                 System.arraycopy(list, i+1, list, i-1, len-i-1);
1283                 len -= 2;
1284             }
1285         }
1286 
1287         else if (i > 0 && c == list[i-1]) {
1288             // c is after end of prior range
1289             list[i-1]++;
1290             // no need to chcek for collapse here
1291         }
1292 
1293         else {
1294             // At this point we know the new char is not adjacent to
1295             // any existing ranges, and it is not 10FFFF.
1296 
1297 
1298             // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
1299             //                             ^
1300             //                             list[i]
1301 
1302             // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
1303             //                             ^
1304             //                             list[i]
1305 
1306             // Don't use ensureCapacity() to save on copying.
1307             // NOTE: This has no measurable impact on performance,
1308             // but it might help in some usage patterns.
1309             if (len+2 > list.length) {
1310                 int[] temp = new int[len + 2 + GROW_EXTRA];
1311                 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
1312                 System.arraycopy(list, i, temp, i+2, len-i);
1313                 list = temp;
1314             } else {
1315                 System.arraycopy(list, i, list, i+2, len-i);
1316             }
1317 
1318             list[i] = c;
1319             list[i+1] = c+1;
1320             len += 2;
1321         }
1322 
1323         pat = null;
1324         return this;
1325     }
1326 
1327     /**
1328      * Adds the specified multicharacter to this set if it is not already
1329      * present.  If this set already contains the multicharacter,
1330      * the call leaves this set unchanged.
1331      * Thus "ch" =&gt; {"ch"}
1332      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1333      * @param s the source string
1334      * @return this object, for chaining
1335      * @stable ICU 2.0
1336      */
add(CharSequence s)1337     public final UnicodeSet add(CharSequence s) {
1338         checkFrozen();
1339         int cp = getSingleCP(s);
1340         if (cp < 0) {
1341             strings.add(s.toString());
1342             pat = null;
1343         } else {
1344             add_unchecked(cp, cp);
1345         }
1346         return this;
1347     }
1348 
1349     /**
1350      * Utility for getting code point from single code point CharSequence.
1351      * See the public UTF16.getSingleCodePoint()
1352      * @return a code point IF the string consists of a single one.
1353      * otherwise returns -1.
1354      * @param s to test
1355      */
getSingleCP(CharSequence s)1356     private static int getSingleCP(CharSequence s) {
1357         if (s.length() < 1) {
1358             throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1359         }
1360         if (s.length() > 2) return -1;
1361         if (s.length() == 1) return s.charAt(0);
1362 
1363         // at this point, len = 2
1364         int cp = UTF16.charAt(s, 0);
1365         if (cp > 0xFFFF) { // is surrogate pair
1366             return cp;
1367         }
1368         return -1;
1369     }
1370 
1371     /**
1372      * Adds each of the characters in this string to the set. Thus "ch" =&gt; {"c", "h"}
1373      * If this set already any particular character, it has no effect on that character.
1374      * @param s the source string
1375      * @return this object, for chaining
1376      * @stable ICU 2.0
1377      */
addAll(CharSequence s)1378     public final UnicodeSet addAll(CharSequence s) {
1379         checkFrozen();
1380         int cp;
1381         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1382             cp = UTF16.charAt(s, i);
1383             add_unchecked(cp, cp);
1384         }
1385         return this;
1386     }
1387 
1388     /**
1389      * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1390      * If this set already any particular character, it has no effect on that character.
1391      * @param s the source string
1392      * @return this object, for chaining
1393      * @stable ICU 2.0
1394      */
retainAll(CharSequence s)1395     public final UnicodeSet retainAll(CharSequence s) {
1396         return retainAll(fromAll(s));
1397     }
1398 
1399     /**
1400      * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1401      * If this set already any particular character, it has no effect on that character.
1402      * @param s the source string
1403      * @return this object, for chaining
1404      * @stable ICU 2.0
1405      */
complementAll(CharSequence s)1406     public final UnicodeSet complementAll(CharSequence s) {
1407         return complementAll(fromAll(s));
1408     }
1409 
1410     /**
1411      * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1412      * If this set already any particular character, it has no effect on that character.
1413      * @param s the source string
1414      * @return this object, for chaining
1415      * @stable ICU 2.0
1416      */
removeAll(CharSequence s)1417     public final UnicodeSet removeAll(CharSequence s) {
1418         return removeAll(fromAll(s));
1419     }
1420 
1421     /**
1422      * Remove all strings from this UnicodeSet
1423      * @return this object, for chaining
1424      * @stable ICU 4.2
1425      */
removeAllStrings()1426     public final UnicodeSet removeAllStrings() {
1427         checkFrozen();
1428         if (strings.size() != 0) {
1429             strings.clear();
1430             pat = null;
1431         }
1432         return this;
1433     }
1434 
1435     /**
1436      * Makes a set from a multicharacter string. Thus "ch" =&gt; {"ch"}
1437      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1438      * @param s the source string
1439      * @return a newly created set containing the given string
1440      * @stable ICU 2.0
1441      */
from(CharSequence s)1442     public static UnicodeSet from(CharSequence s) {
1443         return new UnicodeSet().add(s);
1444     }
1445 
1446 
1447     /**
1448      * Makes a set from each of the characters in the string. Thus "ch" =&gt; {"c", "h"}
1449      * @param s the source string
1450      * @return a newly created set containing the given characters
1451      * @stable ICU 2.0
1452      */
fromAll(CharSequence s)1453     public static UnicodeSet fromAll(CharSequence s) {
1454         return new UnicodeSet().addAll(s);
1455     }
1456 
1457 
1458     /**
1459      * Retain only the elements in this set that are contained in the
1460      * specified range.  If <code>end &gt; start</code> then an empty range is
1461      * retained, leaving the set empty.
1462      *
1463      * @param start first character, inclusive, of range to be retained
1464      * to this set.
1465      * @param end last character, inclusive, of range to be retained
1466      * to this set.
1467      * @stable ICU 2.0
1468      */
retain(int start, int end)1469     public UnicodeSet retain(int start, int end) {
1470         checkFrozen();
1471         if (start < MIN_VALUE || start > MAX_VALUE) {
1472             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1473         }
1474         if (end < MIN_VALUE || end > MAX_VALUE) {
1475             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1476         }
1477         if (start <= end) {
1478             retain(range(start, end), 2, 0);
1479         } else {
1480             clear();
1481         }
1482         return this;
1483     }
1484 
1485     /**
1486      * Retain the specified character from this set if it is present.
1487      * Upon return this set will be empty if it did not contain c, or
1488      * will only contain c if it did contain c.
1489      * @param c the character to be retained
1490      * @return this object, for chaining
1491      * @stable ICU 2.0
1492      */
retain(int c)1493     public final UnicodeSet retain(int c) {
1494         return retain(c, c);
1495     }
1496 
1497     /**
1498      * Retain the specified string in this set if it is present.
1499      * Upon return this set will be empty if it did not contain s, or
1500      * will only contain s if it did contain s.
1501      * @param cs the string to be retained
1502      * @return this object, for chaining
1503      * @stable ICU 2.0
1504      */
retain(CharSequence cs)1505     public final UnicodeSet retain(CharSequence cs) {
1506 
1507         int cp = getSingleCP(cs);
1508         if (cp < 0) {
1509             String s = cs.toString();
1510             boolean isIn = strings.contains(s);
1511             if (isIn && size() == 1) {
1512                 return this;
1513             }
1514             clear();
1515             strings.add(s);
1516             pat = null;
1517         } else {
1518             retain(cp, cp);
1519         }
1520         return this;
1521     }
1522 
1523     /**
1524      * Removes the specified range from this set if it is present.
1525      * The set will not contain the specified range once the call
1526      * returns.  If <code>end &gt; start</code> then an empty range is
1527      * removed, leaving the set unchanged.
1528      *
1529      * @param start first character, inclusive, of range to be removed
1530      * from this set.
1531      * @param end last character, inclusive, of range to be removed
1532      * from this set.
1533      * @stable ICU 2.0
1534      */
remove(int start, int end)1535     public UnicodeSet remove(int start, int end) {
1536         checkFrozen();
1537         if (start < MIN_VALUE || start > MAX_VALUE) {
1538             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1539         }
1540         if (end < MIN_VALUE || end > MAX_VALUE) {
1541             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1542         }
1543         if (start <= end) {
1544             retain(range(start, end), 2, 2);
1545         }
1546         return this;
1547     }
1548 
1549     /**
1550      * Removes the specified character from this set if it is present.
1551      * The set will not contain the specified character once the call
1552      * returns.
1553      * @param c the character to be removed
1554      * @return this object, for chaining
1555      * @stable ICU 2.0
1556      */
remove(int c)1557     public final UnicodeSet remove(int c) {
1558         return remove(c, c);
1559     }
1560 
1561     /**
1562      * Removes the specified string from this set if it is present.
1563      * The set will not contain the specified string once the call
1564      * returns.
1565      * @param s the string to be removed
1566      * @return this object, for chaining
1567      * @stable ICU 2.0
1568      */
remove(CharSequence s)1569     public final UnicodeSet remove(CharSequence s) {
1570         int cp = getSingleCP(s);
1571         if (cp < 0) {
1572             strings.remove(s.toString());
1573             pat = null;
1574         } else {
1575             remove(cp, cp);
1576         }
1577         return this;
1578     }
1579 
1580     /**
1581      * Complements the specified range in this set.  Any character in
1582      * the range will be removed if it is in this set, or will be
1583      * added if it is not in this set.  If <code>end &gt; start</code>
1584      * then an empty range is complemented, leaving the set unchanged.
1585      *
1586      * @param start first character, inclusive, of range to be removed
1587      * from this set.
1588      * @param end last character, inclusive, of range to be removed
1589      * from this set.
1590      * @stable ICU 2.0
1591      */
complement(int start, int end)1592     public UnicodeSet complement(int start, int end) {
1593         checkFrozen();
1594         if (start < MIN_VALUE || start > MAX_VALUE) {
1595             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1596         }
1597         if (end < MIN_VALUE || end > MAX_VALUE) {
1598             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1599         }
1600         if (start <= end) {
1601             xor(range(start, end), 2, 0);
1602         }
1603         pat = null;
1604         return this;
1605     }
1606 
1607     /**
1608      * Complements the specified character in this set.  The character
1609      * will be removed if it is in this set, or will be added if it is
1610      * not in this set.
1611      * @stable ICU 2.0
1612      */
complement(int c)1613     public final UnicodeSet complement(int c) {
1614         return complement(c, c);
1615     }
1616 
1617     /**
1618      * This is equivalent to
1619      * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1620      * @stable ICU 2.0
1621      */
complement()1622     public UnicodeSet complement() {
1623         checkFrozen();
1624         if (list[0] == LOW) {
1625             System.arraycopy(list, 1, list, 0, len-1);
1626             --len;
1627         } else {
1628             ensureCapacity(len+1);
1629             System.arraycopy(list, 0, list, 1, len);
1630             list[0] = LOW;
1631             ++len;
1632         }
1633         pat = null;
1634         return this;
1635     }
1636 
1637     /**
1638      * Complement the specified string in this set.
1639      * The set will not contain the specified string once the call
1640      * returns.
1641      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1642      * @param s the string to complement
1643      * @return this object, for chaining
1644      * @stable ICU 2.0
1645      */
complement(CharSequence s)1646     public final UnicodeSet complement(CharSequence s) {
1647         checkFrozen();
1648         int cp = getSingleCP(s);
1649         if (cp < 0) {
1650             String s2 = s.toString();
1651             if (strings.contains(s2)) {
1652                 strings.remove(s2);
1653             } else {
1654                 strings.add(s2);
1655             }
1656             pat = null;
1657         } else {
1658             complement(cp, cp);
1659         }
1660         return this;
1661     }
1662 
1663     /**
1664      * Returns true if this set contains the given character.
1665      * @param c character to be checked for containment
1666      * @return true if the test condition is met
1667      * @stable ICU 2.0
1668      */
1669     @Override
contains(int c)1670     public boolean contains(int c) {
1671         if (c < MIN_VALUE || c > MAX_VALUE) {
1672             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
1673         }
1674         if (bmpSet != null) {
1675             return bmpSet.contains(c);
1676         }
1677         if (stringSpan != null) {
1678             return stringSpan.contains(c);
1679         }
1680 
1681         /*
1682         // Set i to the index of the start item greater than ch
1683         // We know we will terminate without length test!
1684         int i = -1;
1685         while (true) {
1686             if (c < list[++i]) break;
1687         }
1688          */
1689 
1690         int i = findCodePoint(c);
1691 
1692         return ((i & 1) != 0); // return true if odd
1693     }
1694 
1695     /**
1696      * Returns the smallest value i such that c < list[i].  Caller
1697      * must ensure that c is a legal value or this method will enter
1698      * an infinite loop.  This method performs a binary search.
1699      * @param c a character in the range MIN_VALUE..MAX_VALUE
1700      * inclusive
1701      * @return the smallest integer i in the range 0..len-1,
1702      * inclusive, such that c < list[i]
1703      */
findCodePoint(int c)1704     private final int findCodePoint(int c) {
1705         /* Examples:
1706                                            findCodePoint(c)
1707            set              list[]         c=0 1 3 4 7 8
1708            ===              ==============   ===========
1709            []               [110000]         0 0 0 0 0 0
1710            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
1711            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
1712            [:all:]          [0, 110000]      1 1 1 1 1 1
1713          */
1714 
1715         // Return the smallest i such that c < list[i].  Assume
1716         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
1717         if (c < list[0]) return 0;
1718         // High runner test.  c is often after the last range, so an
1719         // initial check for this condition pays off.
1720         if (len >= 2 && c >= list[len-2]) return len-1;
1721         int lo = 0;
1722         int hi = len - 1;
1723         // invariant: c >= list[lo]
1724         // invariant: c < list[hi]
1725         for (;;) {
1726             int i = (lo + hi) >>> 1;
1727         if (i == lo) return hi;
1728         if (c < list[i]) {
1729             hi = i;
1730         } else {
1731             lo = i;
1732         }
1733         }
1734     }
1735 
1736     //    //----------------------------------------------------------------
1737     //    // Unrolled binary search
1738     //    //----------------------------------------------------------------
1739     //
1740     //    private int validLen = -1; // validated value of len
1741     //    private int topOfLow;
1742     //    private int topOfHigh;
1743     //    private int power;
1744     //    private int deltaStart;
1745     //
1746     //    private void validate() {
1747     //        if (len <= 1) {
1748     //            throw new IllegalArgumentException("list.len==" + len + "; must be >1");
1749     //        }
1750     //
1751     //        // find greatest power of 2 less than or equal to len
1752     //        for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
1753     //
1754     //        // assert(exp2[power] <= len);
1755     //
1756     //        // determine the starting points
1757     //        topOfLow = exp2[power] - 1;
1758     //        topOfHigh = len - 1;
1759     //        deltaStart = exp2[power-1];
1760     //        validLen = len;
1761     //    }
1762     //
1763     //    private static final int exp2[] = {
1764     //        0x1, 0x2, 0x4, 0x8,
1765     //        0x10, 0x20, 0x40, 0x80,
1766     //        0x100, 0x200, 0x400, 0x800,
1767     //        0x1000, 0x2000, 0x4000, 0x8000,
1768     //        0x10000, 0x20000, 0x40000, 0x80000,
1769     //        0x100000, 0x200000, 0x400000, 0x800000,
1770     //        0x1000000, 0x2000000, 0x4000000, 0x8000000,
1771     //        0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
1772     //    };
1773     //
1774     //    /**
1775     //     * Unrolled lowest index GT.
1776     //     */
1777     //    private final int leastIndexGT(int searchValue) {
1778     //
1779     //        if (len != validLen) {
1780     //            if (len == 1) return 0;
1781     //            validate();
1782     //        }
1783     //        int temp;
1784     //
1785     //        // set up initial range to search. Each subrange is a power of two in length
1786     //        int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
1787     //
    //        // Completely unrolled binary search, following "Programming Pearls"
1789     //        // Each case deliberately falls through to the next
1790     //        // Logically, list[-1] < all_search_values && list[count] > all_search_values
1791     //        // although the values -1 and count are never actually touched.
1792     //
1793     //        // The bounds at each point are low & high,
1794     //        // where low == high - delta*2
1795     //        // so high - delta is the midpoint
1796     //
1797     //        // The invariant AFTER each line is that list[low] < searchValue <= list[high]
1798     //
1799     //        switch (power) {
1800     //        //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
1801     //        case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
1802     //        case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
1803     //
1804     //        case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
1805     //        case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
1806     //        case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
1807     //        case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
1808     //
1809     //        case 24: if (searchValue < list[temp = high-  0x800000]) high = temp;
1810     //        case 23: if (searchValue < list[temp = high-  0x400000]) high = temp;
1811     //        case 22: if (searchValue < list[temp = high-  0x200000]) high = temp;
1812     //        case 21: if (searchValue < list[temp = high-  0x100000]) high = temp;
1813     //
1814     //        case 20: if (searchValue < list[temp = high-   0x80000]) high = temp;
1815     //        case 19: if (searchValue < list[temp = high-   0x40000]) high = temp;
1816     //        case 18: if (searchValue < list[temp = high-   0x20000]) high = temp;
1817     //        case 17: if (searchValue < list[temp = high-   0x10000]) high = temp;
1818     //
1819     //        case 16: if (searchValue < list[temp = high-    0x8000]) high = temp;
1820     //        case 15: if (searchValue < list[temp = high-    0x4000]) high = temp;
1821     //        case 14: if (searchValue < list[temp = high-    0x2000]) high = temp;
1822     //        case 13: if (searchValue < list[temp = high-    0x1000]) high = temp;
1823     //
1824     //        case 12: if (searchValue < list[temp = high-     0x800]) high = temp;
1825     //        case 11: if (searchValue < list[temp = high-     0x400]) high = temp;
1826     //        case 10: if (searchValue < list[temp = high-     0x200]) high = temp;
1827     //        case  9: if (searchValue < list[temp = high-     0x100]) high = temp;
1828     //
1829     //        case  8: if (searchValue < list[temp = high-      0x80]) high = temp;
1830     //        case  7: if (searchValue < list[temp = high-      0x40]) high = temp;
1831     //        case  6: if (searchValue < list[temp = high-      0x20]) high = temp;
1832     //        case  5: if (searchValue < list[temp = high-      0x10]) high = temp;
1833     //
1834     //        case  4: if (searchValue < list[temp = high-       0x8]) high = temp;
1835     //        case  3: if (searchValue < list[temp = high-       0x4]) high = temp;
1836     //        case  2: if (searchValue < list[temp = high-       0x2]) high = temp;
1837     //        case  1: if (searchValue < list[temp = high-       0x1]) high = temp;
1838     //        }
1839     //
1840     //        return high;
1841     //    }
1842     //
1843     //    // For debugging only
1844     //    public int len() {
1845     //        return len;
1846     //    }
1847     //
1848     //    //----------------------------------------------------------------
1849     //    //----------------------------------------------------------------
1850 
1851     /**
1852      * Returns true if this set contains every character
1853      * of the given range.
1854      * @param start first character, inclusive, of the range
1855      * @param end last character, inclusive, of the range
1856      * @return true if the test condition is met
1857      * @stable ICU 2.0
1858      */
contains(int start, int end)1859     public boolean contains(int start, int end) {
1860         if (start < MIN_VALUE || start > MAX_VALUE) {
1861             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1862         }
1863         if (end < MIN_VALUE || end > MAX_VALUE) {
1864             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1865         }
1866         //int i = -1;
1867         //while (true) {
1868         //    if (start < list[++i]) break;
1869         //}
1870         int i = findCodePoint(start);
1871         return ((i & 1) != 0 && end < list[i]);
1872     }
1873 
1874     /**
1875      * Returns <tt>true</tt> if this set contains the given
1876      * multicharacter string.
1877      * @param s string to be checked for containment
1878      * @return <tt>true</tt> if this set contains the specified string
1879      * @stable ICU 2.0
1880      */
contains(CharSequence s)1881     public final boolean contains(CharSequence s) {
1882 
1883         int cp = getSingleCP(s);
1884         if (cp < 0) {
1885             return strings.contains(s.toString());
1886         } else {
1887             return contains(cp);
1888         }
1889     }
1890 
1891     /**
1892      * Returns true if this set contains all the characters and strings
1893      * of the given set.
1894      * @param b set to be checked for containment
1895      * @return true if the test condition is met
1896      * @stable ICU 2.0
1897      */
containsAll(UnicodeSet b)1898     public boolean containsAll(UnicodeSet b) {
1899         // The specified set is a subset if all of its pairs are contained in
1900         // this set. This implementation accesses the lists directly for speed.
1901         // TODO: this could be faster if size() were cached. But that would affect building speed
1902         // so it needs investigation.
1903         int[] listB = b.list;
1904         boolean needA = true;
1905         boolean needB = true;
1906         int aPtr = 0;
1907         int bPtr = 0;
1908         int aLen = len - 1;
1909         int bLen = b.len - 1;
1910         int startA = 0, startB = 0, limitA = 0, limitB = 0;
1911         while (true) {
1912             // double iterations are such a pain...
1913             if (needA) {
1914                 if (aPtr >= aLen) {
1915                     // ran out of A. If B is also exhausted, then break;
1916                     if (needB && bPtr >= bLen) {
1917                         break;
1918                     }
1919                     return false;
1920                 }
1921                 startA = list[aPtr++];
1922                 limitA = list[aPtr++];
1923             }
1924             if (needB) {
1925                 if (bPtr >= bLen) {
1926                     // ran out of B. Since we got this far, we have an A and we are ok so far
1927                     break;
1928                 }
1929                 startB = listB[bPtr++];
1930                 limitB = listB[bPtr++];
1931             }
1932             // if B doesn't overlap and is greater than A, get new A
1933             if (startB >= limitA) {
1934                 needA = true;
1935                 needB = false;
1936                 continue;
1937             }
1938             // if B is wholy contained in A, then get a new B
1939             if (startB >= startA && limitB <= limitA) {
1940                 needA = false;
1941                 needB = true;
1942                 continue;
1943             }
1944             // all other combinations mean we fail
1945             return false;
1946         }
1947 
1948         if (!strings.containsAll(b.strings)) return false;
1949         return true;
1950     }
1951 
1952     //    /**
1953     //     * Returns true if this set contains all the characters and strings
1954     //     * of the given set.
1955     //     * @param c set to be checked for containment
1956     //     * @return true if the test condition is met
1957     //     * @stable ICU 2.0
1958     //     */
1959     //    public boolean containsAllOld(UnicodeSet c) {
1960     //        // The specified set is a subset if all of its pairs are contained in
1961     //        // this set.  It's possible to code this more efficiently in terms of
1962     //        // direct manipulation of the inversion lists if the need arises.
1963     //        int n = c.getRangeCount();
1964     //        for (int i=0; i<n; ++i) {
1965     //            if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
1966     //                return false;
1967     //            }
1968     //        }
1969     //        if (!strings.containsAll(c.strings)) return false;
1970     //        return true;
1971     //    }
1972 
1973     /**
1974      * Returns true if there is a partition of the string such that this set contains each of the partitioned strings.
1975      * For example, for the Unicode set [a{bc}{cd}]<br>
1976      * containsAll is true for each of: "a", "bc", ""cdbca"<br>
1977      * containsAll is false for each of: "acb", "bcda", "bcx"<br>
1978      * @param s string containing characters to be checked for containment
1979      * @return true if the test condition is met
1980      * @stable ICU 2.0
1981      */
containsAll(String s)1982     public boolean containsAll(String s) {
1983         int cp;
1984         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1985             cp = UTF16.charAt(s, i);
1986             if (!contains(cp))  {
1987                 if (strings.size() == 0) {
1988                     return false;
1989                 }
1990                 return containsAll(s, 0);
1991             }
1992         }
1993         return true;
1994     }
1995 
1996     /**
1997      * Recursive routine called if we fail to find a match in containsAll, and there are strings
1998      * @param s source string
1999      * @param i point to match to the end on
2000      * @return true if ok
2001      */
containsAll(String s, int i)2002     private boolean containsAll(String s, int i) {
2003         if (i >= s.length()) {
2004             return true;
2005         }
2006         int  cp= UTF16.charAt(s, i);
2007         if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) {
2008             return true;
2009         }
2010         for (String setStr : strings) {
2011             if (s.startsWith(setStr, i) &&  containsAll(s, i+setStr.length())) {
2012                 return true;
2013             }
2014         }
2015         return false;
2016 
2017     }
2018 
2019     /**
2020      * Get the Regex equivalent for this UnicodeSet
2021      * @return regex pattern equivalent to this UnicodeSet
2022      * @internal
2023      * @deprecated This API is ICU internal only.
2024      */
2025     @Deprecated
getRegexEquivalent()2026     public String getRegexEquivalent() {
2027         if (strings.size() == 0) {
2028             return toString();
2029         }
2030         StringBuilder result = new StringBuilder("(?:");
2031         appendNewPattern(result, true, false);
2032         for (String s : strings) {
2033             result.append('|');
2034             _appendToPat(result, s, true);
2035         }
2036         return result.append(")").toString();
2037     }
2038 
2039     /**
2040      * Returns true if this set contains none of the characters
2041      * of the given range.
2042      * @param start first character, inclusive, of the range
2043      * @param end last character, inclusive, of the range
2044      * @return true if the test condition is met
2045      * @stable ICU 2.0
2046      */
containsNone(int start, int end)2047     public boolean containsNone(int start, int end) {
2048         if (start < MIN_VALUE || start > MAX_VALUE) {
2049             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
2050         }
2051         if (end < MIN_VALUE || end > MAX_VALUE) {
2052             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
2053         }
2054         int i = -1;
2055         while (true) {
2056             if (start < list[++i]) break;
2057         }
2058         return ((i & 1) == 0 && end < list[i]);
2059     }
2060 
2061     /**
2062      * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
2063      * For example, for the Unicode set [a{bc}{cd}]<br>
2064      * containsNone is true for: "xy", "cb"<br>
2065      * containsNone is false for: "a", "bc", "bcd"<br>
2066      * @param b set to be checked for containment
2067      * @return true if the test condition is met
2068      * @stable ICU 2.0
2069      */
containsNone(UnicodeSet b)2070     public boolean containsNone(UnicodeSet b) {
2071         // The specified set is a subset if some of its pairs overlap with some of this set's pairs.
2072         // This implementation accesses the lists directly for speed.
2073         int[] listB = b.list;
2074         boolean needA = true;
2075         boolean needB = true;
2076         int aPtr = 0;
2077         int bPtr = 0;
2078         int aLen = len - 1;
2079         int bLen = b.len - 1;
2080         int startA = 0, startB = 0, limitA = 0, limitB = 0;
2081         while (true) {
2082             // double iterations are such a pain...
2083             if (needA) {
2084                 if (aPtr >= aLen) {
2085                     // ran out of A: break so we test strings
2086                     break;
2087                 }
2088                 startA = list[aPtr++];
2089                 limitA = list[aPtr++];
2090             }
2091             if (needB) {
2092                 if (bPtr >= bLen) {
2093                     // ran out of B: break so we test strings
2094                     break;
2095                 }
2096                 startB = listB[bPtr++];
2097                 limitB = listB[bPtr++];
2098             }
2099             // if B is higher than any part of A, get new A
2100             if (startB >= limitA) {
2101                 needA = true;
2102                 needB = false;
2103                 continue;
2104             }
2105             // if A is higher than any part of B, get new B
2106             if (startA >= limitB) {
2107                 needA = false;
2108                 needB = true;
2109                 continue;
2110             }
2111             // all other combinations mean we fail
2112             return false;
2113         }
2114 
2115         if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false;
2116         return true;
2117     }
2118 
2119     //    /**
2120     //     * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
2121     //     * For example, for the Unicode set [a{bc}{cd}]<br>
2122     //     * containsNone is true for: "xy", "cb"<br>
2123     //     * containsNone is false for: "a", "bc", "bcd"<br>
2124     //     * @param c set to be checked for containment
2125     //     * @return true if the test condition is met
2126     //     * @stable ICU 2.0
2127     //     */
2128     //    public boolean containsNoneOld(UnicodeSet c) {
2129     //        // The specified set is a subset if all of its pairs are contained in
2130     //        // this set.  It's possible to code this more efficiently in terms of
2131     //        // direct manipulation of the inversion lists if the need arises.
2132     //        int n = c.getRangeCount();
2133     //        for (int i=0; i<n; ++i) {
2134     //            if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
2135     //                return false;
2136     //            }
2137     //        }
2138     //        if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false;
2139     //        return true;
2140     //    }
2141 
2142     /**
2143      * Returns true if this set contains none of the characters
2144      * of the given string.
2145      * @param s string containing characters to be checked for containment
2146      * @return true if the test condition is met
2147      * @stable ICU 2.0
2148      */
containsNone(CharSequence s)2149     public boolean containsNone(CharSequence s) {
2150         return span(s, SpanCondition.NOT_CONTAINED) == s.length();
2151     }
2152 
2153     /**
2154      * Returns true if this set contains one or more of the characters
2155      * in the given range.
2156      * @param start first character, inclusive, of the range
2157      * @param end last character, inclusive, of the range
2158      * @return true if the condition is met
2159      * @stable ICU 2.0
2160      */
containsSome(int start, int end)2161     public final boolean containsSome(int start, int end) {
2162         return !containsNone(start, end);
2163     }
2164 
2165     /**
2166      * Returns true if this set contains one or more of the characters
2167      * and strings of the given set.
2168      * @param s set to be checked for containment
2169      * @return true if the condition is met
2170      * @stable ICU 2.0
2171      */
containsSome(UnicodeSet s)2172     public final boolean containsSome(UnicodeSet s) {
2173         return !containsNone(s);
2174     }
2175 
2176     /**
2177      * Returns true if this set contains one or more of the characters
2178      * of the given string.
2179      * @param s string containing characters to be checked for containment
2180      * @return true if the condition is met
2181      * @stable ICU 2.0
2182      */
containsSome(CharSequence s)2183     public final boolean containsSome(CharSequence s) {
2184         return !containsNone(s);
2185     }
2186 
2187 
2188     /**
2189      * Adds all of the elements in the specified set to this set if
2190      * they're not already present.  This operation effectively
2191      * modifies this set so that its value is the <i>union</i> of the two
2192      * sets.  The behavior of this operation is unspecified if the specified
2193      * collection is modified while the operation is in progress.
2194      *
2195      * @param c set whose elements are to be added to this set.
2196      * @stable ICU 2.0
2197      */
addAll(UnicodeSet c)2198     public UnicodeSet addAll(UnicodeSet c) {
2199         checkFrozen();
2200         add(c.list, c.len, 0);
2201         strings.addAll(c.strings);
2202         return this;
2203     }
2204 
2205     /**
2206      * Retains only the elements in this set that are contained in the
2207      * specified set.  In other words, removes from this set all of
2208      * its elements that are not contained in the specified set.  This
2209      * operation effectively modifies this set so that its value is
2210      * the <i>intersection</i> of the two sets.
2211      *
2212      * @param c set that defines which elements this set will retain.
2213      * @stable ICU 2.0
2214      */
retainAll(UnicodeSet c)2215     public UnicodeSet retainAll(UnicodeSet c) {
2216         checkFrozen();
2217         retain(c.list, c.len, 0);
2218         strings.retainAll(c.strings);
2219         return this;
2220     }
2221 
2222     /**
2223      * Removes from this set all of its elements that are contained in the
2224      * specified set.  This operation effectively modifies this
2225      * set so that its value is the <i>asymmetric set difference</i> of
2226      * the two sets.
2227      *
2228      * @param c set that defines which elements will be removed from
2229      *          this set.
2230      * @stable ICU 2.0
2231      */
removeAll(UnicodeSet c)2232     public UnicodeSet removeAll(UnicodeSet c) {
2233         checkFrozen();
2234         retain(c.list, c.len, 2);
2235         strings.removeAll(c.strings);
2236         return this;
2237     }
2238 
2239     /**
2240      * Complements in this set all elements contained in the specified
2241      * set.  Any character in the other set will be removed if it is
2242      * in this set, or will be added if it is not in this set.
2243      *
2244      * @param c set that defines which elements will be complemented from
2245      *          this set.
2246      * @stable ICU 2.0
2247      */
complementAll(UnicodeSet c)2248     public UnicodeSet complementAll(UnicodeSet c) {
2249         checkFrozen();
2250         xor(c.list, c.len, 0);
2251         SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings);
2252         return this;
2253     }
2254 
2255     /**
2256      * Removes all of the elements from this set.  This set will be
2257      * empty after this call returns.
2258      * @stable ICU 2.0
2259      */
clear()2260     public UnicodeSet clear() {
2261         checkFrozen();
2262         list[0] = HIGH;
2263         len = 1;
2264         pat = null;
2265         strings.clear();
2266         return this;
2267     }
2268 
2269     /**
2270      * Iteration method that returns the number of ranges contained in
2271      * this set.
2272      * @see #getRangeStart
2273      * @see #getRangeEnd
2274      * @stable ICU 2.0
2275      */
getRangeCount()2276     public int getRangeCount() {
2277         return len/2;
2278     }
2279 
2280     /**
2281      * Iteration method that returns the first character in the
2282      * specified range of this set.
2283      * @exception ArrayIndexOutOfBoundsException if index is outside
2284      * the range <code>0..getRangeCount()-1</code>
2285      * @see #getRangeCount
2286      * @see #getRangeEnd
2287      * @stable ICU 2.0
2288      */
getRangeStart(int index)2289     public int getRangeStart(int index) {
2290         return list[index*2];
2291     }
2292 
2293     /**
2294      * Iteration method that returns the last character in the
2295      * specified range of this set.
2296      * @exception ArrayIndexOutOfBoundsException if index is outside
2297      * the range <code>0..getRangeCount()-1</code>
2298      * @see #getRangeStart
2299      * @see #getRangeEnd
2300      * @stable ICU 2.0
2301      */
getRangeEnd(int index)2302     public int getRangeEnd(int index) {
2303         return (list[index*2 + 1] - 1);
2304     }
2305 
2306     /**
2307      * Reallocate this objects internal structures to take up the least
2308      * possible space, without changing this object's value.
2309      * @stable ICU 2.0
2310      */
compact()2311     public UnicodeSet compact() {
2312         checkFrozen();
2313         if (len != list.length) {
2314             int[] temp = new int[len];
2315             System.arraycopy(list, 0, temp, 0, len);
2316             list = temp;
2317         }
2318         rangeList = null;
2319         buffer = null;
2320         return this;
2321     }
2322 
2323     /**
2324      * Compares the specified object with this set for equality.  Returns
2325      * <tt>true</tt> if the specified object is also a set, the two sets
2326      * have the same size, and every member of the specified set is
2327      * contained in this set (or equivalently, every member of this set is
2328      * contained in the specified set).
2329      *
2330      * @param o Object to be compared for equality with this set.
2331      * @return <tt>true</tt> if the specified Object is equal to this set.
2332      * @stable ICU 2.0
2333      */
2334     @Override
equals(Object o)2335     public boolean equals(Object o) {
2336         if (o == null) {
2337             return false;
2338         }
2339         if (this == o) {
2340             return true;
2341         }
2342         try {
2343             UnicodeSet that = (UnicodeSet) o;
2344             if (len != that.len) return false;
2345             for (int i = 0; i < len; ++i) {
2346                 if (list[i] != that.list[i]) return false;
2347             }
2348             if (!strings.equals(that.strings)) return false;
2349         } catch (Exception e) {
2350             return false;
2351         }
2352         return true;
2353     }
2354 
2355     /**
2356      * Returns the hash code value for this set.
2357      *
2358      * @return the hash code value for this set.
2359      * @see java.lang.Object#hashCode()
2360      * @stable ICU 2.0
2361      */
2362     @Override
hashCode()2363     public int hashCode() {
2364         int result = len;
2365         for (int i = 0; i < len; ++i) {
2366             result *= 1000003;
2367             result += list[i];
2368         }
2369         return result;
2370     }
2371 
2372     /**
2373      * Return a programmer-readable string representation of this object.
2374      * @stable ICU 2.0
2375      */
2376     @Override
toString()2377     public String toString() {
2378         return toPattern(true);
2379     }
2380 
2381     //----------------------------------------------------------------
2382     // Implementation: Pattern parsing
2383     //----------------------------------------------------------------
2384 
2385     /**
2386      * Parses the given pattern, starting at the given position.  The character
2387      * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
2388      * Parsing continues until the corresponding closing ']'.  If a syntax error
2389      * is encountered between the opening and closing brace, the parse fails.
2390      * Upon return from a successful parse, the ParsePosition is updated to
2391      * point to the character following the closing ']', and an inversion
2392      * list for the parsed pattern is returned.  This method
2393      * calls itself recursively to parse embedded subpatterns.
2394      *
2395      * @param pattern the string containing the pattern to be parsed.  The
2396      * portion of the string from pos.getIndex(), which must be a '[', to the
2397      * corresponding closing ']', is parsed.
2398      * @param pos upon entry, the position at which to being parsing.  The
2399      * character at pattern.charAt(pos.getIndex()) must be a '['.  Upon return
2400      * from a successful parse, pos.getIndex() is either the character after the
2401      * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
2402      * is the last character of the pattern string.
2403      * @return an inversion list for the parsed substring
2404      * of <code>pattern</code>
2405      * @exception java.lang.IllegalArgumentException if the parse fails.
2406      * @internal
2407      * @deprecated This API is ICU internal only.
2408      */
2409     @Deprecated
applyPattern(String pattern, ParsePosition pos, SymbolTable symbols, int options)2410     public UnicodeSet applyPattern(String pattern,
2411             ParsePosition pos,
2412             SymbolTable symbols,
2413             int options) {
2414 
2415         // Need to build the pattern in a temporary string because
2416         // _applyPattern calls add() etc., which set pat to empty.
2417         boolean parsePositionWasNull = pos == null;
2418         if (parsePositionWasNull) {
2419             pos = new ParsePosition(0);
2420         }
2421 
2422         StringBuilder rebuiltPat = new StringBuilder();
2423         RuleCharacterIterator chars =
2424                 new RuleCharacterIterator(pattern, symbols, pos);
2425         applyPattern(chars, symbols, rebuiltPat, options);
2426         if (chars.inVariable()) {
2427             syntaxError(chars, "Extra chars in variable value");
2428         }
2429         pat = rebuiltPat.toString();
2430         if (parsePositionWasNull) {
2431             int i = pos.getIndex();
2432 
2433             // Skip over trailing whitespace
2434             if ((options & IGNORE_SPACE) != 0) {
2435                 i = PatternProps.skipWhiteSpace(pattern, i);
2436             }
2437 
2438             if (i != pattern.length()) {
2439                 throw new IllegalArgumentException("Parse of \"" + pattern +
2440                         "\" failed at " + i);
2441             }
2442         }
2443         return this;
2444     }
2445 
2446     // Add constants to make the applyPattern() code easier to follow.
2447 
2448     private static final int LAST0_START = 0,
2449             LAST1_RANGE = 1,
2450             LAST2_SET = 2;
2451 
2452     private static final int MODE0_NONE = 0,
2453             MODE1_INBRACKET = 1,
2454             MODE2_OUTBRACKET = 2;
2455 
2456     private static final int SETMODE0_NONE = 0,
2457             SETMODE1_UNICODESET = 1,
2458             SETMODE2_PROPERTYPAT = 2,
2459             SETMODE3_PREPARSED = 3;
2460 
2461     /**
2462      * Parse the pattern from the given RuleCharacterIterator.  The
2463      * iterator is advanced over the parsed pattern.
2464      * @param chars iterator over the pattern characters.  Upon return
2465      * it will be advanced to the first character after the parsed
2466      * pattern, or the end of the iteration if all characters are
2467      * parsed.
2468      * @param symbols symbol table to use to parse and dereference
2469      * variables, or null if none.
2470      * @param rebuiltPat the pattern that was parsed, rebuilt or
2471      * copied from the input pattern, as appropriate.
2472      * @param options a bit mask of zero or more of the following:
2473      * IGNORE_SPACE, CASE.
2474      */
applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options)2475     private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
2476             Appendable rebuiltPat, int options) {
2477 
2478         // Syntax characters: [ ] ^ - & { }
2479 
2480         // Recognized special forms for chars, sets: c-c s-s s&s
2481 
2482         int opts = RuleCharacterIterator.PARSE_VARIABLES |
2483                 RuleCharacterIterator.PARSE_ESCAPES;
2484         if ((options & IGNORE_SPACE) != 0) {
2485             opts |= RuleCharacterIterator.SKIP_WHITESPACE;
2486         }
2487 
2488         StringBuilder patBuf = new StringBuilder(), buf = null;
2489         boolean usePat = false;
2490         UnicodeSet scratch = null;
2491         Object backup = null;
2492 
2493         // mode: 0=before [, 1=between [...], 2=after ]
2494         // lastItem: 0=none, 1=char, 2=set
2495         int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE;
2496         char op = 0;
2497 
2498         boolean invert = false;
2499 
2500         clear();
2501         String lastString = null;
2502 
2503         while (mode != MODE2_OUTBRACKET && !chars.atEnd()) {
2504             //Eclipse stated the following is "dead code"
2505             /*
2506             if (false) {
2507                 // Debugging assertion
2508                 if (!((lastItem == 0 && op == 0) ||
2509                         (lastItem == 1 && (op == 0 || op == '-')) ||
2510                         (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
2511                     throw new IllegalArgumentException();
2512                 }
2513             }*/
2514 
2515             int c = 0;
2516             boolean literal = false;
2517             UnicodeSet nested = null;
2518 
2519             // -------- Check for property pattern
2520 
2521             // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
2522             int setMode = SETMODE0_NONE;
2523             if (resemblesPropertyPattern(chars, opts)) {
2524                 setMode = SETMODE2_PROPERTYPAT;
2525             }
2526 
2527             // -------- Parse '[' of opening delimiter OR nested set.
2528             // If there is a nested set, use `setMode' to define how
2529             // the set should be parsed.  If the '[' is part of the
2530             // opening delimiter for this pattern, parse special
2531             // strings "[", "[^", "[-", and "[^-".  Check for stand-in
2532             // characters representing a nested set in the symbol
2533             // table.
2534 
2535             else {
2536                 // Prepare to backup if necessary
2537                 backup = chars.getPos(backup);
2538                 c = chars.next(opts);
2539                 literal = chars.isEscaped();
2540 
2541                 if (c == '[' && !literal) {
2542                     if (mode == MODE1_INBRACKET) {
2543                         chars.setPos(backup); // backup
2544                         setMode = SETMODE1_UNICODESET;
2545                     } else {
2546                         // Handle opening '[' delimiter
2547                         mode = MODE1_INBRACKET;
2548                         patBuf.append('[');
2549                         backup = chars.getPos(backup); // prepare to backup
2550                         c = chars.next(opts);
2551                         literal = chars.isEscaped();
2552                         if (c == '^' && !literal) {
2553                             invert = true;
2554                             patBuf.append('^');
2555                             backup = chars.getPos(backup); // prepare to backup
2556                             c = chars.next(opts);
2557                             literal = chars.isEscaped();
2558                         }
2559                         // Fall through to handle special leading '-';
2560                         // otherwise restart loop for nested [], \p{}, etc.
2561                         if (c == '-') {
2562                             literal = true;
2563                             // Fall through to handle literal '-' below
2564                         } else {
2565                             chars.setPos(backup); // backup
2566                             continue;
2567                         }
2568                     }
2569                 } else if (symbols != null) {
2570                     UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
2571                     if (m != null) {
2572                         try {
2573                             nested = (UnicodeSet) m;
2574                             setMode = SETMODE3_PREPARSED;
2575                         } catch (ClassCastException e) {
2576                             syntaxError(chars, "Syntax error");
2577                         }
2578                     }
2579                 }
2580             }
2581 
2582             // -------- Handle a nested set.  This either is inline in
2583             // the pattern or represented by a stand-in that has
2584             // previously been parsed and was looked up in the symbol
2585             // table.
2586 
2587             if (setMode != SETMODE0_NONE) {
2588                 if (lastItem == LAST1_RANGE) {
2589                     if (op != 0) {
2590                         syntaxError(chars, "Char expected after operator");
2591                     }
2592                     add_unchecked(lastChar, lastChar);
2593                     _appendToPat(patBuf, lastChar, false);
2594                     lastItem = LAST0_START;
2595                     op = 0;
2596                 }
2597 
2598                 if (op == '-' || op == '&') {
2599                     patBuf.append(op);
2600                 }
2601 
2602                 if (nested == null) {
2603                     if (scratch == null) scratch = new UnicodeSet();
2604                     nested = scratch;
2605                 }
2606                 switch (setMode) {
2607                 case SETMODE1_UNICODESET:
2608                     nested.applyPattern(chars, symbols, patBuf, options);
2609                     break;
2610                 case SETMODE2_PROPERTYPAT:
2611                     chars.skipIgnored(opts);
2612                     nested.applyPropertyPattern(chars, patBuf, symbols);
2613                     break;
2614                 case SETMODE3_PREPARSED: // `nested' already parsed
2615                     nested._toPattern(patBuf, false);
2616                     break;
2617                 }
2618 
2619                 usePat = true;
2620 
2621                 if (mode == MODE0_NONE) {
2622                     // Entire pattern is a category; leave parse loop
2623                     set(nested);
2624                     mode = MODE2_OUTBRACKET;
2625                     break;
2626                 }
2627 
2628                 switch (op) {
2629                 case '-':
2630                     removeAll(nested);
2631                     break;
2632                 case '&':
2633                     retainAll(nested);
2634                     break;
2635                 case 0:
2636                     addAll(nested);
2637                     break;
2638                 }
2639 
2640                 op = 0;
2641                 lastItem = LAST2_SET;
2642 
2643                 continue;
2644             }
2645 
2646             if (mode == MODE0_NONE) {
2647                 syntaxError(chars, "Missing '['");
2648             }
2649 
2650             // -------- Parse special (syntax) characters.  If the
2651             // current character is not special, or if it is escaped,
2652             // then fall through and handle it below.
2653 
2654             if (!literal) {
2655                 switch (c) {
2656                 case ']':
2657                     if (lastItem == LAST1_RANGE) {
2658                         add_unchecked(lastChar, lastChar);
2659                         _appendToPat(patBuf, lastChar, false);
2660                     }
2661                     // Treat final trailing '-' as a literal
2662                     if (op == '-') {
2663                         add_unchecked(op, op);
2664                         patBuf.append(op);
2665                     } else if (op == '&') {
2666                         syntaxError(chars, "Trailing '&'");
2667                     }
2668                     patBuf.append(']');
2669                     mode = MODE2_OUTBRACKET;
2670                     continue;
2671                 case '-':
2672                     if (op == 0) {
2673                         if (lastItem != LAST0_START) {
2674                             op = (char) c;
2675                             continue;
2676                         } else if (lastString != null) {
2677                             op = (char) c;
2678                             continue;
2679                         } else {
2680                             // Treat final trailing '-' as a literal
2681                             add_unchecked(c, c);
2682                             c = chars.next(opts);
2683                             literal = chars.isEscaped();
2684                             if (c == ']' && !literal) {
2685                                 patBuf.append("-]");
2686                                 mode = MODE2_OUTBRACKET;
2687                                 continue;
2688                             }
2689                         }
2690                     }
2691                     syntaxError(chars, "'-' not after char, string, or set");
2692                     break;
2693                 case '&':
2694                     if (lastItem == LAST2_SET && op == 0) {
2695                         op = (char) c;
2696                         continue;
2697                     }
2698                     syntaxError(chars, "'&' not after set");
2699                     break;
2700                 case '^':
2701                     syntaxError(chars, "'^' not after '['");
2702                     break;
2703                 case '{':
2704                     if (op != 0 && op != '-') {
2705                         syntaxError(chars, "Missing operand after operator");
2706                     }
2707                     if (lastItem == LAST1_RANGE) {
2708                         add_unchecked(lastChar, lastChar);
2709                         _appendToPat(patBuf, lastChar, false);
2710                     }
2711                     lastItem = LAST0_START;
2712                     if (buf == null) {
2713                         buf = new StringBuilder();
2714                     } else {
2715                         buf.setLength(0);
2716                     }
2717                     boolean ok = false;
2718                     while (!chars.atEnd()) {
2719                         c = chars.next(opts);
2720                         literal = chars.isEscaped();
2721                         if (c == '}' && !literal) {
2722                             ok = true;
2723                             break;
2724                         }
2725                         appendCodePoint(buf, c);
2726                     }
2727                     if (buf.length() < 1 || !ok) {
2728                         syntaxError(chars, "Invalid multicharacter string");
2729                     }
2730                     // We have new string. Add it to set and continue;
2731                     // we don't need to drop through to the further
2732                     // processing
2733                     String curString = buf.toString();
2734                     if (op == '-') {
2735                         int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString);
2736                         int curSingle = CharSequences.getSingleCodePoint(curString);
2737                         if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) {
2738                             add(lastSingle,curSingle);
2739                         } else {
2740                             try {
2741                                 StringRange.expand(lastString, curString, true, strings);
2742                             } catch (Exception e) {
2743                                 syntaxError(chars, e.getMessage());
2744                             }
2745                         }
2746                         lastString = null;
2747                         op = 0;
2748                     } else {
2749                         add(curString);
2750                         lastString = curString;
2751                     }
2752                     patBuf.append('{');
2753                     _appendToPat(patBuf, curString, false);
2754                     patBuf.append('}');
2755                     continue;
2756                 case SymbolTable.SYMBOL_REF:
2757                     //         symbols  nosymbols
2758                     // [a-$]   error    error (ambiguous)
2759                     // [a$]    anchor   anchor
2760                     // [a-$x]  var "x"* literal '$'
2761                     // [a-$.]  error    literal '$'
2762                     // *We won't get here in the case of var "x"
2763                     backup = chars.getPos(backup);
2764                     c = chars.next(opts);
2765                     literal = chars.isEscaped();
2766                     boolean anchor = (c == ']' && !literal);
2767                     if (symbols == null && !anchor) {
2768                         c = SymbolTable.SYMBOL_REF;
2769                         chars.setPos(backup);
2770                         break; // literal '$'
2771                     }
2772                     if (anchor && op == 0) {
2773                         if (lastItem == LAST1_RANGE) {
2774                             add_unchecked(lastChar, lastChar);
2775                             _appendToPat(patBuf, lastChar, false);
2776                         }
2777                         add_unchecked(UnicodeMatcher.ETHER);
2778                         usePat = true;
2779                         patBuf.append(SymbolTable.SYMBOL_REF).append(']');
2780                         mode = MODE2_OUTBRACKET;
2781                         continue;
2782                     }
2783                     syntaxError(chars, "Unquoted '$'");
2784                     break;
2785                 default:
2786                     break;
2787                 }
2788             }
2789 
2790             // -------- Parse literal characters.  This includes both
2791             // escaped chars ("\u4E01") and non-syntax characters
2792             // ("a").
2793 
2794             switch (lastItem) {
2795             case LAST0_START:
2796                 if (op == '-' && lastString != null) {
2797                     syntaxError(chars, "Invalid range");
2798                 }
2799                 lastItem = LAST1_RANGE;
2800                 lastChar = c;
2801                 lastString = null;
2802                 break;
2803             case LAST1_RANGE:
2804                 if (op == '-') {
2805                     if (lastString != null) {
2806                         syntaxError(chars, "Invalid range");
2807                     }
2808                     if (lastChar >= c) {
2809                         // Don't allow redundant (a-a) or empty (b-a) ranges;
2810                         // these are most likely typos.
2811                         syntaxError(chars, "Invalid range");
2812                     }
2813                     add_unchecked(lastChar, c);
2814                     _appendToPat(patBuf, lastChar, false);
2815                     patBuf.append(op);
2816                     _appendToPat(patBuf, c, false);
2817                     lastItem = LAST0_START;
2818                     op = 0;
2819                 } else {
2820                     add_unchecked(lastChar, lastChar);
2821                     _appendToPat(patBuf, lastChar, false);
2822                     lastChar = c;
2823                 }
2824                 break;
2825             case LAST2_SET:
2826                 if (op != 0) {
2827                     syntaxError(chars, "Set expected after operator");
2828                 }
2829                 lastChar = c;
2830                 lastItem = LAST1_RANGE;
2831                 break;
2832             }
2833         }
2834 
2835         if (mode != MODE2_OUTBRACKET) {
2836             syntaxError(chars, "Missing ']'");
2837         }
2838 
2839         chars.skipIgnored(opts);
2840 
2841         /**
2842          * Handle global flags (invert, case insensitivity).  If this
2843          * pattern should be compiled case-insensitive, then we need
2844          * to close over case BEFORE COMPLEMENTING.  This makes
2845          * patterns like /[^abc]/i work.
2846          */
2847         if ((options & CASE) != 0) {
2848             closeOver(CASE);
2849         }
2850         if (invert) {
2851             complement();
2852         }
2853 
2854         // Use the rebuilt pattern (pat) only if necessary.  Prefer the
2855         // generated pattern.
2856         if (usePat) {
2857             append(rebuiltPat, patBuf.toString());
2858         } else {
2859             appendNewPattern(rebuiltPat, false, true);
2860         }
2861     }
2862 
syntaxError(RuleCharacterIterator chars, String msg)2863     private static void syntaxError(RuleCharacterIterator chars, String msg) {
2864         throw new IllegalArgumentException("Error: " + msg + " at \"" +
2865                 Utility.escape(chars.toString()) +
2866                 '"');
2867     }
2868 
2869     /**
2870      * Add the contents of the UnicodeSet (as strings) into a collection.
2871      * @param target collection to add into
2872      * @stable ICU 4.4
2873      */
addAllTo(T target)2874     public <T extends Collection<String>> T addAllTo(T target) {
2875         return addAllTo(this, target);
2876     }
2877 
2878 
2879     /**
2880      * Add the contents of the UnicodeSet (as strings) into a collection.
2881      * @param target collection to add into
2882      * @stable ICU 4.4
2883      */
addAllTo(String[] target)2884     public String[] addAllTo(String[] target) {
2885         return addAllTo(this, target);
2886     }
2887 
2888     /**
2889      * Add the contents of the UnicodeSet (as strings) into an array.
2890      * @stable ICU 4.4
2891      */
toArray(UnicodeSet set)2892     public static String[] toArray(UnicodeSet set) {
2893         return addAllTo(set, new String[set.size()]);
2894     }
2895 
2896     /**
2897      * Add the contents of the collection (as strings) into this UnicodeSet.
2898      * The collection must not contain null.
2899      * @param source the collection to add
2900      * @return a reference to this object
2901      * @stable ICU 4.4
2902      */
add(Iterable<?> source)2903     public UnicodeSet add(Iterable<?> source) {
2904         return addAll(source);
2905     }
2906 
2907     /**
2908      * Add a collection (as strings) into this UnicodeSet.
2909      * Uses standard naming convention.
2910      * @param source collection to add into
2911      * @return a reference to this object
2912      * @stable ICU 4.4
2913      */
addAll(Iterable<?> source)2914     public UnicodeSet addAll(Iterable<?> source) {
2915         checkFrozen();
2916         for (Object o : source) {
2917             add(o.toString());
2918         }
2919         return this;
2920     }
2921 
2922     //----------------------------------------------------------------
2923     // Implementation: Utility methods
2924     //----------------------------------------------------------------
2925 
ensureCapacity(int newLen)2926     private void ensureCapacity(int newLen) {
2927         if (newLen <= list.length) return;
2928         int[] temp = new int[newLen + GROW_EXTRA];
2929         System.arraycopy(list, 0, temp, 0, len);
2930         list = temp;
2931     }
2932 
ensureBufferCapacity(int newLen)2933     private void ensureBufferCapacity(int newLen) {
2934         if (buffer != null && newLen <= buffer.length) return;
2935         buffer = new int[newLen + GROW_EXTRA];
2936     }
2937 
2938     /**
2939      * Assumes start <= end.
2940      */
range(int start, int end)2941     private int[] range(int start, int end) {
2942         if (rangeList == null) {
2943             rangeList = new int[] { start, end+1, HIGH };
2944         } else {
2945             rangeList[0] = start;
2946             rangeList[1] = end+1;
2947         }
2948         return rangeList;
2949     }
2950 
2951     //----------------------------------------------------------------
2952     // Implementation: Fundamental operations
2953     //----------------------------------------------------------------
2954 
2955     // polarity = 0, 3 is normal: x xor y
2956     // polarity = 1, 2: x xor ~y == x === y
2957 
xor(int[] other, int otherLen, int polarity)2958     private UnicodeSet xor(int[] other, int otherLen, int polarity) {
2959         ensureBufferCapacity(len + otherLen);
2960         int i = 0, j = 0, k = 0;
2961         int a = list[i++];
2962         int b;
2963         // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used
2964         //      so the following if statement will not be called.
2965         ///CLOVER:OFF
2966         if (polarity == 1 || polarity == 2) {
2967             b = LOW;
2968             if (other[j] == LOW) { // skip base if already LOW
2969                 ++j;
2970                 b = other[j];
2971             }
2972             ///CLOVER:ON
2973         } else {
2974             b = other[j++];
2975         }
2976         // simplest of all the routines
2977         // sort the values, discarding identicals!
2978         while (true) {
2979             if (a < b) {
2980                 buffer[k++] = a;
2981                 a = list[i++];
2982             } else if (b < a) {
2983                 buffer[k++] = b;
2984                 b = other[j++];
2985             } else if (a != HIGH) { // at this point, a == b
2986                 // discard both values!
2987                 a = list[i++];
2988                 b = other[j++];
2989             } else { // DONE!
2990                 buffer[k++] = HIGH;
2991                 len = k;
2992                 break;
2993             }
2994         }
2995         // swap list and buffer
2996         int[] temp = list;
2997         list = buffer;
2998         buffer = temp;
2999         pat = null;
3000         return this;
3001     }
3002 
3003     // polarity = 0 is normal: x union y
3004     // polarity = 2: x union ~y
3005     // polarity = 1: ~x union y
3006     // polarity = 3: ~x union ~y
3007 
add(int[] other, int otherLen, int polarity)3008     private UnicodeSet add(int[] other, int otherLen, int polarity) {
3009         ensureBufferCapacity(len + otherLen);
3010         int i = 0, j = 0, k = 0;
3011         int a = list[i++];
3012         int b = other[j++];
3013         // change from xor is that we have to check overlapping pairs
3014         // polarity bit 1 means a is second, bit 2 means b is.
3015         main:
3016             while (true) {
3017                 switch (polarity) {
3018                 case 0: // both first; take lower if unequal
3019                     if (a < b) { // take a
3020                         // Back up over overlapping ranges in buffer[]
3021                         if (k > 0 && a <= buffer[k-1]) {
3022                             // Pick latter end value in buffer[] vs. list[]
3023                             a = max(list[i], buffer[--k]);
3024                         } else {
3025                             // No overlap
3026                             buffer[k++] = a;
3027                             a = list[i];
3028                         }
3029                         i++; // Common if/else code factored out
3030                         polarity ^= 1;
3031                     } else if (b < a) { // take b
3032                         if (k > 0 && b <= buffer[k-1]) {
3033                             b = max(other[j], buffer[--k]);
3034                         } else {
3035                             buffer[k++] = b;
3036                             b = other[j];
3037                         }
3038                         j++;
3039                         polarity ^= 2;
3040                     } else { // a == b, take a, drop b
3041                         if (a == HIGH) break main;
3042                         // This is symmetrical; it doesn't matter if
3043                         // we backtrack with a or b. - liu
3044                         if (k > 0 && a <= buffer[k-1]) {
3045                             a = max(list[i], buffer[--k]);
3046                         } else {
3047                             // No overlap
3048                             buffer[k++] = a;
3049                             a = list[i];
3050                         }
3051                         i++;
3052                         polarity ^= 1;
3053                         b = other[j++]; polarity ^= 2;
3054                     }
3055                     break;
3056                 case 3: // both second; take higher if unequal, and drop other
3057                     if (b <= a) { // take a
3058                         if (a == HIGH) break main;
3059                         buffer[k++] = a;
3060                     } else { // take b
3061                         if (b == HIGH) break main;
3062                         buffer[k++] = b;
3063                     }
3064                     a = list[i++]; polarity ^= 1;   // factored common code
3065                     b = other[j++]; polarity ^= 2;
3066                     break;
3067                 case 1: // a second, b first; if b < a, overlap
3068                     if (a < b) { // no overlap, take a
3069                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3070                     } else if (b < a) { // OVERLAP, drop b
3071                         b = other[j++]; polarity ^= 2;
3072                     } else { // a == b, drop both!
3073                         if (a == HIGH) break main;
3074                         a = list[i++]; polarity ^= 1;
3075                         b = other[j++]; polarity ^= 2;
3076                     }
3077                     break;
3078                 case 2: // a first, b second; if a < b, overlap
3079                     if (b < a) { // no overlap, take b
3080                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
3081                     } else  if (a < b) { // OVERLAP, drop a
3082                         a = list[i++]; polarity ^= 1;
3083                     } else { // a == b, drop both!
3084                         if (a == HIGH) break main;
3085                         a = list[i++]; polarity ^= 1;
3086                         b = other[j++]; polarity ^= 2;
3087                     }
3088                     break;
3089                 }
3090             }
3091         buffer[k++] = HIGH;    // terminate
3092         len = k;
3093         // swap list and buffer
3094         int[] temp = list;
3095         list = buffer;
3096         buffer = temp;
3097         pat = null;
3098         return this;
3099     }
3100 
3101     // polarity = 0 is normal: x intersect y
3102     // polarity = 2: x intersect ~y == set-minus
3103     // polarity = 1: ~x intersect y
3104     // polarity = 3: ~x intersect ~y
3105 
retain(int[] other, int otherLen, int polarity)3106     private UnicodeSet retain(int[] other, int otherLen, int polarity) {
3107         ensureBufferCapacity(len + otherLen);
3108         int i = 0, j = 0, k = 0;
3109         int a = list[i++];
3110         int b = other[j++];
3111         // change from xor is that we have to check overlapping pairs
3112         // polarity bit 1 means a is second, bit 2 means b is.
3113         main:
3114             while (true) {
3115                 switch (polarity) {
3116                 case 0: // both first; drop the smaller
3117                     if (a < b) { // drop a
3118                         a = list[i++]; polarity ^= 1;
3119                     } else if (b < a) { // drop b
3120                         b = other[j++]; polarity ^= 2;
3121                     } else { // a == b, take one, drop other
3122                         if (a == HIGH) break main;
3123                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3124                         b = other[j++]; polarity ^= 2;
3125                     }
3126                     break;
3127                 case 3: // both second; take lower if unequal
3128                     if (a < b) { // take a
3129                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3130                     } else if (b < a) { // take b
3131                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
3132                     } else { // a == b, take one, drop other
3133                         if (a == HIGH) break main;
3134                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3135                         b = other[j++]; polarity ^= 2;
3136                     }
3137                     break;
3138                 case 1: // a second, b first;
3139                     if (a < b) { // NO OVERLAP, drop a
3140                         a = list[i++]; polarity ^= 1;
3141                     } else if (b < a) { // OVERLAP, take b
3142                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
3143                     } else { // a == b, drop both!
3144                         if (a == HIGH) break main;
3145                         a = list[i++]; polarity ^= 1;
3146                         b = other[j++]; polarity ^= 2;
3147                     }
3148                     break;
3149                 case 2: // a first, b second; if a < b, overlap
3150                     if (b < a) { // no overlap, drop b
3151                         b = other[j++]; polarity ^= 2;
3152                     } else  if (a < b) { // OVERLAP, take a
3153                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3154                     } else { // a == b, drop both!
3155                         if (a == HIGH) break main;
3156                         a = list[i++]; polarity ^= 1;
3157                         b = other[j++]; polarity ^= 2;
3158                     }
3159                     break;
3160                 }
3161             }
3162         buffer[k++] = HIGH;    // terminate
3163         len = k;
3164         // swap list and buffer
3165         int[] temp = list;
3166         list = buffer;
3167         buffer = temp;
3168         pat = null;
3169         return this;
3170     }
3171 
max(int a, int b)3172     private static final int max(int a, int b) {
3173         return (a > b) ? a : b;
3174     }
3175 
3176     //----------------------------------------------------------------
3177     // Generic filter-based scanning code
3178     //----------------------------------------------------------------
3179 
3180     private static interface Filter {
contains(int codePoint)3181         boolean contains(int codePoint);
3182     }
3183 
3184     private static class NumericValueFilter implements Filter {
3185         double value;
NumericValueFilter(double value)3186         NumericValueFilter(double value) { this.value = value; }
3187         @Override
contains(int ch)3188         public boolean contains(int ch) {
3189             return UCharacter.getUnicodeNumericValue(ch) == value;
3190         }
3191     }
3192 
3193     private static class GeneralCategoryMaskFilter implements Filter {
3194         int mask;
GeneralCategoryMaskFilter(int mask)3195         GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
3196         @Override
contains(int ch)3197         public boolean contains(int ch) {
3198             return ((1 << UCharacter.getType(ch)) & mask) != 0;
3199         }
3200     }
3201 
3202     private static class IntPropertyFilter implements Filter {
3203         int prop;
3204         int value;
IntPropertyFilter(int prop, int value)3205         IntPropertyFilter(int prop, int value) {
3206             this.prop = prop;
3207             this.value = value;
3208         }
3209         @Override
contains(int ch)3210         public boolean contains(int ch) {
3211             return UCharacter.getIntPropertyValue(ch, prop) == value;
3212         }
3213     }
3214 
3215     private static class ScriptExtensionsFilter implements Filter {
3216         int script;
ScriptExtensionsFilter(int script)3217         ScriptExtensionsFilter(int script) { this.script = script; }
3218         @Override
contains(int c)3219         public boolean contains(int c) {
3220             return UScript.hasScript(c, script);
3221         }
3222     }
3223 
3224     // VersionInfo for unassigned characters
3225     private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
3226 
3227     private static class VersionFilter implements Filter {
3228         VersionInfo version;
VersionFilter(VersionInfo version)3229         VersionFilter(VersionInfo version) { this.version = version; }
3230         @Override
contains(int ch)3231         public boolean contains(int ch) {
3232             VersionInfo v = UCharacter.getAge(ch);
3233             // Reference comparison ok; VersionInfo caches and reuses
3234             // unique objects.
3235             return !Utility.sameObjects(v, NO_VERSION) &&
3236                     v.compareTo(version) <= 0;
3237         }
3238     }
3239 
getInclusions(int src)3240     private static synchronized UnicodeSet getInclusions(int src) {
3241         if (INCLUSIONS == null) {
3242             INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
3243         }
3244         if(INCLUSIONS[src] == null) {
3245             UnicodeSet incl = new UnicodeSet();
3246             switch(src) {
3247             case UCharacterProperty.SRC_CHAR:
3248                 UCharacterProperty.INSTANCE.addPropertyStarts(incl);
3249                 break;
3250             case UCharacterProperty.SRC_PROPSVEC:
3251                 UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
3252                 break;
3253             case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
3254                 UCharacterProperty.INSTANCE.addPropertyStarts(incl);
3255                 UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
3256                 break;
3257             case UCharacterProperty.SRC_CASE_AND_NORM:
3258                 Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl);
3259                 UCaseProps.INSTANCE.addPropertyStarts(incl);
3260                 break;
3261             case UCharacterProperty.SRC_NFC:
3262                 Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl);
3263                 break;
3264             case UCharacterProperty.SRC_NFKC:
3265                 Norm2AllModes.getNFKCInstance().impl.addPropertyStarts(incl);
3266                 break;
3267             case UCharacterProperty.SRC_NFKC_CF:
3268                 Norm2AllModes.getNFKC_CFInstance().impl.addPropertyStarts(incl);
3269                 break;
3270             case UCharacterProperty.SRC_NFC_CANON_ITER:
3271                 Norm2AllModes.getNFCInstance().impl.addCanonIterPropertyStarts(incl);
3272                 break;
3273             case UCharacterProperty.SRC_CASE:
3274                 UCaseProps.INSTANCE.addPropertyStarts(incl);
3275                 break;
3276             case UCharacterProperty.SRC_BIDI:
3277                 UBiDiProps.INSTANCE.addPropertyStarts(incl);
3278                 break;
3279             default:
3280                 throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
3281             }
3282             INCLUSIONS[src] = incl;
3283         }
3284         return INCLUSIONS[src];
3285     }
3286 
3287     /**
3288      * Generic filter-based scanning code for UCD property UnicodeSets.
3289      */
applyFilter(Filter filter, int src)3290     private UnicodeSet applyFilter(Filter filter, int src) {
3291         // Logically, walk through all Unicode characters, noting the start
3292         // and end of each range for which filter.contain(c) is
3293         // true.  Add each range to a set.
3294         //
3295         // To improve performance, use an inclusions set which
3296         // encodes information about character ranges that are known
3297         // to have identical properties.
3298         // getInclusions(src) contains exactly the first characters of
3299         // same-value ranges for the given properties "source".
3300 
3301         clear();
3302 
3303         int startHasProperty = -1;
3304         UnicodeSet inclusions = getInclusions(src);
3305         int limitRange = inclusions.getRangeCount();
3306 
3307         for (int j=0; j<limitRange; ++j) {
3308             // get current range
3309             int start = inclusions.getRangeStart(j);
3310             int end = inclusions.getRangeEnd(j);
3311 
3312             // for all the code points in the range, process
3313             for (int ch = start; ch <= end; ++ch) {
3314                 // only add to the unicodeset on inflection points --
3315                 // where the hasProperty value changes to false
3316                 if (filter.contains(ch)) {
3317                     if (startHasProperty < 0) {
3318                         startHasProperty = ch;
3319                     }
3320                 } else if (startHasProperty >= 0) {
3321                     add_unchecked(startHasProperty, ch-1);
3322                     startHasProperty = -1;
3323                 }
3324             }
3325         }
3326         if (startHasProperty >= 0) {
3327             add_unchecked(startHasProperty, 0x10FFFF);
3328         }
3329 
3330         return this;
3331     }
3332 
3333 
3334     /**
3335      * Remove leading and trailing Pattern_White_Space and compress
3336      * internal Pattern_White_Space to a single space character.
3337      */
mungeCharName(String source)3338     private static String mungeCharName(String source) {
3339         source = PatternProps.trimWhiteSpace(source);
3340         StringBuilder buf = null;
3341         for (int i=0; i<source.length(); ++i) {
3342             char ch = source.charAt(i);
3343             if (PatternProps.isWhiteSpace(ch)) {
3344                 if (buf == null) {
3345                     buf = new StringBuilder().append(source, 0, i);
3346                 } else if (buf.charAt(buf.length() - 1) == ' ') {
3347                     continue;
3348                 }
3349                 ch = ' '; // convert to ' '
3350             }
3351             if (buf != null) {
3352                 buf.append(ch);
3353             }
3354         }
3355         return buf == null ? source : buf.toString();
3356     }
3357 
3358     //----------------------------------------------------------------
3359     // Property set API
3360     //----------------------------------------------------------------
3361 
3362     /**
3363      * Modifies this set to contain those code points which have the
3364      * given value for the given binary or enumerated property, as
3365      * returned by UCharacter.getIntPropertyValue.  Prior contents of
3366      * this set are lost.
3367      *
3368      * @param prop a property in the range
3369      * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or
3370      * UProperty.INT_START..UProperty.INT_LIMIT-1 or.
3371      * UProperty.MASK_START..UProperty.MASK_LIMIT-1.
3372      *
3373      * @param value a value in the range
3374      * UCharacter.getIntPropertyMinValue(prop)..
3375      * UCharacter.getIntPropertyMaxValue(prop), with one exception.
3376      * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be
3377      * a UCharacter.getType() result, but rather a mask value produced
3378      * by logically ORing (1 &lt;&lt; UCharacter.getType()) values together.
3379      * This allows grouped categories such as [:L:] to be represented.
3380      *
3381      * @return a reference to this set
3382      *
3383      * @stable ICU 2.4
3384      */
applyIntPropertyValue(int prop, int value)3385     public UnicodeSet applyIntPropertyValue(int prop, int value) {
3386         checkFrozen();
3387         if (prop == UProperty.GENERAL_CATEGORY_MASK) {
3388             applyFilter(new GeneralCategoryMaskFilter(value), UCharacterProperty.SRC_CHAR);
3389         } else if (prop == UProperty.SCRIPT_EXTENSIONS) {
3390             applyFilter(new ScriptExtensionsFilter(value), UCharacterProperty.SRC_PROPSVEC);
3391         } else {
3392             applyFilter(new IntPropertyFilter(prop, value), UCharacterProperty.INSTANCE.getSource(prop));
3393         }
3394         return this;
3395     }
3396 
3397 
3398 
3399     /**
3400      * Modifies this set to contain those code points which have the
3401      * given value for the given property.  Prior contents of this
3402      * set are lost.
3403      *
3404      * @param propertyAlias a property alias, either short or long.
3405      * The name is matched loosely.  See PropertyAliases.txt for names
3406      * and a description of loose matching.  If the value string is
3407      * empty, then this string is interpreted as either a
3408      * General_Category value alias, a Script value alias, a binary
3409      * property alias, or a special ID.  Special IDs are matched
3410      * loosely and correspond to the following sets:
3411      *
3412      * "ANY" = [\\u0000-\\u0010FFFF],
3413      * "ASCII" = [\\u0000-\\u007F].
3414      *
3415      * @param valueAlias a value alias, either short or long.  The
3416      * name is matched loosely.  See PropertyValueAliases.txt for
3417      * names and a description of loose matching.  In addition to
3418      * aliases listed, numeric values and canonical combining classes
3419      * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc",
3420      * "220").  The value string may also be empty.
3421      *
3422      * @return a reference to this set
3423      *
3424      * @stable ICU 2.4
3425      */
applyPropertyAlias(String propertyAlias, String valueAlias)3426     public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) {
3427         return applyPropertyAlias(propertyAlias, valueAlias, null);
3428     }
3429 
3430     /**
3431      * Modifies this set to contain those code points which have the
3432      * given value for the given property.  Prior contents of this
3433      * set are lost.
3434      * @param propertyAlias A string of the property alias.
3435      * @param valueAlias A string of the value alias.
3436      * @param symbols if not null, then symbols are first called to see if a property
3437      * is available. If true, then everything else is skipped.
3438      * @return this set
3439      * @stable ICU 3.2
3440      */
applyPropertyAlias(String propertyAlias, String valueAlias, SymbolTable symbols)3441     public UnicodeSet applyPropertyAlias(String propertyAlias,
3442             String valueAlias, SymbolTable symbols) {
3443         checkFrozen();
3444         int p;
3445         int v;
3446         boolean mustNotBeEmpty = false, invert = false;
3447 
3448         if (symbols != null
3449                 && (symbols instanceof XSymbolTable)
3450                 && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
3451             return this;
3452         }
3453 
3454         if (XSYMBOL_TABLE != null) {
3455             if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) {
3456                 return this;
3457             }
3458         }
3459 
3460         if (valueAlias.length() > 0) {
3461             p = UCharacter.getPropertyEnum(propertyAlias);
3462 
3463             // Treat gc as gcm
3464             if (p == UProperty.GENERAL_CATEGORY) {
3465                 p = UProperty.GENERAL_CATEGORY_MASK;
3466             }
3467 
3468             if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) ||
3469                     (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) ||
3470                     (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
3471                 try {
3472                     v = UCharacter.getPropertyValueEnum(p, valueAlias);
3473                 } catch (IllegalArgumentException e) {
3474                     // Handle numeric CCC
3475                     if (p == UProperty.CANONICAL_COMBINING_CLASS ||
3476                             p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
3477                             p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
3478                         v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias));
3479                         // If the resultant set is empty then the numeric value
3480                         // was invalid.
3481                         //mustNotBeEmpty = true;
3482                         // old code was wrong; anything between 0 and 255 is valid even if unused.
3483                         if (v < 0 || v > 255) throw e;
3484                     } else {
3485                         throw e;
3486                     }
3487                 }
3488             }
3489 
3490             else {
3491                 switch (p) {
3492                 case UProperty.NUMERIC_VALUE:
3493                 {
3494                     double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
3495                     applyFilter(new NumericValueFilter(value), UCharacterProperty.SRC_CHAR);
3496                     return this;
3497                 }
3498                 case UProperty.NAME:
3499                 {
3500                     // Must munge name, since
3501                     // UCharacter.charFromName() does not do
3502                     // 'loose' matching.
3503                     String buf = mungeCharName(valueAlias);
3504                     int ch = UCharacter.getCharFromExtendedName(buf);
3505                     if (ch == -1) {
3506                         throw new IllegalArgumentException("Invalid character name");
3507                     }
3508                     clear();
3509                     add_unchecked(ch);
3510                     return this;
3511                 }
3512                 case UProperty.UNICODE_1_NAME:
3513                     // ICU 49 deprecates the Unicode_1_Name property APIs.
3514                     throw new IllegalArgumentException("Unicode_1_Name (na1) not supported");
3515                 case UProperty.AGE:
3516                 {
3517                     // Must munge name, since
3518                     // VersionInfo.getInstance() does not do
3519                     // 'loose' matching.
3520                     VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
3521                     applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
3522                     return this;
3523                 }
3524                 case UProperty.SCRIPT_EXTENSIONS:
3525                     v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias);
3526                     // fall through to calling applyIntPropertyValue()
3527                     break;
3528                 default:
3529                     // p is a non-binary, non-enumerated property that we
3530                     // don't support (yet).
3531                     throw new IllegalArgumentException("Unsupported property");
3532                 }
3533             }
3534         }
3535 
3536         else {
3537             // valueAlias is empty.  Interpret as General Category, Script,
3538             // Binary property, or ANY or ASCII.  Upon success, p and v will
3539             // be set.
3540             UPropertyAliases pnames = UPropertyAliases.INSTANCE;
3541             p = UProperty.GENERAL_CATEGORY_MASK;
3542             v = pnames.getPropertyValueEnum(p, propertyAlias);
3543             if (v == UProperty.UNDEFINED) {
3544                 p = UProperty.SCRIPT;
3545                 v = pnames.getPropertyValueEnum(p, propertyAlias);
3546                 if (v == UProperty.UNDEFINED) {
3547                     p = pnames.getPropertyEnum(propertyAlias);
3548                     if (p == UProperty.UNDEFINED) {
3549                         p = -1;
3550                     }
3551                     if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) {
3552                         v = 1;
3553                     } else if (p == -1) {
3554                         if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) {
3555                             set(MIN_VALUE, MAX_VALUE);
3556                             return this;
3557                         } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) {
3558                             set(0, 0x7F);
3559                             return this;
3560                         } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) {
3561                             // [:Assigned:]=[:^Cn:]
3562                             p = UProperty.GENERAL_CATEGORY_MASK;
3563                             v = (1<<UCharacter.UNASSIGNED);
3564                             invert = true;
3565                         } else {
3566                             // Property name was never matched.
3567                             throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias);
3568                         }
3569                     } else {
3570                         // Valid propery name, but it isn't binary, so the value
3571                         // must be supplied.
3572                         throw new IllegalArgumentException("Missing property value");
3573                     }
3574                 }
3575             }
3576         }
3577 
3578         applyIntPropertyValue(p, v);
3579         if(invert) {
3580             complement();
3581         }
3582 
3583         if (mustNotBeEmpty && isEmpty()) {
3584             // mustNotBeEmpty is set to true if an empty set indicates
3585             // invalid input.
3586             throw new IllegalArgumentException("Invalid property value");
3587         }
3588 
3589         return this;
3590     }
3591 
3592     //----------------------------------------------------------------
3593     // Property set patterns
3594     //----------------------------------------------------------------
3595 
3596     /**
3597      * Return true if the given position, in the given pattern, appears
3598      * to be the start of a property set pattern.
3599      */
resemblesPropertyPattern(String pattern, int pos)3600     private static boolean resemblesPropertyPattern(String pattern, int pos) {
3601         // Patterns are at least 5 characters long
3602         if ((pos+5) > pattern.length()) {
3603             return false;
3604         }
3605 
3606         // Look for an opening [:, [:^, \p, or \P
3607         return pattern.regionMatches(pos, "[:", 0, 2) ||
3608                 pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3609                 pattern.regionMatches(pos, "\\N", 0, 2);
3610     }
3611 
3612     /**
3613      * Return true if the given iterator appears to point at a
3614      * property pattern.  Regardless of the result, return with the
3615      * iterator unchanged.
3616      * @param chars iterator over the pattern characters.  Upon return
3617      * it will be unchanged.
3618      * @param iterOpts RuleCharacterIterator options
3619      */
resemblesPropertyPattern(RuleCharacterIterator chars, int iterOpts)3620     private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
3621             int iterOpts) {
3622         boolean result = false;
3623         iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
3624         Object pos = chars.getPos(null);
3625         int c = chars.next(iterOpts);
3626         if (c == '[' || c == '\\') {
3627             int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
3628             result = (c == '[') ? (d == ':') :
3629                 (d == 'N' || d == 'p' || d == 'P');
3630         }
3631         chars.setPos(pos);
3632         return result;
3633     }
3634 
3635     /**
3636      * Parse the given property pattern at the given parse position.
3637      * @param symbols TODO
3638      */
applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols)3639     private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
3640         int pos = ppos.getIndex();
3641 
3642         // On entry, ppos should point to one of the following locations:
3643 
3644         // Minimum length is 5 characters, e.g. \p{L}
3645         if ((pos+5) > pattern.length()) {
3646             return null;
3647         }
3648 
3649         boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
3650         boolean isName = false; // true for \N{pat}, o/w false
3651         boolean invert = false;
3652 
3653         // Look for an opening [:, [:^, \p, or \P
3654         if (pattern.regionMatches(pos, "[:", 0, 2)) {
3655             posix = true;
3656             pos = PatternProps.skipWhiteSpace(pattern, (pos+2));
3657             if (pos < pattern.length() && pattern.charAt(pos) == '^') {
3658                 ++pos;
3659                 invert = true;
3660             }
3661         } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3662                 pattern.regionMatches(pos, "\\N", 0, 2)) {
3663             char c = pattern.charAt(pos+1);
3664             invert = (c == 'P');
3665             isName = (c == 'N');
3666             pos = PatternProps.skipWhiteSpace(pattern, (pos+2));
3667             if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
3668                 // Syntax error; "\p" or "\P" not followed by "{"
3669                 return null;
3670             }
3671         } else {
3672             // Open delimiter not seen
3673             return null;
3674         }
3675 
3676         // Look for the matching close delimiter, either :] or }
3677         int close = pattern.indexOf(posix ? ":]" : "}", pos);
3678         if (close < 0) {
3679             // Syntax error; close delimiter missing
3680             return null;
3681         }
3682 
3683         // Look for an '=' sign.  If this is present, we will parse a
3684         // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
3685         // pattern.
3686         int equals = pattern.indexOf('=', pos);
3687         String propName, valueName;
3688         if (equals >= 0 && equals < close && !isName) {
3689             // Equals seen; parse medium/long pattern
3690             propName = pattern.substring(pos, equals);
3691             valueName = pattern.substring(equals+1, close);
3692         }
3693 
3694         else {
3695             // Handle case where no '=' is seen, and \N{}
3696             propName = pattern.substring(pos, close);
3697             valueName = "";
3698 
3699             // Handle \N{name}
3700             if (isName) {
3701                 // This is a little inefficient since it means we have to
3702                 // parse "na" back to UProperty.NAME even though we already
3703                 // know it's UProperty.NAME.  If we refactor the API to
3704                 // support args of (int, String) then we can remove
3705                 // "na" and make this a little more efficient.
3706                 valueName = propName;
3707                 propName = "na";
3708             }
3709         }
3710 
3711         applyPropertyAlias(propName, valueName, symbols);
3712 
3713         if (invert) {
3714             complement();
3715         }
3716 
3717         // Move to the limit position after the close delimiter
3718         ppos.setIndex(close + (posix ? 2 : 1));
3719 
3720         return this;
3721     }
3722 
3723     /**
3724      * Parse a property pattern.
3725      * @param chars iterator over the pattern characters.  Upon return
3726      * it will be advanced to the first character after the parsed
3727      * pattern, or the end of the iteration if all characters are
3728      * parsed.
3729      * @param rebuiltPat the pattern that was parsed, rebuilt or
3730      * copied from the input pattern, as appropriate.
3731      * @param symbols TODO
3732      */
applyPropertyPattern(RuleCharacterIterator chars, Appendable rebuiltPat, SymbolTable symbols)3733     private void applyPropertyPattern(RuleCharacterIterator chars,
3734             Appendable rebuiltPat, SymbolTable symbols) {
3735         String patStr = chars.lookahead();
3736         ParsePosition pos = new ParsePosition(0);
3737         applyPropertyPattern(patStr, pos, symbols);
3738         if (pos.getIndex() == 0) {
3739             syntaxError(chars, "Invalid property pattern");
3740         }
3741         chars.jumpahead(pos.getIndex());
3742         append(rebuiltPat, patStr.substring(0, pos.getIndex()));
3743     }
3744 
3745     //----------------------------------------------------------------
3746     // Case folding API
3747     //----------------------------------------------------------------
3748 
3749     /**
3750      * Bitmask for constructor and applyPattern() indicating that
3751      * white space should be ignored.  If set, ignore Unicode Pattern_White_Space characters,
3752      * unless they are quoted or escaped.  This may be ORed together
3753      * with other selectors.
3754      * @stable ICU 3.8
3755      */
3756     public static final int IGNORE_SPACE = 1;
3757 
3758     /**
3759      * Bitmask for constructor, applyPattern(), and closeOver()
3760      * indicating letter case.  This may be ORed together with other
3761      * selectors.
3762      *
3763      * Enable case insensitive matching.  E.g., "[ab]" with this flag
3764      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
3765      * match all except 'a', 'A', 'b', and 'B'. This performs a full
3766      * closure over case mappings, e.g. U+017F for s.
3767      *
3768      * The resulting set is a superset of the input for the code points but
3769      * not for the strings.
3770      * It performs a case mapping closure of the code points and adds
3771      * full case folding strings for the code points, and reduces strings of
3772      * the original set to their full case folding equivalents.
3773      *
3774      * This is designed for case-insensitive matches, for example
3775      * in regular expressions. The full code point case closure allows checking of
3776      * an input character directly against the closure set.
3777      * Strings are matched by comparing the case-folded form from the closure
3778      * set with an incremental case folding of the string in question.
3779      *
3780      * The closure set will also contain single code points if the original
3781      * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
3782      * This is not necessary (that is, redundant) for the above matching method
3783      * but results in the same closure sets regardless of whether the original
3784      * set contained the code point or a string.
3785      * @stable ICU 3.8
3786      */
3787     public static final int CASE = 2;
3788 
3789     /**
3790      * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
3791      * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
3792      * @see #CASE
3793      * @stable ICU 3.4
3794      */
3795     public static final int CASE_INSENSITIVE = 2;
3796 
3797     /**
3798      * Bitmask for constructor, applyPattern(), and closeOver()
3799      * indicating letter case.  This may be ORed together with other
3800      * selectors.
3801      *
3802      * Enable case insensitive matching.  E.g., "[ab]" with this flag
3803      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
3804      * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
3805      * title-, and uppercase mappings as well as the case folding
3806      * of each existing element in the set.
3807      * @stable ICU 3.4
3808      */
3809     public static final int ADD_CASE_MAPPINGS = 4;
3810 
3811     //  add the result of a full case mapping to the set
3812     //  use str as a temporary string to avoid constructing one
addCaseMapping(UnicodeSet set, int result, StringBuilder full)3813     private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
3814         if(result >= 0) {
3815             if(result > UCaseProps.MAX_STRING_LENGTH) {
3816                 // add a single-code point case mapping
3817                 set.add(result);
3818             } else {
3819                 // add a string case mapping from full with length result
3820                 set.add(full.toString());
3821                 full.setLength(0);
3822             }
3823         }
3824         // result < 0: the code point mapped to itself, no need to add it
3825         // see UCaseProps
3826     }
3827 
3828     /**
3829      * Close this set over the given attribute.  For the attribute
3830      * CASE, the result is to modify this set so that:
3831      *
3832      * 1. For each character or string 'a' in this set, all strings
3833      * 'b' such that foldCase(a) == foldCase(b) are added to this set.
3834      * (For most 'a' that are single characters, 'b' will have
3835      * b.length() == 1.)
3836      *
3837      * 2. For each string 'e' in the resulting set, if e !=
3838      * foldCase(e), 'e' will be removed.
3839      *
3840      * Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
3841      *
3842      * (Here foldCase(x) refers to the operation
3843      * UCharacter.foldCase(x, true), and a == b actually denotes
3844      * a.equals(b), not pointer comparison.)
3845      *
3846      * @param attribute bitmask for attributes to close over.
3847      * Currently only the CASE bit is supported.  Any undefined bits
3848      * are ignored.
3849      * @return a reference to this set.
3850      * @stable ICU 3.8
3851      */
closeOver(int attribute)3852     public UnicodeSet closeOver(int attribute) {
3853         checkFrozen();
3854         if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
3855             UCaseProps csp = UCaseProps.INSTANCE;
3856             UnicodeSet foldSet = new UnicodeSet(this);
3857             ULocale root = ULocale.ROOT;
3858 
3859             // start with input set to guarantee inclusion
3860             // CASE: remove strings because the strings will actually be reduced (folded);
3861             //       therefore, start with no strings and add only those needed
3862             if((attribute & CASE) != 0) {
3863                 foldSet.strings.clear();
3864             }
3865 
3866             int n = getRangeCount();
3867             int result;
3868             StringBuilder full = new StringBuilder();
3869 
3870             for (int i=0; i<n; ++i) {
3871                 int start = getRangeStart(i);
3872                 int end   = getRangeEnd(i);
3873 
3874                 if((attribute & CASE) != 0) {
3875                     // full case closure
3876                     for (int cp=start; cp<=end; ++cp) {
3877                         csp.addCaseClosure(cp, foldSet);
3878                     }
3879                 } else {
3880                     // add case mappings
3881                     // (does not add long s for regular s, or Kelvin for k, for example)
3882                     for (int cp=start; cp<=end; ++cp) {
3883                         result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
3884                         addCaseMapping(foldSet, result, full);
3885 
3886                         result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
3887                         addCaseMapping(foldSet, result, full);
3888 
3889                         result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
3890                         addCaseMapping(foldSet, result, full);
3891 
3892                         result = csp.toFullFolding(cp, full, 0);
3893                         addCaseMapping(foldSet, result, full);
3894                     }
3895                 }
3896             }
3897             if (!strings.isEmpty()) {
3898                 if ((attribute & CASE) != 0) {
3899                     for (String s : strings) {
3900                         String str = UCharacter.foldCase(s, 0);
3901                         if(!csp.addStringCaseClosure(str, foldSet)) {
3902                             foldSet.add(str); // does not map to code points: add the folded string itself
3903                         }
3904                     }
3905                 } else {
3906                     BreakIterator bi = BreakIterator.getWordInstance(root);
3907                     for (String str : strings) {
3908                         // TODO: call lower-level functions
3909                         foldSet.add(UCharacter.toLowerCase(root, str));
3910                         foldSet.add(UCharacter.toTitleCase(root, str, bi));
3911                         foldSet.add(UCharacter.toUpperCase(root, str));
3912                         foldSet.add(UCharacter.foldCase(str, 0));
3913                     }
3914                 }
3915             }
3916             set(foldSet);
3917         }
3918         return this;
3919     }
3920 
3921     /**
3922      * Internal class for customizing UnicodeSet parsing of properties.
3923      * TODO: extend to allow customizing of codepoint ranges
3924      * @draft ICU3.8 (retain)
3925      * @provisional This API might change or be removed in a future release.
3926      * @author medavis
3927      */
3928     abstract public static class XSymbolTable implements SymbolTable {
3929         /**
3930          * Default constructor
3931          * @draft ICU3.8 (retain)
3932          * @provisional This API might change or be removed in a future release.
3933          */
XSymbolTable()3934         public XSymbolTable(){}
3935         /**
3936          * Supplies default implementation for SymbolTable (no action).
3937          * @draft ICU3.8 (retain)
3938          * @provisional This API might change or be removed in a future release.
3939          */
3940         @Override
lookupMatcher(int i)3941         public UnicodeMatcher lookupMatcher(int i) {
3942             return null;
3943         }
3944 
3945         /**
3946          * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style
3947          * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be.
3948          * <p>
3949          * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a
3950          * propertyName+propertyValue combination.
3951          *
3952          * @param propertyName
3953          *            the name of the property
3954          * @param propertyValue
3955          *            the name of the property value
3956          * @param result UnicodeSet value to change
3957          *            a set to which the characters having the propertyName+propertyValue are to be added.
3958          * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters
3959          *         with that property have been added to the UnicodeSet, and returns false if the
3960          *         propertyName+propertyValue combination is not recognized (in which case result is unaltered).
3961          * @draft ICU3.8 (retain)
3962          * @provisional This API might change or be removed in a future release.
3963          */
applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result)3964         public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) {
3965             return false;
3966         }
3967         /**
3968          * Supplies default implementation for SymbolTable (no action).
3969          * @draft ICU3.8 (retain)
3970          * @provisional This API might change or be removed in a future release.
3971          */
3972         @Override
lookup(String s)3973         public char[] lookup(String s) {
3974             return null;
3975         }
3976         /**
3977          * Supplies default implementation for SymbolTable (no action).
3978          * @draft ICU3.8 (retain)
3979          * @provisional This API might change or be removed in a future release.
3980          */
3981         @Override
parseReference(String text, ParsePosition pos, int limit)3982         public String parseReference(String text, ParsePosition pos, int limit) {
3983             return null;
3984         }
3985     }
3986 
3987     /**
3988      * Is this frozen, according to the Freezable interface?
3989      *
3990      * @return value
3991      * @stable ICU 3.8
3992      */
3993     @Override
isFrozen()3994     public boolean isFrozen() {
3995         return (bmpSet != null || stringSpan != null);
3996     }
3997 
3998     /**
3999      * Freeze this class, according to the Freezable interface.
4000      *
4001      * @return this
4002      * @stable ICU 4.4
4003      */
4004     @Override
freeze()4005     public UnicodeSet freeze() {
4006         if (!isFrozen()) {
4007             // Do most of what compact() does before freezing because
4008             // compact() will not work when the set is frozen.
4009             // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
4010 
4011             // Delete buffer first to defragment memory less.
4012             buffer = null;
4013             if (list.length > (len + GROW_EXTRA)) {
4014                 // Make the capacity equal to len or 1.
4015                 // We don't want to realloc of 0 size.
4016                 int capacity = (len == 0) ? 1 : len;
4017                 int[] oldList = list;
4018                 list = new int[capacity];
4019                 for (int i = capacity; i-- > 0;) {
4020                     list[i] = oldList[i];
4021                 }
4022             }
4023 
4024             // Optimize contains() and span() and similar functions.
4025             if (!strings.isEmpty()) {
4026                 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL);
4027             }
4028             if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
4029                 // Optimize for code point spans.
4030                 // There are no strings, or
4031                 // all strings are irrelevant for span() etc. because
4032                 // all of each string's code points are contained in this set.
4033                 // However, fully contained strings are relevant for spanAndCount(),
4034                 // so we create both objects.
4035                 bmpSet = new BMPSet(list, len);
4036             }
4037         }
4038         return this;
4039     }
4040 
4041     /**
4042      * Span a string using this UnicodeSet.
4043      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4044      * @param s The string to be spanned
4045      * @param spanCondition The span condition
4046      * @return the length of the span
4047      * @stable ICU 4.4
4048      */
span(CharSequence s, SpanCondition spanCondition)4049     public int span(CharSequence s, SpanCondition spanCondition) {
4050         return span(s, 0, spanCondition);
4051     }
4052 
4053     /**
4054      * Span a string using this UnicodeSet.
4055      *   If the start index is less than 0, span will start from 0.
4056      *   If the start index is greater than the string length, span returns the string length.
4057      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4058      * @param s The string to be spanned
4059      * @param start The start index that the span begins
4060      * @param spanCondition The span condition
4061      * @return the string index which ends the span (i.e. exclusive)
4062      * @stable ICU 4.4
4063      */
span(CharSequence s, int start, SpanCondition spanCondition)4064     public int span(CharSequence s, int start, SpanCondition spanCondition) {
4065         int end = s.length();
4066         if (start < 0) {
4067             start = 0;
4068         } else if (start >= end) {
4069             return end;
4070         }
4071         if (bmpSet != null) {
4072             // Frozen set without strings, or no string is relevant for span().
4073             return bmpSet.span(s, start, spanCondition, null);
4074         }
4075         if (stringSpan != null) {
4076             return stringSpan.span(s, start, spanCondition);
4077         } else if (!strings.isEmpty()) {
4078             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
4079                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
4080             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
4081             if (strSpan.needsStringSpanUTF16()) {
4082                 return strSpan.span(s, start, spanCondition);
4083             }
4084         }
4085 
4086         return spanCodePointsAndCount(s, start, spanCondition, null);
4087     }
4088 
4089     /**
4090      * Same as span() but also counts the smallest number of set elements on any path across the span.
4091      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4092      * @param outCount An output-only object (must not be null) for returning the count.
4093      * @return the limit (exclusive end) of the span
4094      * @internal
4095      * @deprecated This API is ICU internal only.
4096      */
4097     @Deprecated
spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4098     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
4099         if (outCount == null) {
4100             throw new IllegalArgumentException("outCount must not be null");
4101         }
4102         int end = s.length();
4103         if (start < 0) {
4104             start = 0;
4105         } else if (start >= end) {
4106             return end;
4107         }
4108         if (stringSpan != null) {
4109             // We might also have bmpSet != null,
4110             // but fully-contained strings are relevant for counting elements.
4111             return stringSpan.spanAndCount(s, start, spanCondition, outCount);
4112         } else if (bmpSet != null) {
4113             return bmpSet.span(s, start, spanCondition, outCount);
4114         } else if (!strings.isEmpty()) {
4115             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
4116                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
4117             which |= UnicodeSetStringSpan.WITH_COUNT;
4118             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
4119             return strSpan.spanAndCount(s, start, spanCondition, outCount);
4120         }
4121 
4122         return spanCodePointsAndCount(s, start, spanCondition, outCount);
4123     }
4124 
spanCodePointsAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4125     private int spanCodePointsAndCount(CharSequence s, int start,
4126             SpanCondition spanCondition, OutputInt outCount) {
4127         // Pin to 0/1 values.
4128         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
4129 
4130         int c;
4131         int next = start;
4132         int length = s.length();
4133         int count = 0;
4134         do {
4135             c = Character.codePointAt(s, next);
4136             if (spanContained != contains(c)) {
4137                 break;
4138             }
4139             ++count;
4140             next += Character.charCount(c);
4141         } while (next < length);
4142         if (outCount != null) { outCount.value = count; }
4143         return next;
4144     }
4145 
4146     /**
4147      * Span a string backwards (from the end) using this UnicodeSet.
4148      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4149      * @param s The string to be spanned
4150      * @param spanCondition The span condition
4151      * @return The string index which starts the span (i.e. inclusive).
4152      * @stable ICU 4.4
4153      */
spanBack(CharSequence s, SpanCondition spanCondition)4154     public int spanBack(CharSequence s, SpanCondition spanCondition) {
4155         return spanBack(s, s.length(), spanCondition);
4156     }
4157 
4158     /**
4159      * Span a string backwards (from the fromIndex) using this UnicodeSet.
4160      * If the fromIndex is less than 0, spanBack will return 0.
4161      * If fromIndex is greater than the string length, spanBack will start from the string length.
4162      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4163      * @param s The string to be spanned
4164      * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards
4165      * @param spanCondition The span condition
4166      * @return The string index which starts the span (i.e. inclusive).
4167      * @stable ICU 4.4
4168      */
spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition)4169     public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
4170         if (fromIndex <= 0) {
4171             return 0;
4172         }
4173         if (fromIndex > s.length()) {
4174             fromIndex = s.length();
4175         }
4176         if (bmpSet != null) {
4177             // Frozen set without strings, or no string is relevant for spanBack().
4178             return bmpSet.spanBack(s, fromIndex, spanCondition);
4179         }
4180         if (stringSpan != null) {
4181             return stringSpan.spanBack(s, fromIndex, spanCondition);
4182         } else if (!strings.isEmpty()) {
4183             int which = (spanCondition == SpanCondition.NOT_CONTAINED)
4184                     ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
4185                             : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
4186             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
4187             if (strSpan.needsStringSpanUTF16()) {
4188                 return strSpan.spanBack(s, fromIndex, spanCondition);
4189             }
4190         }
4191 
4192         // Pin to 0/1 values.
4193         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
4194 
4195         int c;
4196         int prev = fromIndex;
4197         do {
4198             c = Character.codePointBefore(s, prev);
4199             if (spanContained != contains(c)) {
4200                 break;
4201             }
4202             prev -= Character.charCount(c);
4203         } while (prev > 0);
4204         return prev;
4205     }
4206 
4207     /**
4208      * Clone a thawed version of this class, according to the Freezable interface.
4209      * @return the clone, not frozen
4210      * @stable ICU 4.4
4211      */
4212     @Override
cloneAsThawed()4213     public UnicodeSet cloneAsThawed() {
4214         UnicodeSet result = new UnicodeSet(this);
4215         assert !result.isFrozen();
4216         return result;
4217     }
4218 
4219     // internal function
checkFrozen()4220     private void checkFrozen() {
4221         if (isFrozen()) {
4222             throw new UnsupportedOperationException("Attempt to modify frozen object");
4223         }
4224     }
4225 
4226     // ************************
4227     // Additional methods for integration with Generics and Collections
4228     // ************************
4229 
4230     /**
4231      * A struct-like class used for iteration through ranges, for faster iteration than by String.
4232      * Read about the restrictions on usage in {@link UnicodeSet#ranges()}.
4233      *
4234      * @stable ICU 54
4235      */
4236     public static class EntryRange {
4237         /**
4238          * The starting code point of the range.
4239          *
4240          * @stable ICU 54
4241          */
4242         public int codepoint;
4243         /**
4244          * The ending code point of the range
4245          *
4246          * @stable ICU 54
4247          */
4248         public int codepointEnd;
4249 
EntryRange()4250         EntryRange() {
4251         }
4252 
4253         /**
4254          * {@inheritDoc}
4255          *
4256          * @stable ICU 54
4257          */
4258         @Override
toString()4259         public String toString() {
4260             StringBuilder b = new StringBuilder();
4261             return (
4262                     codepoint == codepointEnd ? _appendToPat(b, codepoint, false)
4263                             : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false))
4264                             .toString();
4265         }
4266     }
4267 
4268     /**
4269      * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points.
4270      * The UnicodeSet must not be altered during the iteration.
4271      * The EntryRange instance is the same each time; the contents are just reset.
4272      *
4273      * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings.
4274      *
4275      * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
4276      * Do not alter the UnicodeSet while iterating.
4277      *
4278      * <pre>
4279      * // Sample code
4280      * for (EntryRange range : us1.ranges()) {
4281      *     // do something with code points between range.codepoint and range.codepointEnd;
4282      * }
4283      * for (String s : us1.strings()) {
4284      *     // do something with each string;
4285      * }
4286      * </pre>
4287      *
4288      * @stable ICU 54
4289      */
ranges()4290     public Iterable<EntryRange> ranges() {
4291         return new EntryRangeIterable();
4292     }
4293 
4294     private class EntryRangeIterable implements Iterable<EntryRange> {
4295         @Override
iterator()4296         public Iterator<EntryRange> iterator() {
4297             return new EntryRangeIterator();
4298         }
4299     }
4300 
4301     private class EntryRangeIterator implements Iterator<EntryRange> {
4302         int pos;
4303         EntryRange result = new EntryRange();
4304 
4305         @Override
hasNext()4306         public boolean hasNext() {
4307             return pos < len-1;
4308         }
4309         @Override
next()4310         public EntryRange next() {
4311             if (pos < len-1) {
4312                 result.codepoint = list[pos++];
4313                 result.codepointEnd = list[pos++]-1;
4314             } else {
4315                 throw new NoSuchElementException();
4316             }
4317             return result;
4318         }
4319         @Override
remove()4320         public void remove() {
4321             throw new UnsupportedOperationException();
4322         }
4323     }
4324 
4325 
4326     /**
4327      * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}.
4328      * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
4329      * Do not alter the UnicodeSet while iterating.
4330      * @see java.util.Set#iterator()
4331      * @stable ICU 4.4
4332      */
4333     @Override
iterator()4334     public Iterator<String> iterator() {
4335         return new UnicodeSetIterator2(this);
4336     }
4337 
4338     // Cover for string iteration.
4339     private static class UnicodeSetIterator2 implements Iterator<String> {
4340         // Invariants:
4341         // sourceList != null then sourceList[item] is a valid character
4342         // sourceList == null then delegates to stringIterator
4343         private int[] sourceList;
4344         private int len;
4345         private int item;
4346         private int current;
4347         private int limit;
4348         private TreeSet<String> sourceStrings;
4349         private Iterator<String> stringIterator;
4350         private char[] buffer;
4351 
UnicodeSetIterator2(UnicodeSet source)4352         UnicodeSetIterator2(UnicodeSet source) {
4353             // set according to invariants
4354             len = source.len - 1;
4355             if (len > 0) {
4356                 sourceStrings = source.strings;
4357                 sourceList = source.list;
4358                 current = sourceList[item++];
4359                 limit = sourceList[item++];
4360             } else {
4361                 stringIterator = source.strings.iterator();
4362                 sourceList = null;
4363             }
4364         }
4365 
4366         /* (non-Javadoc)
4367          * @see java.util.Iterator#hasNext()
4368          */
4369         @Override
hasNext()4370         public boolean hasNext() {
4371             return sourceList != null || stringIterator.hasNext();
4372         }
4373 
4374         /* (non-Javadoc)
4375          * @see java.util.Iterator#next()
4376          */
4377         @Override
next()4378         public String next() {
4379             if (sourceList == null) {
4380                 return stringIterator.next();
4381             }
4382             int codepoint = current++;
4383             // we have the codepoint we need, but we may need to adjust the state
4384             if (current >= limit) {
4385                 if (item >= len) {
4386                     stringIterator = sourceStrings.iterator();
4387                     sourceList = null;
4388                 } else {
4389                     current = sourceList[item++];
4390                     limit = sourceList[item++];
4391                 }
4392             }
4393             // Now return. Single code point is easy
4394             if (codepoint <= 0xFFFF) {
4395                 return String.valueOf((char)codepoint);
4396             }
4397             // But Java lacks a valueOfCodePoint, so we handle ourselves for speed
4398             // allocate a buffer the first time, to make conversion faster.
4399             if (buffer == null) {
4400                 buffer = new char[2];
4401             }
4402             // compute ourselves, to save tests and calls
4403             int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT;
4404             buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE);
4405             buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE);
4406             return String.valueOf(buffer);
4407         }
4408 
4409         /* (non-Javadoc)
4410          * @see java.util.Iterator#remove()
4411          */
4412         @Override
remove()4413         public void remove() {
4414             throw new UnsupportedOperationException();
4415         }
4416     }
4417 
4418     /**
4419      * @see #containsAll(com.ibm.icu.text.UnicodeSet)
4420      * @stable ICU 4.4
4421      */
containsAll(Iterable<T> collection)4422     public <T extends CharSequence> boolean containsAll(Iterable<T> collection) {
4423         for (T o : collection) {
4424             if (!contains(o)) {
4425                 return false;
4426             }
4427         }
4428         return true;
4429     }
4430 
4431     /**
4432      * @see #containsNone(com.ibm.icu.text.UnicodeSet)
4433      * @stable ICU 4.4
4434      */
containsNone(Iterable<T> collection)4435     public <T extends CharSequence> boolean containsNone(Iterable<T> collection) {
4436         for (T o : collection) {
4437             if (contains(o)) {
4438                 return false;
4439             }
4440         }
4441         return true;
4442     }
4443 
    /**
     * Tests whether at least one element produced by the given iterable is contained in this set.
     * This is exactly the negation of {@link #containsNone(Iterable)}.
     * @param collection the character sequences to look up
     * @return true if some element of {@code collection} is in this set
     * @see #containsSome(com.ibm.icu.text.UnicodeSet)
     * @stable ICU 4.4
     */
    public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) {
        return !containsNone(collection);
    }
4451 
4452     /**
4453      * @see #addAll(com.ibm.icu.text.UnicodeSet)
4454      * @stable ICU 4.4
4455      */
4456     @SuppressWarnings("unchecked")  // See ticket #11395, this is safe.
addAll(T... collection)4457     public <T extends CharSequence> UnicodeSet addAll(T... collection) {
4458         checkFrozen();
4459         for (T str : collection) {
4460             add(str);
4461         }
4462         return this;
4463     }
4464 
4465 
4466     /**
4467      * @see #removeAll(com.ibm.icu.text.UnicodeSet)
4468      * @stable ICU 4.4
4469      */
removeAll(Iterable<T> collection)4470     public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) {
4471         checkFrozen();
4472         for (T o : collection) {
4473             remove(o);
4474         }
4475         return this;
4476     }
4477 
4478     /**
4479      * @see #retainAll(com.ibm.icu.text.UnicodeSet)
4480      * @stable ICU 4.4
4481      */
retainAll(Iterable<T> collection)4482     public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) {
4483         checkFrozen();
4484         // TODO optimize
4485         UnicodeSet toRetain = new UnicodeSet();
4486         toRetain.addAll(collection);
4487         retainAll(toRetain);
4488         return this;
4489     }
4490 
    /**
     * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}.
     * The style decides whether the sizes of the two operands are compared before their
     * contents, and in which direction.
     * @stable ICU 4.4
     */
    public enum ComparisonStyle {
        /**
         * If the operands differ in size, the smaller one sorts first; operands of equal
         * size are compared by content.
         * @stable ICU 4.4
         */
        SHORTER_FIRST,
        /**
         * Sizes are not compared; the operands are compared by content only.
         * @stable ICU 4.4
         */
        LEXICOGRAPHIC,
        /**
         * If the operands differ in size, the larger one sorts first; operands of equal
         * size are compared by content.
         * @stable ICU 4.4
         */
        LONGER_FIRST
    }
4509 
    /**
     * Compares UnicodeSets, where shorter come first, and otherwise lexicographically
     * (according to the comparison of the first characters that differ).
     * Equivalent to {@code compareTo(o, ComparisonStyle.SHORTER_FIRST)}.
     * @param o the set to compare against
     * @return a negative value, zero, or a positive value as this set sorts before,
     *         equal to, or after {@code o}
     * @see java.lang.Comparable#compareTo(java.lang.Object)
     * @stable ICU 4.4
     */
    @Override
    public int compareTo(UnicodeSet o) {
        return compareTo(o, ComparisonStyle.SHORTER_FIRST);
    }
4520     /**
4521      * Compares UnicodeSets, in three different ways.
4522      * @see java.lang.Comparable#compareTo(java.lang.Object)
4523      * @stable ICU 4.4
4524      */
compareTo(UnicodeSet o, ComparisonStyle style)4525     public int compareTo(UnicodeSet o, ComparisonStyle style) {
4526         if (style != ComparisonStyle.LEXICOGRAPHIC) {
4527             int diff = size() - o.size();
4528             if (diff != 0) {
4529                 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
4530             }
4531         }
4532         int result;
4533         for (int i = 0; ; ++i) {
4534             if (0 != (result = list[i] - o.list[i])) {
4535                 // if either list ran out, compare to the last string
4536                 if (list[i] == HIGH) {
4537                     if (strings.isEmpty()) return 1;
4538                     String item = strings.first();
4539                     return compare(item, o.list[i]);
4540                 }
4541                 if (o.list[i] == HIGH) {
4542                     if (o.strings.isEmpty()) return -1;
4543                     String item = o.strings.first();
4544                     int compareResult = compare(item, list[i]);
4545                     return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order.
4546                 }
4547                 // otherwise return the result if even index, or the reversal if not
4548                 return (i & 1) == 0 ? result : -result;
4549             }
4550             if (list[i] == HIGH) {
4551                 break;
4552             }
4553         }
4554         return compare(strings, o.strings);
4555     }
4556 
    /**
     * Compares the elements of this set, in iteration order, with the elements produced by
     * {@code other}, by delegating to {@link #compare(Iterable, Iterable)}.
     * @param other the strings to compare against; their iteration order matters
     * @return a negative value, zero, or a positive value as determined by
     *         {@link #compare(Iterable, Iterable)}
     * @stable ICU 4.4
     */
    public int compareTo(Iterable<String> other) {
        return compare(this, other);
    }
4563 
    /**
     * Utility to compare a string to a code point.
     * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString())
     * and comparing, but much faster (no object creation).
     * Actually, there is one difference; a null compares as less.
     * Note that this (=String) order is UTF-16 order -- *not* code point order.
     * <p>This is a cover for {@code CharSequences.compare(string, codePoint)}.
     * @param string the string operand (left-hand side of the comparison)
     * @param codePoint the code point operand (right-hand side of the comparison)
     * @return the value returned by {@code CharSequences.compare(string, codePoint)}
     * @stable ICU 4.4
     */

    public static int compare(CharSequence string, int codePoint) {
        return CharSequences.compare(string, codePoint);
    }
4576 
    /**
     * Utility to compare a code point to a string.
     * Same results as turning the code point into a string and comparing, but much faster (no object creation).
     * Actually, there is one difference; a null compares as less.
     * Note that this (=String) order is UTF-16 order -- *not* code point order.
     * <p>The result is the negation of {@code CharSequences.compare(string, codePoint)},
     * i.e. the same comparison with the operands swapped.
     * @param codePoint the code point operand (left-hand side of the comparison)
     * @param string the string operand (right-hand side of the comparison)
     * @return the negated value of {@code CharSequences.compare(string, codePoint)}
     * @stable ICU 4.4
     */
    public static int compare(int codePoint, CharSequence string) {
        return -CharSequences.compare(string, codePoint);
    }
4587 
4588 
    /**
     * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered,
     * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
     * That means that sets can't be compared directly with this method, unless they are TreeSets without
     * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
     * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
     * <p>Obtains one iterator from each argument and delegates to the iterator-based comparison.
     * @param collection1 the first sequence of elements
     * @param collection2 the second sequence of elements
     * @return the result of comparing the two iterators element by element
     * @stable ICU 4.4
     */
    public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) {
        return compare(collection1.iterator(), collection2.iterator());
    }
4600 
4601     /**
4602      * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered,
4603      * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
4604      * That means that sets can't be compared directly with this method, unless they are TreeSets without
4605      * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
4606      * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
4607      * @internal
4608      * @deprecated This API is ICU internal only.
4609      */
4610     @Deprecated
compare(Iterator<T> first, Iterator<T> other)4611     public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) {
4612         while (true) {
4613             if (!first.hasNext()) {
4614                 return other.hasNext() ? -1 : 0;
4615             } else if (!other.hasNext()) {
4616                 return 1;
4617             }
4618             T item1 = first.next();
4619             T item2 = other.next();
4620             int result = item1.compareTo(item2);
4621             if (result != 0) {
4622                 return result;
4623             }
4624         }
4625     }
4626 
4627 
4628     /**
4629      * Utility to compare two collections, optionally by size, and then lexicographically.
4630      * @stable ICU 4.4
4631      */
compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style)4632     public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) {
4633         if (style != ComparisonStyle.LEXICOGRAPHIC) {
4634             int diff = collection1.size() - collection2.size();
4635             if (diff != 0) {
4636                 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
4637             }
4638         }
4639         return compare(collection1, collection2);
4640     }
4641 
4642     /**
4643      * Utility for adding the contents of an iterable to a collection.
4644      * @stable ICU 4.4
4645      */
addAllTo(Iterable<T> source, U target)4646     public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) {
4647         for (T item : source) {
4648             target.add(item);
4649         }
4650         return target;
4651     }
4652 
4653     /**
4654      * Utility for adding the contents of an iterable to a collection.
4655      * @stable ICU 4.4
4656      */
addAllTo(Iterable<T> source, T[] target)4657     public static <T> T[] addAllTo(Iterable<T> source, T[] target) {
4658         int i = 0;
4659         for (T item : source) {
4660             target[i++] = item;
4661         }
4662         return target;
4663     }
4664 
    /**
     * For iterating through the strings in the set. Example:
     * <pre>
     * for (String key : myUnicodeSet.strings()) {
     *   doSomethingWith(key);
     * }
     * </pre>
     * The result is an unmodifiable wrapper around this set's internal sorted string
     * collection, not a copy.
     * @return an unmodifiable view of the strings in this set
     * @stable ICU 4.4
     */
    public Collection<String> strings() {
        return Collections.unmodifiableSortedSet(strings);
    }
4677 
    /**
     * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE.
     * This is a cover for {@code CharSequences.getSingleCodePoint(s)}.
     * @param s the character sequence to inspect
     * @return the value returned by {@code CharSequences.getSingleCodePoint(s)}
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static int getSingleCodePoint(CharSequence s) {
        return CharSequences.getSingleCodePoint(s);
    }
4687 
4688     /**
4689      * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set.
4690      * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E
4691      * if the dontCare set includes unassigned characters (for a particular version of Unicode).
4692      * @param dontCare Set with the don't-care characters for spanning
4693      * @return the input set, modified
4694      * @internal
4695      * @deprecated This API is ICU internal only.
4696      */
4697     @Deprecated
addBridges(UnicodeSet dontCare)4698     public UnicodeSet addBridges(UnicodeSet dontCare) {
4699         UnicodeSet notInInput = new UnicodeSet(this).complement();
4700         for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) {
4701             if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) {
4702                 add(it.codepoint,it.codepointEnd);
4703             }
4704         }
4705         return this;
4706     }
4707 
4708     /**
4709      * Find the first index at or after fromIndex where the UnicodeSet matches at that index.
4710      * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match.
4711      * If there is no match, length is returned.
4712      * @internal
4713      * @deprecated This API is ICU internal only. Use span instead.
4714      */
4715     @Deprecated
findIn(CharSequence value, int fromIndex, boolean findNot)4716     public int findIn(CharSequence value, int fromIndex, boolean findNot) {
4717         //TODO add strings, optimize, using ICU4C algorithms
4718         int cp;
4719         for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) {
4720             cp = UTF16.charAt(value, fromIndex);
4721             if (contains(cp) != findNot) {
4722                 break;
4723             }
4724         }
4725         return fromIndex;
4726     }
4727 
4728     /**
4729      * Find the last index before fromIndex where the UnicodeSet matches at that index.
4730      * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match.
4731      * If there is no match, -1 is returned.
4732      * BEFORE index is not in the UnicodeSet.
4733      * @internal
4734      * @deprecated This API is ICU internal only. Use spanBack instead.
4735      */
4736     @Deprecated
findLastIn(CharSequence value, int fromIndex, boolean findNot)4737     public int findLastIn(CharSequence value, int fromIndex, boolean findNot) {
4738         //TODO add strings, optimize, using ICU4C algorithms
4739         int cp;
4740         fromIndex -= 1;
4741         for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) {
4742             cp = UTF16.charAt(value, fromIndex);
4743             if (contains(cp) != findNot) {
4744                 break;
4745             }
4746         }
4747         return fromIndex < 0 ? -1 : fromIndex;
4748     }
4749 
4750     /**
4751      * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match.
4752      * @param source The source of the CharSequence to strip from.
4753      * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object.
4754      * @return The string after it has been stripped.
4755      * @internal
4756      * @deprecated This API is ICU internal only. Use replaceFrom.
4757      */
4758     @Deprecated
stripFrom(CharSequence source, boolean matches)4759     public String stripFrom(CharSequence source, boolean matches) {
4760         StringBuilder result = new StringBuilder();
4761         for (int pos = 0; pos < source.length();) {
4762             int inside = findIn(source, pos, !matches);
4763             result.append(source.subSequence(pos, inside));
4764             pos = findIn(source, inside, matches); // get next start
4765         }
4766         return result.toString();
4767     }
4768 
    /**
     * Argument values for whether span() and similar functions continue while the current character is contained vs.
     * not contained in the set.
     * <p>
     * The functionality is straightforward for sets with only single code points, without strings (which is the common
     * case):
     * <ul>
     * <li>CONTAINED and SIMPLE work the same.
     * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
     * <li>span() and spanBack() partition any string the
     * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
     * <li>Using a
     * complemented (inverted) set and the opposite span conditions yields the same results.
     * </ul>
     * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
     * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
     * strings:
     * <ul>
     * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
     * Therefore, complementing both the set and the span conditions may yield different results.
     * <li>When starting spans
     * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
     * because a set string may start before the later position.
     * <li>span(SIMPLE) may be shorter than
     * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
     * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
     * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
     * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
     * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
     * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
     * </ul>
     * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
     * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
     * be used.
     * <p>
     * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
     * boundaries, never in the middle of a surrogate pair.
     *
     * @stable ICU 4.4
     */
    public enum SpanCondition {
        /**
         * Continues a span() while there is no set element at the current position.
         * Increments by one code point at a time.
         * Stops before the first set element (character or string).
         * (For code points only, this is like while {@code contains(current)==false}).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of
         * characters that are not in the set, and none of its strings overlap with the span.
         *
         * @stable ICU 4.4
         */
        NOT_CONTAINED,

        /**
         * Spans the longest substring that is a concatenation of set elements (characters or strings).
         * (For characters only, this is like while {@code contains(current)==true}).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of set
         * elements (characters or strings) that are in the set.
         * <p>
         * If a set contains strings, then the span will be the longest substring for which there
         * exists at least one non-overlapping concatenation of set elements (characters or strings).
         * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
         * (Java/ICU/Perl regex stops at the first match of an OR.)
         *
         * @stable ICU 4.4
         */
        CONTAINED,

        /**
         * Continues a span() while there is a set element at the current position.
         * Increments by the longest matching element at each position.
         * (For characters only, this is like while {@code contains(current)==true}).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of set
         * elements (characters or strings) that are in the set.
         * <p>
         * If a set only contains single characters, then this is the same as CONTAINED.
         * <p>
         * If a set contains strings, then the span will be the longest substring with a match at each position with the
         * longest single set element (character or string).
         * <p>
         * Use this span condition together with other longest-match algorithms, such as ICU converters
         * (ucnv_getUnicodeSet()).
         *
         * @stable ICU 4.4
         */
        SIMPLE,

        /**
         * One more than the last span condition.
         * Declared last, after the three span conditions above.
         *
         * @stable ICU 4.4
         */
        CONDITION_COUNT
    }
4866 
    /**
     * Get the default symbol table. Null means ordinary processing. For internal use only.
     * Returns the value most recently stored in the static {@code XSYMBOL_TABLE} field.
     * @return the symbol table
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static XSymbolTable getDefaultXSymbolTable() {
        return XSYMBOL_TABLE;
    }
4877 
    /**
     * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing
     * of UnicodeSets.
     * <p>
     * As a side effect, the static {@code INCLUSIONS} field is reset to null before the new table is stored.
     * <p>
     * WARNING: If this function is used with a UnicodeProperty, and the
     * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
     * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
     * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
     *
     * @param xSymbolTable the new default symbol table.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
        INCLUSIONS = null; // If the properties override inclusions, these have to be regenerated.
        XSYMBOL_TABLE = xSymbolTable;
    }
4896 }
4897 //eof
4898