• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html
4 /*
5  *******************************************************************************
6  * Copyright (C) 1996-2016, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 package android.icu.text;
11 
12 import java.io.IOException;
13 import java.text.ParsePosition;
14 import java.util.ArrayList;
15 import java.util.Arrays;
16 import java.util.Collection;
17 import java.util.Collections;
18 import java.util.Iterator;
19 import java.util.NoSuchElementException;
20 import java.util.SortedSet;
21 import java.util.Spliterator;
22 import java.util.TreeSet;
23 import java.util.function.IntConsumer;
24 import java.util.stream.IntStream;
25 import java.util.stream.Stream;
26 import java.util.stream.StreamSupport;
27 
28 import android.icu.impl.BMPSet;
29 import android.icu.impl.CharacterPropertiesImpl;
30 import android.icu.impl.PatternProps;
31 import android.icu.impl.RuleCharacterIterator;
32 import android.icu.impl.SortedSetRelation;
33 import android.icu.impl.StringRange;
34 import android.icu.impl.UCaseProps;
35 import android.icu.impl.UCharacterProperty;
36 import android.icu.impl.UPropertyAliases;
37 import android.icu.impl.UnicodeSetStringSpan;
38 import android.icu.impl.Utility;
39 import android.icu.lang.CharSequences;
40 import android.icu.lang.CharacterProperties;
41 import android.icu.lang.UCharacter;
42 import android.icu.lang.UProperty;
43 import android.icu.lang.UScript;
44 import android.icu.util.Freezable;
45 import android.icu.util.ICUUncheckedIOException;
46 import android.icu.util.OutputInt;
47 import android.icu.util.ULocale;
48 import android.icu.util.VersionInfo;
49 
50 /**
51  * A mutable set of Unicode characters and multicharacter strings.
52  * Objects of this class represent <em>character classes</em> used
53  * in regular expressions. A character specifies a subset of Unicode
54  * code points.  Legal code points are U+0000 to U+10FFFF, inclusive.
55  *
56  * Note: method freeze() will not only make the set immutable, but
57  * also makes important methods much higher performance:
58  * contains(c), containsNone(...), span(...), spanBack(...) etc.
59  * After the object is frozen, any subsequent call that wants to change
60  * the object will throw UnsupportedOperationException.
61  *
62  * <p>The UnicodeSet class is not designed to be subclassed.
63  *
64  * <p><code>UnicodeSet</code> supports two APIs. The first is the
65  * <em>operand</em> API that allows the caller to modify the value of
66  * a <code>UnicodeSet</code> object. It conforms to Java 2's
67  * <code>java.util.Set</code> interface, although
68  * <code>UnicodeSet</code> does not actually implement that
69  * interface. All methods of <code>Set</code> are supported, with the
70  * modification that they take a character range or single character
71  * instead of an <code>Object</code>, and they take a
72  * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
73  * operand API may be thought of in terms of boolean logic: a boolean
74  * OR is implemented by <code>add</code>, a boolean AND is implemented
75  * by <code>retain</code>, a boolean XOR is implemented by
76  * <code>complement</code> taking an argument, and a boolean NOT is
77  * implemented by <code>complement</code> with no argument.  In terms
78  * of traditional set theory function names, <code>add</code> is a
79  * union, <code>retain</code> is an intersection, <code>remove</code>
80  * is an asymmetric difference, and <code>complement</code> with no
81  * argument is a set complement with respect to the superset range
82  * <code>MIN_VALUE-MAX_VALUE</code>
83  *
84  * <p>The second API is the
85  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
86  * <code>java.text.Format</code>-derived classes.  Unlike the
87  * methods that add characters, add categories, and control the logic
88  * of the set, the method <code>applyPattern()</code> sets all
89  * attributes of a <code>UnicodeSet</code> at once, based on a
90  * string pattern.
91  *
92  * <p><b>Pattern syntax</b></p>
93  *
94  * Patterns are accepted by the constructors and the
95  * <code>applyPattern()</code> methods and returned by the
96  * <code>toPattern()</code> method.  These patterns follow a syntax
97  * similar to that employed by version 8 regular expression character
98  * classes.  Here are some simple examples:
99  *
100  * <blockquote>
101  *   <table>
102  *     <tr style="vertical-align: top">
103  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td>
104  *       <td style="vertical-align: top;">No characters</td>
105  *     </tr><tr style="vertical-align: top">
106  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td>
107  *       <td style="vertical-align: top;">The character 'a'</td>
108  *     </tr><tr style="vertical-align: top">
109  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td>
110  *       <td style="vertical-align: top;">The characters 'a' and 'e'</td>
111  *     </tr>
112  *     <tr>
113  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td>
114  *       <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code
115  *       point order</td>
116  *     </tr>
117  *     <tr>
118  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td>
119  *       <td style="vertical-align: top;">The character U+4E01</td>
120  *     </tr>
121  *     <tr>
122  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td>
123  *       <td style="vertical-align: top;">The character 'a' and the multicharacter strings &quot;ab&quot; and
124  *       &quot;ac&quot;</td>
125  *     </tr>
126  *     <tr>
127  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td>
128  *       <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td>
129  *     </tr>
130  *   </table>
131  * </blockquote>
132  *
133  * Any character may be preceded by a backslash in order to remove any special
134  * meaning.  White space characters, as defined by the Unicode Pattern_White_Space property, are
135  * ignored, unless they are escaped.
136  *
137  * <p>Property patterns specify a set of characters having a certain
138  * property as defined by the Unicode standard.  Both the POSIX-like
139  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
140  * complete list of supported property patterns, see the User's Guide
141  * for UnicodeSet at
142  * <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset">
143  * https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>.
144  * Actual determination of property data is defined by the underlying
145  * Unicode database as implemented by UCharacter.
146  *
147  * <p>Patterns specify individual characters, ranges of characters, and
148  * Unicode property sets.  When elements are concatenated, they
149  * specify their union.  To complement a set, place a '^' immediately
150  * after the opening '['.  Property patterns are inverted by modifying
151  * their delimiters; "[:^foo]" and "\P{foo}".  In any other location,
152  * '^' has no special meaning.
153  *
154  * <p>Since ICU 70, "[^...]", "[:^foo]", "\P{foo}", and "[:binaryProperty=No:]"
155  * perform a “code point complement” (all code points minus the original set),
156  * removing all multicharacter strings,
157  * equivalent to .{@link #complement()}.{@link #removeAllStrings()} .
158  * The {@link #complement()} API function continues to perform a
159  * symmetric difference with all code points and thus retains all multicharacter strings.
160  *
161  * <p>Ranges are indicated by placing two a '-' between two
162  * characters, as in "a-z".  This specifies the range of all
163  * characters from the left to the right, in Unicode order.  If the
164  * left character is greater than or equal to the
165  * right character it is a syntax error.  If a '-' occurs as the first
166  * character after the opening '[' or '[^', or if it occurs as the
167  * last character before the closing ']', then it is taken as a
168  * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
169  * set of three characters, 'a', 'b', and '-'.
170  *
171  * <p>Sets may be intersected using the '&amp;' operator or the asymmetric
172  * set difference may be taken using the '-' operator, for example,
173  * "[[:L:]&amp;[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
174  * with values less than 4096.  Operators ('&amp;' and '|') have equal
175  * precedence and bind left-to-right.  Thus
176  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
177  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
178  * difference; intersection is commutative.
179  *
180  * <table>
181  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a'
182  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a'
183  * through 'z' and all letters in between, in Unicode order
184  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing
185  * all characters but 'a' through 'z',
186  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
187  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
188  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
189  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&amp;[<em>pat2</em>]]</code>
190  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
191  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
192  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
193  * <em>pat2</em>
194  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code>
195  * <td>The set of characters having the specified
196  * Unicode property; in
197  * this case, Unicode uppercase letters
198  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code>
199  * <td>The set of characters <em>not</em> having the given
200  * Unicode property
201  * </table>
202  *
203  * <p><b>Formal syntax</b></p>
204  *
205  * <blockquote>
206  *   <table>
207  *     <tr style="vertical-align: top">
208  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern :=&nbsp; </code></td>
209  *       <td style="vertical-align: top;"><code>('[' '^'? item* ']') |
210  *       property</code></td>
211  *     </tr>
212  *     <tr style="vertical-align: top">
213  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item :=&nbsp; </code></td>
214  *       <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br>
215  *       </code></td>
216  *     </tr>
217  *     <tr style="vertical-align: top">
218  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr :=&nbsp; </code></td>
219  *       <td style="vertical-align: top;"><code>pattern | pattern-expr pattern |
220  *       pattern-expr op pattern<br>
221  *       </code></td>
222  *     </tr>
223  *     <tr style="vertical-align: top">
224  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op :=&nbsp; </code></td>
225  *       <td style="vertical-align: top;"><code>'&amp;' | '-'<br>
226  *       </code></td>
227  *     </tr>
228  *     <tr style="vertical-align: top">
229  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special :=&nbsp; </code></td>
230  *       <td style="vertical-align: top;"><code>'[' | ']' | '-'<br>
231  *       </code></td>
232  *     </tr>
233  *     <tr style="vertical-align: top">
234  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char :=&nbsp; </code></td>
235  *       <td style="vertical-align: top;"><em>any character that is not</em><code> special<br>
236  *       | ('\\' </code><em>any character</em><code>)<br>
237  *       | ('&#92;u' hex hex hex hex)<br>
238  *       </code></td>
239  *     </tr>
240  *     <tr style="vertical-align: top">
241  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex :=&nbsp; </code></td>
242  *       <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
243  *       &nbsp;&nbsp;&nbsp;&nbsp;'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
244  *     </tr>
245  *     <tr>
246  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property :=&nbsp; </code></td>
247  *       <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td>
248  *     </tr>
249  *   </table>
250  *   <br>
251  *   <table border="1">
252  *     <tr>
253  *       <td>Legend: <table>
254  *         <tr>
255  *           <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td>
256  *           <td style="width: 20; vertical-align: top;">&nbsp; </td>
257  *           <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td>
258  *         </tr>
259  *         <tr>
260  *           <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td>
261  *           <td style="vertical-align: top;"></td>
262  *           <td style="vertical-align: top;">zero or one instance of <code>a</code><br>
263  *           </td>
264  *         </tr>
265  *         <tr>
266  *           <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td>
267  *           <td style="vertical-align: top;"></td>
268  *           <td style="vertical-align: top;">one or more instances of <code>a</code><br>
269  *           </td>
270  *         </tr>
271  *         <tr>
272  *           <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td>
273  *           <td style="vertical-align: top;"></td>
274  *           <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br>
275  *           </td>
276  *         </tr>
277  *         <tr>
278  *           <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td>
279  *           <td style="vertical-align: top;"></td>
280  *           <td style="vertical-align: top;">the literal string between the quotes </td>
281  *         </tr>
282  *       </table>
283  *       </td>
284  *     </tr>
285  *   </table>
286  * </blockquote>
287  *
288  * <p>To iterate over contents of {@code UnicodeSet}, the following are available:
289  * <ul>
290  *   <li>to iterate over the ranges: {@link #ranges()}, {@link #rangeStream()}</li>
291  *   <li>to iterate over the strings: {@link #strings()}, {@link #stringStream()}</li>
292  *   <li>to iterate over the code points: {@link #codePoints()}, {@link #codePointStream()}</li>
293  *   <li>to iterate over the entire contents in a single loop: this class itself is {@link Iterable},
294  *       or use {@link #stream()}.<br>
295  *       All of these method are, however, not particularly efficient,
296  *       since they convert each individual code point to a {@code String}.
297  * </ul>
298  *
299  * <p>The iterators and streams methods work as expected in idiomatic Java usage.<br>
300  * The {@link UnicodeSetIterator} cannot be used in <b>for</b> loops, and it is not very Java-idiomatic, because it is old.
301  * But it might be faster in certain use cases. We recommend that you measure in performance sensitive code.<br>
302  *
303  * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
304  *
305  * @author Alan Liu
306  * @see UnicodeSetIterator
307  * @see UnicodeSetSpanner
308  */
309 public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> {
310     private static final SortedSet<String> EMPTY_STRINGS =
311             Collections.unmodifiableSortedSet(new TreeSet<String>());
312 
313     /**
314      * Constant for the empty set.
315      */
316     public static final UnicodeSet EMPTY = new UnicodeSet().freeze();
317     /**
318      * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.)
319      */
320     public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze();
321 
322     private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the the function processing
323 
324     private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
325     private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
326     // 110000 for codepoints
327 
328     /**
329      * Enough for sets with few ranges.
330      * For example, White_Space has 10 ranges, list length 21.
331      */
332     private static final int INITIAL_CAPACITY = 25;
333 
334     /** Max list [0, 1, 2, ..., max code point, HIGH] */
335     private static final int MAX_LENGTH = HIGH + 1;
336 
337     /**
338      * Minimum value that can be stored in a UnicodeSet.
339      */
340     public static final int MIN_VALUE = LOW;
341 
342     /**
343      * Maximum value that can be stored in a UnicodeSet.
344      */
345     public static final int MAX_VALUE = HIGH - 1;
346 
347     private int len;      // length used; list may be longer to minimize reallocs
348     private int[] list;   // MUST be terminated with HIGH
349     private int[] rangeList; // internal buffer
350     private int[] buffer; // internal buffer
351 
352     // is not private so that UnicodeSetIterator can get access
353     SortedSet<String> strings = EMPTY_STRINGS;
354 
355     /**
356      * The pattern representation of this set.  This may not be the
357      * most economical pattern.  It is the pattern supplied to
358      * applyPattern(), with variables substituted and whitespace
359      * removed.  For sets constructed without applyPattern(), or
360      * modified using the non-pattern API, this string will be null,
361      * indicating that toPattern() must generate a pattern
362      * representation from the inversion list.
363      */
364     private String pat = null;
365 
366     // Special property set IDs
367     private static final String ANY_ID   = "ANY";   // [\u0000-\U0010FFFF]
368     private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
369     private static final String ASSIGNED = "Assigned"; // [:^Cn:]
370 
371     private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
372     private volatile UnicodeSetStringSpan stringSpan;
373     //----------------------------------------------------------------
374     // Public API
375     //----------------------------------------------------------------
376 
377     /**
378      * Constructs an empty set.
379      */
UnicodeSet()380     public UnicodeSet() {
381         list = new int[INITIAL_CAPACITY];
382         list[0] = HIGH;
383         len = 1;
384     }
385 
386     /**
387      * Constructs a copy of an existing set.
388      */
UnicodeSet(UnicodeSet other)389     public UnicodeSet(UnicodeSet other) {
390         set(other);
391     }
392 
393     /**
394      * Constructs a set containing the given range. If <code>end &gt;
395      * start</code> then an empty set is created.
396      *
397      * @param start first character, inclusive, of range
398      * @param end last character, inclusive, of range
399      */
UnicodeSet(int start, int end)400     public UnicodeSet(int start, int end) {
401         this();
402         add(start, end);
403     }
404 
405     /**
406      * Quickly constructs a set from a set of ranges &lt;s0, e0, s1, e1, s2, e2, ..., sn, en&gt;.
407      * There must be an even number of integers, and they must be all greater than zero,
408      * all less than or equal to Character.MAX_CODE_POINT.
409      * In each pair (..., si, ei, ...) it must be true that si &lt;= ei
410      * Between adjacent pairs (...ei, sj...), it must be true that ei+1 &lt; sj
411      * @param pairs pairs of character representing ranges
412      */
UnicodeSet(int... pairs)413     public UnicodeSet(int... pairs) {
414         if ((pairs.length & 1) != 0) {
415             throw new IllegalArgumentException("Must have even number of integers");
416         }
417         list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set.
418         len = list.length;
419         int last = -1; // used to ensure that the results are monotonically increasing.
420         int i = 0;
421         while (i < pairs.length) {
422             int start = pairs[i];
423             if (last >= start) {
424                 throw new IllegalArgumentException("Must be monotonically increasing.");
425             }
426             list[i++] = start;
427             int limit = pairs[i] + 1;
428             if (start >= limit) {
429                 throw new IllegalArgumentException("Must be monotonically increasing.");
430             }
431             list[i++] = last = limit;
432         }
433         list[i] = HIGH; // terminate
434     }
435 
436     /**
437      * Constructs a set from the given pattern.  See the class description
438      * for the syntax of the pattern language.  Whitespace is ignored.
439      * @param pattern a string specifying what characters are in the set
440      * @exception java.lang.IllegalArgumentException if the pattern contains
441      * a syntax error.
442      */
UnicodeSet(String pattern)443     public UnicodeSet(String pattern) {
444         this();
445         applyPattern(pattern, null, null, IGNORE_SPACE);
446     }
447 
448     /**
449      * Constructs a set from the given pattern.  See the class description
450      * for the syntax of the pattern language.
451      * @param pattern a string specifying what characters are in the set
452      * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters
453      * @exception java.lang.IllegalArgumentException if the pattern contains
454      * a syntax error.
455      */
UnicodeSet(String pattern, boolean ignoreWhitespace)456     public UnicodeSet(String pattern, boolean ignoreWhitespace) {
457         this();
458         applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
459     }
460 
461     /**
462      * Constructs a set from the given pattern.  See the class description
463      * for the syntax of the pattern language.
464      * @param pattern a string specifying what characters are in the set
465      * @param options a bitmask indicating which options to apply.
466      * Valid options are {@link #IGNORE_SPACE} and
467      * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
468      * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
469      * @exception java.lang.IllegalArgumentException if the pattern contains
470      * a syntax error.
471      */
UnicodeSet(String pattern, int options)472     public UnicodeSet(String pattern, int options) {
473         this();
474         applyPattern(pattern, null, null, options);
475     }
476 
477     /**
478      * Constructs a set from the given pattern.  See the class description
479      * for the syntax of the pattern language.
480      * @param pattern a string specifying what characters are in the set
481      * @param pos on input, the position in pattern at which to start parsing.
482      * On output, the position after the last character parsed.
483      * @param symbols a symbol table mapping variables to char[] arrays
484      * and chars to UnicodeSets
485      * @exception java.lang.IllegalArgumentException if the pattern
486      * contains a syntax error.
487      */
UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols)488     public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) {
489         this();
490         applyPattern(pattern, pos, symbols, IGNORE_SPACE);
491     }
492 
493     /**
494      * Constructs a set from the given pattern.  See the class description
495      * for the syntax of the pattern language.
496      * @param pattern a string specifying what characters are in the set
497      * @param pos on input, the position in pattern at which to start parsing.
498      * On output, the position after the last character parsed.
499      * @param symbols a symbol table mapping variables to char[] arrays
500      * and chars to UnicodeSets
501      * @param options a bitmask indicating which options to apply.
502      * Valid options are {@link #IGNORE_SPACE} and
503      * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
504      * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
505      * @exception java.lang.IllegalArgumentException if the pattern
506      * contains a syntax error.
507      */
UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options)508     public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) {
509         this();
510         applyPattern(pattern, pos, symbols, options);
511     }
512 
513 
514     /**
515      * Return a new set that is equivalent to this one.
516      */
517     @Override
clone()518     public Object clone() {
519         if (isFrozen()) {
520             return this;
521         }
522         return new UnicodeSet(this);
523     }
524 
525     /**
526      * Make this object represent the range <code>start - end</code>.
527      * If <code>start &gt; end</code> then this object is set to an empty range.
528      *
529      * @param start first character in the set, inclusive
530      * @param end last character in the set, inclusive
531      */
set(int start, int end)532     public UnicodeSet set(int start, int end) {
533         checkFrozen();
534         clear();
535         complement(start, end);
536         return this;
537     }
538 
539     /**
540      * Make this object represent the same set as <code>other</code>.
541      * @param other a <code>UnicodeSet</code> whose value will be
542      * copied to this object
543      */
set(UnicodeSet other)544     public UnicodeSet set(UnicodeSet other) {
545         checkFrozen();
546         list = Arrays.copyOf(other.list, other.len);
547         len = other.len;
548         pat = other.pat;
549         if (other.hasStrings()) {
550             strings = new TreeSet<>(other.strings);
551         } else {
552             strings = EMPTY_STRINGS;
553         }
554         return this;
555     }
556 
557     /**
558      * Modifies this set to represent the set specified by the given pattern.
559      * See the class description for the syntax of the pattern language.
560      * Whitespace is ignored.
561      * @param pattern a string specifying what characters are in the set
562      * @exception java.lang.IllegalArgumentException if the pattern
563      * contains a syntax error.
564      */
applyPattern(String pattern)565     public final UnicodeSet applyPattern(String pattern) {
566         checkFrozen();
567         return applyPattern(pattern, null, null, IGNORE_SPACE);
568     }
569 
570     /**
571      * Modifies this set to represent the set specified by the given pattern,
572      * optionally ignoring whitespace.
573      * See the class description for the syntax of the pattern language.
574      * @param pattern a string specifying what characters are in the set
575      * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored
576      * @exception java.lang.IllegalArgumentException if the pattern
577      * contains a syntax error.
578      */
applyPattern(String pattern, boolean ignoreWhitespace)579     public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) {
580         checkFrozen();
581         return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
582     }
583 
584     /**
585      * Modifies this set to represent the set specified by the given pattern,
586      * optionally ignoring whitespace.
587      * See the class description for the syntax of the pattern language.
588      * @param pattern a string specifying what characters are in the set
589      * @param options a bitmask indicating which options to apply.
590      * Valid options are {@link #IGNORE_SPACE} and
591      * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
592      * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
593      * @exception java.lang.IllegalArgumentException if the pattern
594      * contains a syntax error.
595      */
applyPattern(String pattern, int options)596     public UnicodeSet applyPattern(String pattern, int options) {
597         checkFrozen();
598         return applyPattern(pattern, null, null, options);
599     }
600 
601     /**
602      * Return true if the given position, in the given pattern, appears
603      * to be the start of a UnicodeSet pattern.
604      * @hide unsupported on Android
605      */
resemblesPattern(String pattern, int pos)606     public static boolean resemblesPattern(String pattern, int pos) {
607         return ((pos+1) < pattern.length() &&
608                 pattern.charAt(pos) == '[') ||
609                 resemblesPropertyPattern(pattern, pos);
610     }
611 
612     /**
613      * TODO: create Appendable version of UTF16.append(buf, c),
614      * maybe in new class Appendables?
615      * @throws IOException
616      */
appendCodePoint(Appendable app, int c)617     private static void appendCodePoint(Appendable app, int c) {
618         assert 0 <= c && c <= 0x10ffff;
619         try {
620             if (c <= 0xffff) {
621                 app.append((char) c);
622             } else {
623                 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c));
624             }
625         } catch (IOException e) {
626             throw new ICUUncheckedIOException(e);
627         }
628     }
629 
630     /**
631      * TODO: create class Appendables?
632      * @throws IOException
633      */
append(Appendable app, CharSequence s)634     private static void append(Appendable app, CharSequence s) {
635         try {
636             app.append(s);
637         } catch (IOException e) {
638             throw new ICUUncheckedIOException(e);
639         }
640     }
641 
642     /**
643      * Append the <code>toPattern()</code> representation of a
644      * string to the given <code>Appendable</code>.
645      */
_appendToPat(T buf, String s, boolean escapeUnprintable)646     private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) {
647         int cp;
648         for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
649             cp = s.codePointAt(i);
650             _appendToPat(buf, cp, escapeUnprintable);
651         }
652         return buf;
653     }
654 
655     /**
656      * Append the <code>toPattern()</code> representation of a
657      * character to the given <code>Appendable</code>.
658      */
_appendToPat(T buf, int c, boolean escapeUnprintable)659     private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) {
660         try {
661             if (escapeUnprintable ? Utility.isUnprintable(c) : Utility.shouldAlwaysBeEscaped(c)) {
662                 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
663                 // unprintable
664                 return Utility.escape(buf, c);
665             }
666             // Okay to let ':' pass through
667             switch (c) {
668             case '[': // SET_OPEN:
669             case ']': // SET_CLOSE:
670             case '-': // HYPHEN:
671             case '^': // COMPLEMENT:
672             case '&': // INTERSECTION:
673             case '\\': //BACKSLASH:
674             case '{':
675             case '}':
676             case '$':
677             case ':':
678                 buf.append('\\');
679                 break;
680             default:
681                 // Escape whitespace
682                 if (PatternProps.isWhiteSpace(c)) {
683                     buf.append('\\');
684                 }
685                 break;
686             }
687             appendCodePoint(buf, c);
688             return buf;
689         } catch (IOException e) {
690             throw new ICUUncheckedIOException(e);
691         }
692     }
693 
_appendToPat( T result, int start, int end, boolean escapeUnprintable)694     private static <T extends Appendable> T _appendToPat(
695             T result, int start, int end, boolean escapeUnprintable) {
696         _appendToPat(result, start, escapeUnprintable);
697         if (start != end) {
698             if ((start+1) != end ||
699                     // Avoid writing what looks like a lead+trail surrogate pair.
700                     start == 0xdbff) {
701                 try {
702                     result.append('-');
703                 } catch (IOException e) {
704                     throw new ICUUncheckedIOException(e);
705                 }
706             }
707             _appendToPat(result, end, escapeUnprintable);
708         }
709         return result;
710     }
711 
712     /**
713      * Returns a string representation of this set.  If the result of
714      * calling this function is passed to a UnicodeSet constructor, it
715      * will produce another set that is equal to this one.
716      */
717     @Override
toPattern(boolean escapeUnprintable)718     public String toPattern(boolean escapeUnprintable) {
719         if (pat != null && !escapeUnprintable) {
720             return pat;
721         }
722         StringBuilder result = new StringBuilder();
723         return _toPattern(result, escapeUnprintable).toString();
724     }
725 
726     /**
727      * Append a string representation of this set to result.  This will be
728      * a cleaned version of the string passed to applyPattern(), if there
729      * is one.  Otherwise it will be generated.
730      */
_toPattern(T result, boolean escapeUnprintable)731     private <T extends Appendable> T _toPattern(T result,
732             boolean escapeUnprintable) {
733         if (pat == null) {
734             return appendNewPattern(result, escapeUnprintable, true);
735         }
736         try {
737             if (!escapeUnprintable) {
738                 // TODO: The C++ version does not have this shortcut, and instead
739                 // always cleans up the pattern string,
740                 // which also escapes Utility.shouldAlwaysBeEscaped(c).
741                 // We should sync these implementations.
742                 result.append(pat);
743                 return result;
744             }
745             boolean oddNumberOfBackslashes = false;
746             for (int i=0; i<pat.length(); ) {
747                 int c = pat.codePointAt(i);
748                 i += Character.charCount(c);
749                 if (Utility.isUnprintable(c)) {
750                     // If the unprintable character is preceded by an odd
751                     // number of backslashes, then it has been escaped
752                     // and we omit the last backslash.
753                     Utility.escape(result, c);
754                     oddNumberOfBackslashes = false;
755                 } else if (!oddNumberOfBackslashes && c == '\\') {
756                     // Temporarily withhold an odd-numbered backslash.
757                     oddNumberOfBackslashes = true;
758                 } else {
759                     if (oddNumberOfBackslashes) {
760                         result.append('\\');
761                     }
762                     appendCodePoint(result, c);
763                     oddNumberOfBackslashes = false;
764                 }
765             }
766             if (oddNumberOfBackslashes) {
767                 result.append('\\');
768             }
769             return result;
770         } catch (IOException e) {
771             throw new ICUUncheckedIOException(e);
772         }
773     }
774 
775     /**
776      * Generate and append a string representation of this set to result.
777      * This does not use this.pat, the cleaned up copy of the string
778      * passed to applyPattern().
779      *
780      * @param result the buffer into which to generate the pattern
781      * @param escapeUnprintable escape unprintable characters if true
782      */
_generatePattern(StringBuffer result, boolean escapeUnprintable)783     public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) {
784         return _generatePattern(result, escapeUnprintable, true);
785     }
786 
787     /**
788      * Generate and append a string representation of this set to result.
789      * This does not use this.pat, the cleaned up copy of the string
790      * passed to applyPattern().
791      *
792      * @param result the buffer into which to generate the pattern
793      * @param escapeUnprintable escape unprintable characters if true
794      * @param includeStrings if false, doesn't include the strings.
795      */
_generatePattern(StringBuffer result, boolean escapeUnprintable, boolean includeStrings)796     public StringBuffer _generatePattern(StringBuffer result,
797             boolean escapeUnprintable, boolean includeStrings) {
798         return appendNewPattern(result, escapeUnprintable, includeStrings);
799     }
800 
801     // Implementation of public _generatePattern().
802     // Allows other callers to use a StringBuilder while the existing API is stuck with StringBuffer.
appendNewPattern( T result, boolean escapeUnprintable, boolean includeStrings)803     private <T extends Appendable> T appendNewPattern(
804             T result, boolean escapeUnprintable, boolean includeStrings) {
805         try {
806             result.append('[');
807 
808             int i = 0;
809             int limit = len & ~1;  // = 2 * getRangeCount()
810 
811             // If the set contains at least 2 intervals and includes both
812             // MIN_VALUE and MAX_VALUE, then the inverse representation will
813             // be more economical.
814             //     if (getRangeCount() >= 2 &&
815             //             getRangeStart(0) == MIN_VALUE &&
816             //             getRangeEnd(last) == MAX_VALUE)
817             // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
818             // If limit == len then len is even and the last range ends with MAX_VALUE.
819             //
820             // *But* do not write the inverse (complement) if there are strings.
821             // Since ICU 70, the '^' performs a code point complement which removes all strings.
822             if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
823                 // Emit the inverse
824                 result.append('^');
825                 // Offsetting the inversion list index by one lets us
826                 // iterate over the ranges of the set complement.
827                 i = 1;
828                 --limit;
829             }
830 
831             // Emit the ranges as pairs.
832             while (i < limit) {
833                 int start = list[i];  // getRangeStart()
834                 int end = list[i + 1] - 1;  // getRangeEnd() = range limit minus one
835                 if (!(0xd800 <= end && end <= 0xdbff)) {
836                     _appendToPat(result, start, end, escapeUnprintable);
837                     i += 2;
838                 } else {
839                     // The range ends with a lead surrogate.
840                     // Avoid writing what looks like a lead+trail surrogate pair.
841                     // 1. Postpone ranges that start with a lead surrogate code point.
842                     int firstLead = i;
843                     while ((i += 2) < limit && list[i] <= 0xdbff) {}
844                     int firstAfterLead = i;
845                     // 2. Write following ranges that start with a trail surrogate code point.
846                     while (i < limit && (start = list[i]) <= 0xdfff) {
847                         _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
848                         i += 2;
849                     }
850                     // 3. Now write the postponed ranges.
851                     for (int j = firstLead; j < firstAfterLead; j += 2) {
852                         _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
853                     }
854                 }
855             }
856 
857             if (includeStrings && hasStrings()) {
858                 for (String s : strings) {
859                     result.append('{');
860                     _appendToPat(result, s, escapeUnprintable);
861                     result.append('}');
862                 }
863             }
864             result.append(']');
865             return result;
866         } catch (IOException e) {
867             throw new ICUUncheckedIOException(e);
868         }
869     }
870 
871     /**
872      * Returns the number of elements in this set (its cardinality)
873      * Note than the elements of a set may include both individual
874      * codepoints and strings.
875      *
876      * @return the number of elements in this set (its cardinality).
877      */
size()878     public int size() {
879         int n = 0;
880         int count = getRangeCount();
881         for (int i = 0; i < count; ++i) {
882             n += getRangeEnd(i) - getRangeStart(i) + 1;
883         }
884         return n + strings.size();
885     }
886 
887     /**
888      * Returns <tt>true</tt> if this set contains no elements.
889      *
890      * @return <tt>true</tt> if this set contains no elements.
891      */
isEmpty()892     public boolean isEmpty() {
893         return len == 1 && !hasStrings();
894     }
895 
896     /**
897      * @return true if this set contains multi-character strings or the empty string.
898      */
hasStrings()899     public boolean hasStrings() {
900         return !strings.isEmpty();
901     }
902 
903     /**
904      * Implementation of UnicodeMatcher API.  Returns <tt>true</tt> if
905      * this set contains any character whose low byte is the given
906      * value.  This is used by <tt>RuleBasedTransliterator</tt> for
907      * indexing.
908      */
909     @Override
matchesIndexValue(int v)910     public boolean matchesIndexValue(int v) {
911         /* The index value v, in the range [0,255], is contained in this set if
912          * it is contained in any pair of this set.  Pairs either have the high
913          * bytes equal, or unequal.  If the high bytes are equal, then we have
914          * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
915          * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
916          * Then v is contained if xx <= v || v <= yy.  (This is identical to the
917          * time zone month containment logic.)
918          */
919         for (int i=0; i<getRangeCount(); ++i) {
920             int low = getRangeStart(i);
921             int high = getRangeEnd(i);
922             if ((low & ~0xFF) == (high & ~0xFF)) {
923                 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
924                     return true;
925                 }
926             } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
927                 return true;
928             }
929         }
930         if (hasStrings()) {
931             for (String s : strings) {
932                 if (s.isEmpty()) {
933                     continue;  // skip the empty string
934                 }
935                 int c = UTF16.charAt(s, 0);
936                 if ((c & 0xFF) == v) {
937                     return true;
938                 }
939             }
940         }
941         return false;
942     }
943 
944     /**
945      * Implementation of UnicodeMatcher.matches().  Always matches the
946      * longest possible multichar string.
947      */
948     @Override
matches(Replaceable text, int[] offset, int limit, boolean incremental)949     public int matches(Replaceable text,
950             int[] offset,
951             int limit,
952             boolean incremental) {
953 
954         if (offset[0] == limit) {
955             if (contains(UnicodeMatcher.ETHER)) {
956                 return incremental ? U_PARTIAL_MATCH : U_MATCH;
957             } else {
958                 return U_MISMATCH;
959             }
960         } else {
961             if (hasStrings()) { // try strings first
962 
963                 // might separate forward and backward loops later
964                 // for now they are combined
965 
966                 // TODO Improve efficiency of this, at least in the forward
967                 // direction, if not in both.  In the forward direction we
968                 // can assume the strings are sorted.
969 
970                 boolean forward = offset[0] < limit;
971 
972                 // firstChar is the leftmost char to match in the
973                 // forward direction or the rightmost char to match in
974                 // the reverse direction.
975                 char firstChar = text.charAt(offset[0]);
976 
977                 // If there are multiple strings that can match we
978                 // return the longest match.
979                 int highWaterLength = 0;
980 
981                 for (String trial : strings) {
982                     if (trial.isEmpty()) {
983                         continue;  // skip the empty string
984                     }
985 
986                     char c = trial.charAt(forward ? 0 : trial.length() - 1);
987 
988                     // Strings are sorted, so we can optimize in the
989                     // forward direction.
990                     if (forward && c > firstChar) break;
991                     if (c != firstChar) continue;
992 
993                     int length = matchRest(text, offset[0], limit, trial);
994 
995                     if (incremental) {
996                         int maxLen = forward ? limit-offset[0] : offset[0]-limit;
997                         if (length == maxLen) {
998                             // We have successfully matched but only up to limit.
999                             return U_PARTIAL_MATCH;
1000                         }
1001                     }
1002 
1003                     if (length == trial.length()) {
1004                         // We have successfully matched the whole string.
1005                         if (length > highWaterLength) {
1006                             highWaterLength = length;
1007                         }
1008                         // In the forward direction we know strings
1009                         // are sorted so we can bail early.
1010                         if (forward && length < highWaterLength) {
1011                             break;
1012                         }
1013                         continue;
1014                     }
1015                 }
1016 
1017                 // We've checked all strings without a partial match.
1018                 // If we have full matches, return the longest one.
1019                 if (highWaterLength != 0) {
1020                     offset[0] += forward ? highWaterLength : -highWaterLength;
1021                     return U_MATCH;
1022                 }
1023             }
1024             return super.matches(text, offset, limit, incremental);
1025         }
1026     }
1027 
1028     /**
1029      * Returns the longest match for s in text at the given position.
1030      * If limit > start then match forward from start+1 to limit
1031      * matching all characters except s.charAt(0).  If limit < start,
1032      * go backward starting from start-1 matching all characters
1033      * except s.charAt(s.length()-1).  This method assumes that the
1034      * first character, text.charAt(start), matches s, so it does not
1035      * check it.
1036      * @param text the text to match
1037      * @param start the first character to match.  In the forward
1038      * direction, text.charAt(start) is matched against s.charAt(0).
1039      * In the reverse direction, it is matched against
1040      * s.charAt(s.length()-1).
1041      * @param limit the limit offset for matching, either last+1 in
1042      * the forward direction, or last-1 in the reverse direction,
1043      * where last is the index of the last character to match.
1044      * @return If part of s matches up to the limit, return |limit -
1045      * start|.  If all of s matches before reaching the limit, return
1046      * s.length().  If there is a mismatch between s and text, return
1047      * 0
1048      */
matchRest(Replaceable text, int start, int limit, String s)1049     private static int matchRest (Replaceable text, int start, int limit, String s) {
1050         int maxLen;
1051         int slen = s.length();
1052         if (start < limit) {
1053             maxLen = limit - start;
1054             if (maxLen > slen) maxLen = slen;
1055             for (int i = 1; i < maxLen; ++i) {
1056                 if (text.charAt(start + i) != s.charAt(i)) return 0;
1057             }
1058         } else {
1059             maxLen = start - limit;
1060             if (maxLen > slen) maxLen = slen;
1061             --slen; // <=> slen = s.length() - 1;
1062             for (int i = 1; i < maxLen; ++i) {
1063                 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
1064             }
1065         }
1066         return maxLen;
1067     }
1068 
1069     /**
1070      * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1.
1071      * @deprecated This API is ICU internal only.
1072      * @hide original deprecated declaration
1073      * @hide draft / provisional / internal are hidden on Android
1074      */
1075     @Deprecated
matchesAt(CharSequence text, int offset)1076     public int matchesAt(CharSequence text, int offset) {
1077         int lastLen = -1;
1078         strings:
1079             if (hasStrings()) {
1080                 char firstChar = text.charAt(offset);
1081                 String trial = null;
1082                 // find the first string starting with firstChar
1083                 Iterator<String> it = strings.iterator();
1084                 while (it.hasNext()) {
1085                     trial = it.next();
1086                     char firstStringChar = trial.charAt(0);
1087                     if (firstStringChar < firstChar) continue;
1088                     if (firstStringChar > firstChar) break strings;
1089                 }
1090 
1091                 // now keep checking string until we get the longest one
1092                 for (;;) {
1093                     int tempLen = matchesAt(text, offset, trial);
1094                     if (lastLen > tempLen) break strings;
1095                     lastLen = tempLen;
1096                     if (!it.hasNext()) break;
1097                     trial = it.next();
1098                 }
1099             }
1100 
1101         if (lastLen < 2) {
1102             int cp = UTF16.charAt(text, offset);
1103             if (contains(cp)) lastLen = UTF16.getCharCount(cp);
1104         }
1105 
1106         return offset+lastLen;
1107     }
1108 
1109     /**
1110      * Does one string contain another, starting at a specific offset?
1111      * @param text text to match
1112      * @param offsetInText offset within that text
1113      * @param substring substring to match at offset in text
1114      * @return -1 if match fails, otherwise other.length()
1115      */
1116     // Note: This method was moved from CollectionUtilities
matchesAt(CharSequence text, int offsetInText, CharSequence substring)1117     private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) {
1118         int len = substring.length();
1119         int textLength = text.length();
1120         if (textLength + offsetInText > len) {
1121             return -1;
1122         }
1123         int i = 0;
1124         for (int j = offsetInText; i < len; ++i, ++j) {
1125             char pc = substring.charAt(i);
1126             char tc = text.charAt(j);
1127             if (pc != tc) return -1;
1128         }
1129         return i;
1130     }
1131 
1132     /**
1133      * Implementation of UnicodeMatcher API.  Union the set of all
1134      * characters that may be matched by this object into the given
1135      * set.
1136      * @param toUnionTo the set into which to union the source characters
1137      */
1138     @Override
addMatchSetTo(UnicodeSet toUnionTo)1139     public void addMatchSetTo(UnicodeSet toUnionTo) {
1140         toUnionTo.addAll(this);
1141     }
1142 
1143     /**
1144      * Returns the index of the given character within this set, where
1145      * the set is ordered by ascending code point.  If the character
1146      * is not in this set, return -1.  The inverse of this method is
1147      * <code>charAt()</code>.
1148      * @return an index from 0..size()-1, or -1
1149      */
indexOf(int c)1150     public int indexOf(int c) {
1151         if (c < MIN_VALUE || c > MAX_VALUE) {
1152             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
1153         }
1154         int i = 0;
1155         int n = 0;
1156         for (;;) {
1157             int start = list[i++];
1158             if (c < start) {
1159                 return -1;
1160             }
1161             int limit = list[i++];
1162             if (c < limit) {
1163                 return n + c - start;
1164             }
1165             n += limit - start;
1166         }
1167     }
1168 
1169     /**
1170      * Returns the character at the given index within this set, where
1171      * the set is ordered by ascending code point.  If the index is
1172      * out of range, return -1.  The inverse of this method is
1173      * <code>indexOf()</code>.
1174      * @param index an index from 0..size()-1
1175      * @return the character at the given index, or -1.
1176      */
charAt(int index)1177     public int charAt(int index) {
1178         if (index >= 0) {
1179             // len2 is the largest even integer <= len, that is, it is len
1180             // for even values and len-1 for odd values.  With odd values
1181             // the last entry is UNICODESET_HIGH.
1182             int len2 = len & ~1;
1183             for (int i=0; i < len2;) {
1184                 int start = list[i++];
1185                 int count = list[i++] - start;
1186                 if (index < count) {
1187                     return start + index;
1188                 }
1189                 index -= count;
1190             }
1191         }
1192         return -1;
1193     }
1194 
1195     /**
1196      * Adds the specified range to this set if it is not already
1197      * present.  If this set already contains the specified range,
1198      * the call leaves this set unchanged.  If <code>start &gt; end</code>
1199      * then an empty range is added, leaving the set unchanged.
1200      *
1201      * @param start first character, inclusive, of range to be added
1202      * to this set.
1203      * @param end last character, inclusive, of range to be added
1204      * to this set.
1205      */
add(int start, int end)1206     public UnicodeSet add(int start, int end) {
1207         checkFrozen();
1208         return add_unchecked(start, end);
1209     }
1210 
1211     /**
1212      * Adds all characters in range (uses preferred naming convention).
1213      * @param start The index of where to start on adding all characters.
1214      * @param end The index of where to end on adding all characters.
1215      * @return a reference to this object
1216      */
addAll(int start, int end)1217     public UnicodeSet addAll(int start, int end) {
1218         checkFrozen();
1219         return add_unchecked(start, end);
1220     }
1221 
1222     // for internal use, after checkFrozen has been called
add_unchecked(int start, int end)1223     private UnicodeSet add_unchecked(int start, int end) {
1224         if (start < MIN_VALUE || start > MAX_VALUE) {
1225             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1226         }
1227         if (end < MIN_VALUE || end > MAX_VALUE) {
1228             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1229         }
1230         if (start < end) {
1231             int limit = end + 1;
1232             // Fast path for adding a new range after the last one.
1233             // Odd list length: [..., lastStart, lastLimit, HIGH]
1234             if ((len & 1) != 0) {
1235                 // If the list is empty, set lastLimit low enough to not be adjacent to 0.
1236                 int lastLimit = len == 1 ? -2 : list[len - 2];
1237                 if (lastLimit <= start) {
1238                     checkFrozen();
1239                     if (lastLimit == start) {
1240                         // Extend the last range.
1241                         list[len - 2] = limit;
1242                         if (limit == HIGH) {
1243                             --len;
1244                         }
1245                     } else {
1246                         list[len - 1] = start;
1247                         if (limit < HIGH) {
1248                             ensureCapacity(len + 2);
1249                             list[len++] = limit;
1250                             list[len++] = HIGH;
1251                         } else {  // limit == HIGH
1252                             ensureCapacity(len + 1);
1253                             list[len++] = HIGH;
1254                         }
1255                     }
1256                     pat = null;
1257                     return this;
1258                 }
1259             }
1260             // This is slow. Could be much faster using findCodePoint(start)
1261             // and modifying the list, dealing with adjacent & overlapping ranges.
1262             add(range(start, end), 2, 0);
1263         } else if (start == end) {
1264             add(start);
1265         }
1266         return this;
1267     }
1268 
1269     //    /**
1270     //     * Format out the inversion list as a string, for debugging.  Uncomment when
1271     //     * needed.
1272     //     */
1273     //    public final String dump() {
1274     //        StringBuffer buf = new StringBuffer("[");
1275     //        for (int i=0; i<len; ++i) {
1276     //            if (i != 0) buf.append(", ");
1277     //            int c = list[i];
1278     //            //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
1279     //            //    buf.append((char) c);
1280     //            //} else {
1281     //                buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
1282     //            //}
1283     //        }
1284     //        buf.append("]");
1285     //        return buf.toString();
1286     //    }
1287 
1288     /**
1289      * Adds the specified character to this set if it is not already
1290      * present.  If this set already contains the specified character,
1291      * the call leaves this set unchanged.
1292      */
add(int c)1293     public final UnicodeSet add(int c) {
1294         checkFrozen();
1295         return add_unchecked(c);
1296     }
1297 
1298     // for internal use only, after checkFrozen has been called
add_unchecked(int c)1299     private final UnicodeSet add_unchecked(int c) {
1300         if (c < MIN_VALUE || c > MAX_VALUE) {
1301             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
1302         }
1303 
1304         // find smallest i such that c < list[i]
1305         // if odd, then it is IN the set
1306         // if even, then it is OUT of the set
1307         int i = findCodePoint(c);
1308 
1309         // already in set?
1310         if ((i & 1) != 0) return this;
1311 
1312         // HIGH is 0x110000
1313         // assert(list[len-1] == HIGH);
1314 
1315         // empty = [HIGH]
1316         // [start_0, limit_0, start_1, limit_1, HIGH]
1317 
1318         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
1319         //                             ^
1320         //                             list[i]
1321 
1322         // i == 0 means c is before the first range
1323         // TODO: Is the "list[i]-1" a typo? Even if you pass MAX_VALUE into
1324         //      add_unchecked, the maximum value that "c" will be compared to
1325         //      is "MAX_VALUE-1" meaning that "if (c == MAX_VALUE)" will
1326         //      never be reached according to this logic.
1327         if (c == list[i]-1) {
1328             // c is before start of next range
1329             list[i] = c;
1330             // if we touched the HIGH mark, then add a new one
1331             if (c == MAX_VALUE) {
1332                 ensureCapacity(len+1);
1333                 list[len++] = HIGH;
1334             }
1335             if (i > 0 && c == list[i-1]) {
1336                 // collapse adjacent ranges
1337 
1338                 // [..., start_k-1, c, c, limit_k, ..., HIGH]
1339                 //                     ^
1340                 //                     list[i]
1341                 System.arraycopy(list, i+1, list, i-1, len-i-1);
1342                 len -= 2;
1343             }
1344         }
1345 
1346         else if (i > 0 && c == list[i-1]) {
1347             // c is after end of prior range
1348             list[i-1]++;
1349             // no need to check for collapse here
1350         }
1351 
1352         else {
1353             // At this point we know the new char is not adjacent to
1354             // any existing ranges, and it is not 10FFFF.
1355 
1356 
1357             // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
1358             //                             ^
1359             //                             list[i]
1360 
1361             // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
1362             //                             ^
1363             //                             list[i]
1364 
1365             // Don't use ensureCapacity() to save on copying.
1366             // NOTE: This has no measurable impact on performance,
1367             // but it might help in some usage patterns.
1368             if (len+2 > list.length) {
1369                 int[] temp = new int[nextCapacity(len + 2)];
1370                 if (i != 0) System.arraycopy(list, 0, temp, 0, i);
1371                 System.arraycopy(list, i, temp, i+2, len-i);
1372                 list = temp;
1373             } else {
1374                 System.arraycopy(list, i, list, i+2, len-i);
1375             }
1376 
1377             list[i] = c;
1378             list[i+1] = c+1;
1379             len += 2;
1380         }
1381 
1382         pat = null;
1383         return this;
1384     }
1385 
1386     /**
1387      * Adds the specified multicharacter to this set if it is not already
1388      * present.  If this set already contains the multicharacter,
1389      * the call leaves this set unchanged.
1390      * Thus "ch" =&gt; {"ch"}
1391      *
1392      * @param s the source string
1393      * @return this object, for chaining
1394      */
add(CharSequence s)1395     public final UnicodeSet add(CharSequence s) {
1396         checkFrozen();
1397         int cp = getSingleCP(s);
1398         if (cp < 0) {
1399             String str = s.toString();
1400             if (!strings.contains(str)) {
1401                 addString(str);
1402                 pat = null;
1403             }
1404         } else {
1405             add_unchecked(cp, cp);
1406         }
1407         return this;
1408     }
1409 
addString(CharSequence s)1410     private void addString(CharSequence s) {
1411         if (strings == EMPTY_STRINGS) {
1412             strings = new TreeSet<>();
1413         }
1414         strings.add(s.toString());
1415     }
1416 
1417     /**
1418      * Utility for getting code point from single code point CharSequence.
1419      * See the public UTF16.getSingleCodePoint() (which returns -1 for null rather than throwing NPE).
1420      *
1421      * @return a code point IF the string consists of a single one.
1422      * otherwise returns -1.
1423      * @param s to test
1424      */
getSingleCP(CharSequence s)1425     private static int getSingleCP(CharSequence s) {
1426         if (s.length() == 1) return s.charAt(0);
1427         if (s.length() == 2) {
1428             int cp = Character.codePointAt(s, 0);
1429             if (cp > 0xFFFF) { // is surrogate pair
1430                 return cp;
1431             }
1432         }
1433         return -1;
1434     }
1435 
1436     /**
1437      * Adds each of the characters in this string to the set. Thus "ch" =&gt; {"c", "h"}
1438      * If this set already any particular character, it has no effect on that character.
1439      * @param s the source string
1440      * @return this object, for chaining
1441      */
addAll(CharSequence s)1442     public final UnicodeSet addAll(CharSequence s) {
1443         checkFrozen();
1444         int cp;
1445         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
1446             cp = UTF16.charAt(s, i);
1447             add_unchecked(cp, cp);
1448         }
1449         return this;
1450     }
1451 
1452     /**
1453      * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1454      * If this set already any particular character, it has no effect on that character.
1455      * @param s the source string
1456      * @return this object, for chaining
1457      */
retainAll(CharSequence s)1458     public final UnicodeSet retainAll(CharSequence s) {
1459         return retainAll(fromAll(s));
1460     }
1461 
1462     /**
1463      * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1464      * If this set already any particular character, it has no effect on that character.
1465      * @param s the source string
1466      * @return this object, for chaining
1467      */
complementAll(CharSequence s)1468     public final UnicodeSet complementAll(CharSequence s) {
1469         return complementAll(fromAll(s));
1470     }
1471 
1472     /**
1473      * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1474      * If this set already any particular character, it has no effect on that character.
1475      * @param s the source string
1476      * @return this object, for chaining
1477      */
removeAll(CharSequence s)1478     public final UnicodeSet removeAll(CharSequence s) {
1479         return removeAll(fromAll(s));
1480     }
1481 
1482     /**
1483      * Remove all strings from this UnicodeSet
1484      * @return this object, for chaining
1485      */
removeAllStrings()1486     public final UnicodeSet removeAllStrings() {
1487         checkFrozen();
1488         if (hasStrings()) {
1489             strings.clear();
1490             pat = null;
1491         }
1492         return this;
1493     }
1494 
1495     /**
1496      * Makes a set from a multicharacter string. Thus "ch" =&gt; {"ch"}
1497      *
1498      * @param s the source string
1499      * @return a newly created set containing the given string
1500      */
from(CharSequence s)1501     public static UnicodeSet from(CharSequence s) {
1502         return new UnicodeSet().add(s);
1503     }
1504 
1505 
1506     /**
1507      * Makes a set from each of the characters in the string. Thus "ch" =&gt; {"c", "h"}
1508      * @param s the source string
1509      * @return a newly created set containing the given characters
1510      */
fromAll(CharSequence s)1511     public static UnicodeSet fromAll(CharSequence s) {
1512         return new UnicodeSet().addAll(s);
1513     }
1514 
1515 
1516     /**
1517      * Retain only the elements in this set that are contained in the
1518      * specified range.  If <code>start &gt; end</code> then an empty range is
1519      * retained, leaving the set empty.
1520      *
1521      * @param start first character, inclusive, of range
1522      * @param end last character, inclusive, of range
1523      */
retain(int start, int end)1524     public UnicodeSet retain(int start, int end) {
1525         checkFrozen();
1526         if (start < MIN_VALUE || start > MAX_VALUE) {
1527             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1528         }
1529         if (end < MIN_VALUE || end > MAX_VALUE) {
1530             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1531         }
1532         if (start <= end) {
1533             retain(range(start, end), 2, 0);
1534         } else {
1535             clear();
1536         }
1537         return this;
1538     }
1539 
1540     /**
1541      * Retain the specified character from this set if it is present.
1542      * Upon return this set will be empty if it did not contain c, or
1543      * will only contain c if it did contain c.
1544      * @param c the character to be retained
1545      * @return this object, for chaining
1546      */
retain(int c)1547     public final UnicodeSet retain(int c) {
1548         return retain(c, c);
1549     }
1550 
1551     /**
1552      * Retain the specified string in this set if it is present.
1553      * Upon return this set will be empty if it did not contain s, or
1554      * will only contain s if it did contain s.
1555      * @param cs the string to be retained
1556      * @return this object, for chaining
1557      */
retain(CharSequence cs)1558     public final UnicodeSet retain(CharSequence cs) {
1559         int cp = getSingleCP(cs);
1560         if (cp < 0) {
1561             checkFrozen();
1562             String s = cs.toString();
1563             boolean isIn = strings.contains(s);
1564             // Check for getRangeCount() first to avoid somewhat-expensive size()
1565             // when there are single code points.
1566             if (isIn && getRangeCount() == 0 && size() == 1) {
1567                 return this;
1568             }
1569             clear();
1570             if (isIn) {
1571                 addString(s);
1572             }
1573             pat = null;
1574         } else {
1575             retain(cp, cp);
1576         }
1577         return this;
1578     }
1579 
1580     /**
1581      * Removes the specified range from this set if it is present.
1582      * The set will not contain the specified range once the call
1583      * returns.  If <code>start &gt; end</code> then an empty range is
1584      * removed, leaving the set unchanged.
1585      *
1586      * @param start first character, inclusive, of range to be removed
1587      * from this set.
1588      * @param end last character, inclusive, of range to be removed
1589      * from this set.
1590      */
remove(int start, int end)1591     public UnicodeSet remove(int start, int end) {
1592         checkFrozen();
1593         if (start < MIN_VALUE || start > MAX_VALUE) {
1594             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1595         }
1596         if (end < MIN_VALUE || end > MAX_VALUE) {
1597             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1598         }
1599         if (start <= end) {
1600             retain(range(start, end), 2, 2);
1601         }
1602         return this;
1603     }
1604 
1605     /**
1606      * Removes the specified character from this set if it is present.
1607      * The set will not contain the specified character once the call
1608      * returns.
1609      * @param c the character to be removed
1610      * @return this object, for chaining
1611      */
remove(int c)1612     public final UnicodeSet remove(int c) {
1613         return remove(c, c);
1614     }
1615 
1616     /**
1617      * Removes the specified string from this set if it is present.
1618      * The set will not contain the specified string once the call
1619      * returns.
1620      * @param s the string to be removed
1621      * @return this object, for chaining
1622      */
remove(CharSequence s)1623     public final UnicodeSet remove(CharSequence s) {
1624         int cp = getSingleCP(s);
1625         if (cp < 0) {
1626             checkFrozen();
1627             String str = s.toString();
1628             if (strings.contains(str)) {
1629                 strings.remove(str);
1630                 pat = null;
1631             }
1632         } else {
1633             remove(cp, cp);
1634         }
1635         return this;
1636     }
1637 
1638     /**
1639      * Complements the specified range in this set.  Any character in
1640      * the range will be removed if it is in this set, or will be
1641      * added if it is not in this set.  If <code>start &gt; end</code>
1642      * then an empty range is complemented, leaving the set unchanged.
1643      *
1644      * @param start first character, inclusive, of range
1645      * @param end last character, inclusive, of range
1646      */
complement(int start, int end)1647     public UnicodeSet complement(int start, int end) {
1648         checkFrozen();
1649         if (start < MIN_VALUE || start > MAX_VALUE) {
1650             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1651         }
1652         if (end < MIN_VALUE || end > MAX_VALUE) {
1653             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1654         }
1655         if (start <= end) {
1656             xor(range(start, end), 2, 0);
1657         }
1658         pat = null;
1659         return this;
1660     }
1661 
1662     /**
1663      * Complements the specified character in this set.  The character
1664      * will be removed if it is in this set, or will be added if it is
1665      * not in this set.
1666      */
complement(int c)1667     public final UnicodeSet complement(int c) {
1668         return complement(c, c);
1669     }
1670 
1671     /**
1672      * This is equivalent to
1673      * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1674      *
1675      * <p><strong>Note:</strong> This performs a symmetric difference with all code points
1676      * <em>and thus retains all multicharacter strings</em>.
1677      * In order to achieve a “code point complement” (all code points minus this set),
1678      * the easiest is to .{@link #complement()}.{@link #removeAllStrings()} .
1679      */
complement()1680     public UnicodeSet complement() {
1681         checkFrozen();
1682         if (list[0] == LOW) {
1683             System.arraycopy(list, 1, list, 0, len-1);
1684             --len;
1685         } else {
1686             ensureCapacity(len+1);
1687             System.arraycopy(list, 0, list, 1, len);
1688             list[0] = LOW;
1689             ++len;
1690         }
1691         pat = null;
1692         return this;
1693     }
1694 
1695     /**
1696      * Complement the specified string in this set.
1697      * The set will not contain the specified string once the call
1698      * returns.
1699      *
1700      * @param s the string to complement
1701      * @return this object, for chaining
1702      */
complement(CharSequence s)1703     public final UnicodeSet complement(CharSequence s) {
1704         checkFrozen();
1705         int cp = getSingleCP(s);
1706         if (cp < 0) {
1707             String s2 = s.toString();
1708             if (strings.contains(s2)) {
1709                 strings.remove(s2);
1710             } else {
1711                 addString(s2);
1712             }
1713             pat = null;
1714         } else {
1715             complement(cp, cp);
1716         }
1717         return this;
1718     }
1719 
1720     /**
1721      * Returns true if this set contains the given character.
1722      * @param c character to be checked for containment
1723      * @return true if the test condition is met
1724      */
1725     @Override
contains(int c)1726     public boolean contains(int c) {
1727         if (c < MIN_VALUE || c > MAX_VALUE) {
1728             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
1729         }
1730         if (bmpSet != null) {
1731             return bmpSet.contains(c);
1732         }
1733         if (stringSpan != null) {
1734             return stringSpan.contains(c);
1735         }
1736 
1737         /*
1738         // Set i to the index of the start item greater than ch
1739         // We know we will terminate without length test!
1740         int i = -1;
1741         while (true) {
1742             if (c < list[++i]) break;
1743         }
1744          */
1745 
1746         int i = findCodePoint(c);
1747 
1748         return ((i & 1) != 0); // return true if odd
1749     }
1750 
1751     /**
1752      * Returns the smallest value i such that c < list[i].  Caller
1753      * must ensure that c is a legal value or this method will enter
1754      * an infinite loop.  This method performs a binary search.
1755      * @param c a character in the range MIN_VALUE..MAX_VALUE
1756      * inclusive
1757      * @return the smallest integer i in the range 0..len-1,
1758      * inclusive, such that c < list[i]
1759      */
findCodePoint(int c)1760     private final int findCodePoint(int c) {
1761         /* Examples:
1762                                            findCodePoint(c)
1763            set              list[]         c=0 1 3 4 7 8
1764            ===              ==============   ===========
1765            []               [110000]         0 0 0 0 0 0
1766            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
1767            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
1768            [:all:]          [0, 110000]      1 1 1 1 1 1
1769          */
1770 
1771         // Return the smallest i such that c < list[i].  Assume
1772         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
1773         if (c < list[0]) return 0;
1774         // High runner test.  c is often after the last range, so an
1775         // initial check for this condition pays off.
1776         if (len >= 2 && c >= list[len-2]) return len-1;
1777         int lo = 0;
1778         int hi = len - 1;
1779         // invariant: c >= list[lo]
1780         // invariant: c < list[hi]
1781         for (;;) {
1782             int i = (lo + hi) >>> 1;
1783         if (i == lo) return hi;
1784         if (c < list[i]) {
1785             hi = i;
1786         } else {
1787             lo = i;
1788         }
1789         }
1790     }
1791 
1792     //    //----------------------------------------------------------------
1793     //    // Unrolled binary search
1794     //    //----------------------------------------------------------------
1795     //
1796     //    private int validLen = -1; // validated value of len
1797     //    private int topOfLow;
1798     //    private int topOfHigh;
1799     //    private int power;
1800     //    private int deltaStart;
1801     //
1802     //    private void validate() {
1803     //        if (len <= 1) {
1804     //            throw new IllegalArgumentException("list.len==" + len + "; must be >1");
1805     //        }
1806     //
1807     //        // find greatest power of 2 less than or equal to len
1808     //        for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
1809     //
1810     //        // assert(exp2[power] <= len);
1811     //
1812     //        // determine the starting points
1813     //        topOfLow = exp2[power] - 1;
1814     //        topOfHigh = len - 1;
1815     //        deltaStart = exp2[power-1];
1816     //        validLen = len;
1817     //    }
1818     //
1819     //    private static final int exp2[] = {
1820     //        0x1, 0x2, 0x4, 0x8,
1821     //        0x10, 0x20, 0x40, 0x80,
1822     //        0x100, 0x200, 0x400, 0x800,
1823     //        0x1000, 0x2000, 0x4000, 0x8000,
1824     //        0x10000, 0x20000, 0x40000, 0x80000,
1825     //        0x100000, 0x200000, 0x400000, 0x800000,
1826     //        0x1000000, 0x2000000, 0x4000000, 0x8000000,
1827     //        0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
1828     //    };
1829     //
1830     //    /**
1831     //     * Unrolled lowest index GT.
1832     //     */
1833     //    private final int leastIndexGT(int searchValue) {
1834     //
1835     //        if (len != validLen) {
1836     //            if (len == 1) return 0;
1837     //            validate();
1838     //        }
1839     //        int temp;
1840     //
1841     //        // set up initial range to search. Each subrange is a power of two in length
1842     //        int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
1843     //
1844     //        // Completely unrolled binary search, folhighing "Programming Pearls"
1845     //        // Each case deliberately falls through to the next
1846     //        // Logically, list[-1] < all_search_values && list[count] > all_search_values
1847     //        // although the values -1 and count are never actually touched.
1848     //
1849     //        // The bounds at each point are low & high,
1850     //        // where low == high - delta*2
1851     //        // so high - delta is the midpoint
1852     //
1853     //        // The invariant AFTER each line is that list[low] < searchValue <= list[high]
1854     //
1855     //        switch (power) {
1856     //        //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
1857     //        case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
1858     //        case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
1859     //
1860     //        case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
1861     //        case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
1862     //        case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
1863     //        case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
1864     //
1865     //        case 24: if (searchValue < list[temp = high-  0x800000]) high = temp;
1866     //        case 23: if (searchValue < list[temp = high-  0x400000]) high = temp;
1867     //        case 22: if (searchValue < list[temp = high-  0x200000]) high = temp;
1868     //        case 21: if (searchValue < list[temp = high-  0x100000]) high = temp;
1869     //
1870     //        case 20: if (searchValue < list[temp = high-   0x80000]) high = temp;
1871     //        case 19: if (searchValue < list[temp = high-   0x40000]) high = temp;
1872     //        case 18: if (searchValue < list[temp = high-   0x20000]) high = temp;
1873     //        case 17: if (searchValue < list[temp = high-   0x10000]) high = temp;
1874     //
1875     //        case 16: if (searchValue < list[temp = high-    0x8000]) high = temp;
1876     //        case 15: if (searchValue < list[temp = high-    0x4000]) high = temp;
1877     //        case 14: if (searchValue < list[temp = high-    0x2000]) high = temp;
1878     //        case 13: if (searchValue < list[temp = high-    0x1000]) high = temp;
1879     //
1880     //        case 12: if (searchValue < list[temp = high-     0x800]) high = temp;
1881     //        case 11: if (searchValue < list[temp = high-     0x400]) high = temp;
1882     //        case 10: if (searchValue < list[temp = high-     0x200]) high = temp;
1883     //        case  9: if (searchValue < list[temp = high-     0x100]) high = temp;
1884     //
1885     //        case  8: if (searchValue < list[temp = high-      0x80]) high = temp;
1886     //        case  7: if (searchValue < list[temp = high-      0x40]) high = temp;
1887     //        case  6: if (searchValue < list[temp = high-      0x20]) high = temp;
1888     //        case  5: if (searchValue < list[temp = high-      0x10]) high = temp;
1889     //
1890     //        case  4: if (searchValue < list[temp = high-       0x8]) high = temp;
1891     //        case  3: if (searchValue < list[temp = high-       0x4]) high = temp;
1892     //        case  2: if (searchValue < list[temp = high-       0x2]) high = temp;
1893     //        case  1: if (searchValue < list[temp = high-       0x1]) high = temp;
1894     //        }
1895     //
1896     //        return high;
1897     //    }
1898     //
1899     //    // For debugging only
1900     //    public int len() {
1901     //        return len;
1902     //    }
1903     //
1904     //    //----------------------------------------------------------------
1905     //    //----------------------------------------------------------------
1906 
1907     /**
1908      * Returns true if this set contains every character
1909      * of the given range.
1910      * @param start first character, inclusive, of the range
1911      * @param end last character, inclusive, of the range
1912      * @return true if the test condition is met
1913      */
contains(int start, int end)1914     public boolean contains(int start, int end) {
1915         if (start < MIN_VALUE || start > MAX_VALUE) {
1916             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
1917         }
1918         if (end < MIN_VALUE || end > MAX_VALUE) {
1919             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
1920         }
1921         //int i = -1;
1922         //while (true) {
1923         //    if (start < list[++i]) break;
1924         //}
1925         int i = findCodePoint(start);
1926         return ((i & 1) != 0 && end < list[i]);
1927     }
1928 
1929     /**
1930      * Returns <tt>true</tt> if this set contains the given
1931      * multicharacter string.
1932      * @param s string to be checked for containment
1933      * @return <tt>true</tt> if this set contains the specified string
1934      */
contains(CharSequence s)1935     public final boolean contains(CharSequence s) {
1936 
1937         int cp = getSingleCP(s);
1938         if (cp < 0) {
1939             return strings.contains(s.toString());
1940         } else {
1941             return contains(cp);
1942         }
1943     }
1944 
1945     /**
1946      * Returns true if this set contains all the characters and strings
1947      * of the given set.
1948      * @param b set to be checked for containment
1949      * @return true if the test condition is met
1950      */
containsAll(UnicodeSet b)1951     public boolean containsAll(UnicodeSet b) {
1952         // The specified set is a subset if all of its pairs are contained in
1953         // this set. This implementation accesses the lists directly for speed.
1954         // TODO: this could be faster if size() were cached. But that would affect building speed
1955         // so it needs investigation.
1956         int[] listB = b.list;
1957         boolean needA = true;
1958         boolean needB = true;
1959         int aPtr = 0;
1960         int bPtr = 0;
1961         int aLen = len - 1;
1962         int bLen = b.len - 1;
1963         int startA = 0, startB = 0, limitA = 0, limitB = 0;
1964         while (true) {
1965             // double iterations are such a pain...
1966             if (needA) {
1967                 if (aPtr >= aLen) {
1968                     // ran out of A. If B is also exhausted, then break;
1969                     if (needB && bPtr >= bLen) {
1970                         break;
1971                     }
1972                     return false;
1973                 }
1974                 startA = list[aPtr++];
1975                 limitA = list[aPtr++];
1976             }
1977             if (needB) {
1978                 if (bPtr >= bLen) {
1979                     // ran out of B. Since we got this far, we have an A and we are ok so far
1980                     break;
1981                 }
1982                 startB = listB[bPtr++];
1983                 limitB = listB[bPtr++];
1984             }
1985             // if B doesn't overlap and is greater than A, get new A
1986             if (startB >= limitA) {
1987                 needA = true;
1988                 needB = false;
1989                 continue;
1990             }
1991             // if B is wholy contained in A, then get a new B
1992             if (startB >= startA && limitB <= limitA) {
1993                 needA = false;
1994                 needB = true;
1995                 continue;
1996             }
1997             // all other combinations mean we fail
1998             return false;
1999         }
2000 
2001         if (!strings.containsAll(b.strings)) return false;
2002         return true;
2003     }
2004 
2005     //    /**
2006     //     * Returns true if this set contains all the characters and strings
2007     //     * of the given set.
2008     //     * @param c set to be checked for containment
2009     //     * @return true if the test condition is met
2010     //     * @stable ICU 2.0
2011     //     */
2012     //    public boolean containsAllOld(UnicodeSet c) {
2013     //        // The specified set is a subset if all of its pairs are contained in
2014     //        // this set.  It's possible to code this more efficiently in terms of
2015     //        // direct manipulation of the inversion lists if the need arises.
2016     //        int n = c.getRangeCount();
2017     //        for (int i=0; i<n; ++i) {
2018     //            if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
2019     //                return false;
2020     //            }
2021     //        }
2022     //        if (!strings.containsAll(c.strings)) return false;
2023     //        return true;
2024     //    }
2025 
2026     /**
2027      * Returns true if there is a partition of the string such that this set contains each of the partitioned strings.
2028      * For example, for the Unicode set [a{bc}{cd}]<br>
2029      * containsAll is true for each of: "a", "bc", ""cdbca"<br>
2030      * containsAll is false for each of: "acb", "bcda", "bcx"<br>
2031      * @param s string containing characters to be checked for containment
2032      * @return true if the test condition is met
2033      */
containsAll(String s)2034     public boolean containsAll(String s) {
2035         int cp;
2036         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
2037             cp = UTF16.charAt(s, i);
2038             if (!contains(cp))  {
2039                 if (!hasStrings()) {
2040                     return false;
2041                 }
2042                 return containsAll(s, 0);
2043             }
2044         }
2045         return true;
2046     }
2047 
2048     /**
2049      * Recursive routine called if we fail to find a match in containsAll, and there are strings
2050      * @param s source string
2051      * @param i point to match to the end on
2052      * @return true if ok
2053      */
containsAll(String s, int i)2054     private boolean containsAll(String s, int i) {
2055         if (i >= s.length()) {
2056             return true;
2057         }
2058         int  cp= UTF16.charAt(s, i);
2059         if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) {
2060             return true;
2061         }
2062         for (String setStr : strings) {
2063             if (!setStr.isEmpty() &&  // skip the empty string
2064                     s.startsWith(setStr, i) &&  containsAll(s, i+setStr.length())) {
2065                 return true;
2066             }
2067         }
2068         return false;
2069 
2070     }
2071 
2072     /**
2073      * Get the Regex equivalent for this UnicodeSet
2074      * @return regex pattern equivalent to this UnicodeSet
2075      * @deprecated This API is ICU internal only.
2076      * @hide original deprecated declaration
2077      * @hide draft / provisional / internal are hidden on Android
2078      */
2079     @Deprecated
getRegexEquivalent()2080     public String getRegexEquivalent() {
2081         if (!hasStrings()) {
2082             return toString();
2083         }
2084         StringBuilder result = new StringBuilder("(?:");
2085         appendNewPattern(result, true, false);
2086         for (String s : strings) {
2087             result.append('|');
2088             _appendToPat(result, s, true);
2089         }
2090         return result.append(")").toString();
2091     }
2092 
2093     /**
2094      * Returns true if this set contains none of the characters
2095      * of the given range.
2096      * @param start first character, inclusive, of the range
2097      * @param end last character, inclusive, of the range
2098      * @return true if the test condition is met
2099      */
containsNone(int start, int end)2100     public boolean containsNone(int start, int end) {
2101         if (start < MIN_VALUE || start > MAX_VALUE) {
2102             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
2103         }
2104         if (end < MIN_VALUE || end > MAX_VALUE) {
2105             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
2106         }
2107         int i = -1;
2108         while (true) {
2109             if (start < list[++i]) break;
2110         }
2111         return ((i & 1) == 0 && end < list[i]);
2112     }
2113 
2114     /**
2115      * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
2116      * For example, for the Unicode set [a{bc}{cd}]<br>
2117      * containsNone is true for: "xy", "cb"<br>
2118      * containsNone is false for: "a", "bc", "bcd"<br>
2119      * @param b set to be checked for containment
2120      * @return true if the test condition is met
2121      */
containsNone(UnicodeSet b)2122     public boolean containsNone(UnicodeSet b) {
2123         // The specified set is a subset if some of its pairs overlap with some of this set's pairs.
2124         // This implementation accesses the lists directly for speed.
2125         int[] listB = b.list;
2126         boolean needA = true;
2127         boolean needB = true;
2128         int aPtr = 0;
2129         int bPtr = 0;
2130         int aLen = len - 1;
2131         int bLen = b.len - 1;
2132         int startA = 0, startB = 0, limitA = 0, limitB = 0;
2133         while (true) {
2134             // double iterations are such a pain...
2135             if (needA) {
2136                 if (aPtr >= aLen) {
2137                     // ran out of A: break so we test strings
2138                     break;
2139                 }
2140                 startA = list[aPtr++];
2141                 limitA = list[aPtr++];
2142             }
2143             if (needB) {
2144                 if (bPtr >= bLen) {
2145                     // ran out of B: break so we test strings
2146                     break;
2147                 }
2148                 startB = listB[bPtr++];
2149                 limitB = listB[bPtr++];
2150             }
2151             // if B is higher than any part of A, get new A
2152             if (startB >= limitA) {
2153                 needA = true;
2154                 needB = false;
2155                 continue;
2156             }
2157             // if A is higher than any part of B, get new B
2158             if (startA >= limitB) {
2159                 needA = false;
2160                 needB = true;
2161                 continue;
2162             }
2163             // all other combinations mean we fail
2164             return false;
2165         }
2166 
2167         if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false;
2168         return true;
2169     }
2170 
2171     //    /**
2172     //     * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
2173     //     * For example, for the Unicode set [a{bc}{cd}]<br>
2174     //     * containsNone is true for: "xy", "cb"<br>
2175     //     * containsNone is false for: "a", "bc", "bcd"<br>
2176     //     * @param c set to be checked for containment
2177     //     * @return true if the test condition is met
2178     //     * @stable ICU 2.0
2179     //     */
2180     //    public boolean containsNoneOld(UnicodeSet c) {
2181     //        // The specified set is a subset if all of its pairs are contained in
2182     //        // this set.  It's possible to code this more efficiently in terms of
2183     //        // direct manipulation of the inversion lists if the need arises.
2184     //        int n = c.getRangeCount();
2185     //        for (int i=0; i<n; ++i) {
2186     //            if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
2187     //                return false;
2188     //            }
2189     //        }
2190     //        if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false;
2191     //        return true;
2192     //    }
2193 
2194     /**
2195      * Returns true if this set contains none of the characters
2196      * of the given string.
2197      * @param s string containing characters to be checked for containment
2198      * @return true if the test condition is met
2199      */
containsNone(CharSequence s)2200     public boolean containsNone(CharSequence s) {
2201         return span(s, SpanCondition.NOT_CONTAINED) == s.length();
2202     }
2203 
2204     /**
2205      * Returns true if this set contains one or more of the characters
2206      * in the given range.
2207      * @param start first character, inclusive, of the range
2208      * @param end last character, inclusive, of the range
2209      * @return true if the condition is met
2210      */
containsSome(int start, int end)2211     public final boolean containsSome(int start, int end) {
2212         return !containsNone(start, end);
2213     }
2214 
2215     /**
2216      * Returns true if this set contains one or more of the characters
2217      * and strings of the given set.
2218      * @param s set to be checked for containment
2219      * @return true if the condition is met
2220      */
containsSome(UnicodeSet s)2221     public final boolean containsSome(UnicodeSet s) {
2222         return !containsNone(s);
2223     }
2224 
2225     /**
2226      * Returns true if this set contains one or more of the characters
2227      * of the given string.
2228      * @param s string containing characters to be checked for containment
2229      * @return true if the condition is met
2230      */
containsSome(CharSequence s)2231     public final boolean containsSome(CharSequence s) {
2232         return !containsNone(s);
2233     }
2234 
2235 
2236     /**
2237      * Adds all of the elements in the specified set to this set if
2238      * they're not already present.  This operation effectively
2239      * modifies this set so that its value is the <i>union</i> of the two
2240      * sets.  The behavior of this operation is unspecified if the specified
2241      * collection is modified while the operation is in progress.
2242      *
2243      * @param c set whose elements are to be added to this set.
2244      */
addAll(UnicodeSet c)2245     public UnicodeSet addAll(UnicodeSet c) {
2246         checkFrozen();
2247         add(c.list, c.len, 0);
2248         if (c.hasStrings()) {
2249             if (strings == EMPTY_STRINGS) {
2250                 strings = new TreeSet<>(c.strings);
2251             } else {
2252                 strings.addAll(c.strings);
2253             }
2254         }
2255         return this;
2256     }
2257 
2258     /**
2259      * Retains only the elements in this set that are contained in the
2260      * specified set.  In other words, removes from this set all of
2261      * its elements that are not contained in the specified set.  This
2262      * operation effectively modifies this set so that its value is
2263      * the <i>intersection</i> of the two sets.
2264      *
2265      * @param c set that defines which elements this set will retain.
2266      */
retainAll(UnicodeSet c)2267     public UnicodeSet retainAll(UnicodeSet c) {
2268         checkFrozen();
2269         retain(c.list, c.len, 0);
2270         if (hasStrings()) {
2271             if (!c.hasStrings()) {
2272                 strings.clear();
2273             } else {
2274                 strings.retainAll(c.strings);
2275             }
2276         }
2277         return this;
2278     }
2279 
2280     /**
2281      * Removes from this set all of its elements that are contained in the
2282      * specified set.  This operation effectively modifies this
2283      * set so that its value is the <i>asymmetric set difference</i> of
2284      * the two sets.
2285      *
2286      * @param c set that defines which elements will be removed from
2287      *          this set.
2288      */
removeAll(UnicodeSet c)2289     public UnicodeSet removeAll(UnicodeSet c) {
2290         checkFrozen();
2291         retain(c.list, c.len, 2);
2292         if (hasStrings() && c.hasStrings()) {
2293             strings.removeAll(c.strings);
2294         }
2295         return this;
2296     }
2297 
2298     /**
2299      * Complements in this set all elements contained in the specified
2300      * set.  Any character in the other set will be removed if it is
2301      * in this set, or will be added if it is not in this set.
2302      *
2303      * @param c set that defines which elements will be complemented from
2304      *          this set.
2305      */
complementAll(UnicodeSet c)2306     public UnicodeSet complementAll(UnicodeSet c) {
2307         checkFrozen();
2308         xor(c.list, c.len, 0);
2309         if (c.hasStrings()) {
2310             if (strings == EMPTY_STRINGS) {
2311                 strings = new TreeSet<>(c.strings);
2312             } else {
2313                 SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings);
2314             }
2315         }
2316         return this;
2317     }
2318 
2319     /**
2320      * Removes all of the elements from this set.  This set will be
2321      * empty after this call returns.
2322      */
clear()2323     public UnicodeSet clear() {
2324         checkFrozen();
2325         list[0] = HIGH;
2326         len = 1;
2327         pat = null;
2328         if (hasStrings()) {
2329             strings.clear();
2330         }
2331         return this;
2332     }
2333 
2334     /**
2335      * Iteration method that returns the number of ranges contained in
2336      * this set.
2337      * @see #getRangeStart
2338      * @see #getRangeEnd
2339      */
getRangeCount()2340     public int getRangeCount() {
2341         return len/2;
2342     }
2343 
2344     /**
2345      * Iteration method that returns the first character in the
2346      * specified range of this set.
2347      * @exception ArrayIndexOutOfBoundsException if index is outside
2348      * the range <code>0..getRangeCount()-1</code>
2349      * @see #getRangeCount
2350      * @see #getRangeEnd
2351      */
getRangeStart(int index)2352     public int getRangeStart(int index) {
2353         return list[index*2];
2354     }
2355 
2356     /**
2357      * Iteration method that returns the last character in the
2358      * specified range of this set.
2359      * @exception ArrayIndexOutOfBoundsException if index is outside
2360      * the range <code>0..getRangeCount()-1</code>
2361      * @see #getRangeStart
2362      * @see #getRangeEnd
2363      */
getRangeEnd(int index)2364     public int getRangeEnd(int index) {
2365         return (list[index*2 + 1] - 1);
2366     }
2367 
2368     /**
2369      * Reallocate this objects internal structures to take up the least
2370      * possible space, without changing this object's value.
2371      */
compact()2372     public UnicodeSet compact() {
2373         checkFrozen();
2374         if ((len + 7) < list.length) {
2375             // If we have more than a little unused capacity, shrink it to len.
2376             list = Arrays.copyOf(list, len);
2377         }
2378         rangeList = null;
2379         buffer = null;
2380         if (strings != EMPTY_STRINGS && strings.isEmpty()) {
2381             strings = EMPTY_STRINGS;
2382         }
2383         return this;
2384     }
2385 
2386     /**
2387      * Compares the specified object with this set for equality.  Returns
2388      * <tt>true</tt> if the specified object is also a set, the two sets
2389      * have the same size, and every member of the specified set is
2390      * contained in this set (or equivalently, every member of this set is
2391      * contained in the specified set).
2392      *
2393      * @param o Object to be compared for equality with this set.
2394      * @return <tt>true</tt> if the specified Object is equal to this set.
2395      */
2396     @Override
equals(Object o)2397     public boolean equals(Object o) {
2398         if (o == null) {
2399             return false;
2400         }
2401         if (this == o) {
2402             return true;
2403         }
2404         try {
2405             UnicodeSet that = (UnicodeSet) o;
2406             if (len != that.len) return false;
2407             for (int i = 0; i < len; ++i) {
2408                 if (list[i] != that.list[i]) return false;
2409             }
2410             if (!strings.equals(that.strings)) return false;
2411         } catch (Exception e) {
2412             return false;
2413         }
2414         return true;
2415     }
2416 
2417     /**
2418      * Returns the hash code value for this set.
2419      *
2420      * @return the hash code value for this set.
2421      * @see java.lang.Object#hashCode()
2422      */
2423     @Override
hashCode()2424     public int hashCode() {
2425         int result = len;
2426         for (int i = 0; i < len; ++i) {
2427             result *= 1000003;
2428             result += list[i];
2429         }
2430         return result;
2431     }
2432 
2433     /**
2434      * Return a programmer-readable string representation of this object.
2435      */
2436     @Override
toString()2437     public String toString() {
2438         return toPattern(true);
2439     }
2440 
2441     //----------------------------------------------------------------
2442     // Implementation: Pattern parsing
2443     //----------------------------------------------------------------
2444 
2445     /**
2446      * Parses the given pattern, starting at the given position.  The character
2447      * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
2448      * Parsing continues until the corresponding closing ']'.  If a syntax error
2449      * is encountered between the opening and closing brace, the parse fails.
2450      * Upon return from a successful parse, the ParsePosition is updated to
2451      * point to the character following the closing ']', and an inversion
2452      * list for the parsed pattern is returned.  This method
2453      * calls itself recursively to parse embedded subpatterns.
2454      *
2455      * @param pattern the string containing the pattern to be parsed.  The
2456      * portion of the string from pos.getIndex(), which must be a '[', to the
2457      * corresponding closing ']', is parsed.
2458      * @param pos upon entry, the position at which to being parsing.  The
2459      * character at pattern.charAt(pos.getIndex()) must be a '['.  Upon return
2460      * from a successful parse, pos.getIndex() is either the character after the
2461      * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
2462      * is the last character of the pattern string.
2463      * @return an inversion list for the parsed substring
2464      * of <code>pattern</code>
2465      * @exception java.lang.IllegalArgumentException if the parse fails.
2466      * @deprecated This API is ICU internal only.
2467      * @hide original deprecated declaration
2468      * @hide draft / provisional / internal are hidden on Android
2469      */
2470     @Deprecated
applyPattern(String pattern, ParsePosition pos, SymbolTable symbols, int options)2471     public UnicodeSet applyPattern(String pattern,
2472             ParsePosition pos,
2473             SymbolTable symbols,
2474             int options) {
2475 
2476         // Need to build the pattern in a temporary string because
2477         // _applyPattern calls add() etc., which set pat to empty.
2478         boolean parsePositionWasNull = pos == null;
2479         if (parsePositionWasNull) {
2480             pos = new ParsePosition(0);
2481         }
2482 
2483         StringBuilder rebuiltPat = new StringBuilder();
2484         RuleCharacterIterator chars =
2485                 new RuleCharacterIterator(pattern, symbols, pos);
2486         applyPattern(chars, symbols, rebuiltPat, options, 0);
2487         if (chars.inVariable()) {
2488             syntaxError(chars, "Extra chars in variable value");
2489         }
2490         pat = rebuiltPat.toString();
2491         if (parsePositionWasNull) {
2492             int i = pos.getIndex();
2493 
2494             // Skip over trailing whitespace
2495             if ((options & IGNORE_SPACE) != 0) {
2496                 i = PatternProps.skipWhiteSpace(pattern, i);
2497             }
2498 
2499             if (i != pattern.length()) {
2500                 throw new IllegalArgumentException("Parse of \"" + pattern +
2501                         "\" failed at " + i);
2502             }
2503         }
2504         return this;
2505     }
2506 
2507     // Add constants to make the applyPattern() code easier to follow.
2508 
2509     private static final int LAST0_START = 0,
2510             LAST1_RANGE = 1,
2511             LAST2_SET = 2;
2512 
2513     private static final int MODE0_NONE = 0,
2514             MODE1_INBRACKET = 1,
2515             MODE2_OUTBRACKET = 2;
2516 
2517     private static final int SETMODE0_NONE = 0,
2518             SETMODE1_UNICODESET = 1,
2519             SETMODE2_PROPERTYPAT = 2,
2520             SETMODE3_PREPARSED = 3;
2521 
2522     private static final int MAX_DEPTH = 100;
2523 
2524     /**
2525      * Parse the pattern from the given RuleCharacterIterator.  The
2526      * iterator is advanced over the parsed pattern.
2527      * @param chars iterator over the pattern characters.  Upon return
2528      * it will be advanced to the first character after the parsed
2529      * pattern, or the end of the iteration if all characters are
2530      * parsed.
2531      * @param symbols symbol table to use to parse and dereference
2532      * variables, or null if none.
2533      * @param rebuiltPat the pattern that was parsed, rebuilt or
2534      * copied from the input pattern, as appropriate.
2535      * @param options a bit mask.
2536      * Valid options are {@link #IGNORE_SPACE} and
2537      * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
2538      * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
2539      */
applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options, int depth)2540     private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
2541             Appendable rebuiltPat, int options, int depth) {
2542         if (depth > MAX_DEPTH) {
2543             syntaxError(chars, "Pattern nested too deeply");
2544         }
2545 
2546         // Syntax characters: [ ] ^ - & { }
2547 
2548         // Recognized special forms for chars, sets: c-c s-s s&s
2549 
2550         int opts = RuleCharacterIterator.PARSE_VARIABLES |
2551                 RuleCharacterIterator.PARSE_ESCAPES;
2552         if ((options & IGNORE_SPACE) != 0) {
2553             opts |= RuleCharacterIterator.SKIP_WHITESPACE;
2554         }
2555 
2556         StringBuilder patBuf = new StringBuilder(), buf = null;
2557         boolean usePat = false;
2558         UnicodeSet scratch = null;
2559         RuleCharacterIterator.Position backup = null;
2560 
2561         // mode: 0=before [, 1=between [...], 2=after ]
2562         // lastItem: 0=none, 1=char, 2=set
2563         int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE;
2564         char op = 0;
2565 
2566         boolean invert = false;
2567 
2568         clear();
2569         String lastString = null;
2570 
2571         while (mode != MODE2_OUTBRACKET && !chars.atEnd()) {
2572             //Eclipse stated the following is "dead code"
2573             /*
2574             if (false) {
2575                 // Debugging assertion
2576                 if (!((lastItem == 0 && op == 0) ||
2577                         (lastItem == 1 && (op == 0 || op == '-')) ||
2578                         (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
2579                     throw new IllegalArgumentException();
2580                 }
2581             }*/
2582 
2583             int c = 0;
2584             boolean literal = false;
2585             UnicodeSet nested = null;
2586 
2587             // -------- Check for property pattern
2588 
2589             // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
2590             int setMode = SETMODE0_NONE;
2591             if (resemblesPropertyPattern(chars, opts)) {
2592                 setMode = SETMODE2_PROPERTYPAT;
2593             }
2594 
2595             // -------- Parse '[' of opening delimiter OR nested set.
2596             // If there is a nested set, use `setMode' to define how
2597             // the set should be parsed.  If the '[' is part of the
2598             // opening delimiter for this pattern, parse special
2599             // strings "[", "[^", "[-", and "[^-".  Check for stand-in
2600             // characters representing a nested set in the symbol
2601             // table.
2602 
2603             else {
2604                 // Prepare to backup if necessary
2605                 backup = chars.getPos(backup);
2606                 c = chars.next(opts);
2607                 literal = chars.isEscaped();
2608 
2609                 if (c == '[' && !literal) {
2610                     if (mode == MODE1_INBRACKET) {
2611                         chars.setPos(backup); // backup
2612                         setMode = SETMODE1_UNICODESET;
2613                     } else {
2614                         // Handle opening '[' delimiter
2615                         mode = MODE1_INBRACKET;
2616                         patBuf.append('[');
2617                         backup = chars.getPos(backup); // prepare to backup
2618                         c = chars.next(opts);
2619                         literal = chars.isEscaped();
2620                         if (c == '^' && !literal) {
2621                             invert = true;
2622                             patBuf.append('^');
2623                             backup = chars.getPos(backup); // prepare to backup
2624                             c = chars.next(opts);
2625                             literal = chars.isEscaped();
2626                         }
2627                         // Fall through to handle special leading '-';
2628                         // otherwise restart loop for nested [], \p{}, etc.
2629                         if (c == '-') {
2630                             literal = true;
2631                             // Fall through to handle literal '-' below
2632                         } else {
2633                             chars.setPos(backup); // backup
2634                             continue;
2635                         }
2636                     }
2637                 } else if (symbols != null) {
2638                     UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
2639                     if (m != null) {
2640                         try {
2641                             nested = (UnicodeSet) m;
2642                             setMode = SETMODE3_PREPARSED;
2643                         } catch (ClassCastException e) {
2644                             syntaxError(chars, "Syntax error");
2645                         }
2646                     }
2647                 }
2648             }
2649 
2650             // -------- Handle a nested set.  This either is inline in
2651             // the pattern or represented by a stand-in that has
2652             // previously been parsed and was looked up in the symbol
2653             // table.
2654 
2655             if (setMode != SETMODE0_NONE) {
2656                 if (lastItem == LAST1_RANGE) {
2657                     if (op != 0) {
2658                         syntaxError(chars, "Char expected after operator");
2659                     }
2660                     add_unchecked(lastChar, lastChar);
2661                     _appendToPat(patBuf, lastChar, false);
2662                     lastItem = LAST0_START;
2663                     op = 0;
2664                 }
2665 
2666                 if (op == '-' || op == '&') {
2667                     patBuf.append(op);
2668                 }
2669 
2670                 if (nested == null) {
2671                     if (scratch == null) scratch = new UnicodeSet();
2672                     nested = scratch;
2673                 }
2674                 switch (setMode) {
2675                 case SETMODE1_UNICODESET:
2676                     nested.applyPattern(chars, symbols, patBuf, options, depth + 1);
2677                     break;
2678                 case SETMODE2_PROPERTYPAT:
2679                     chars.skipIgnored(opts);
2680                     nested.applyPropertyPattern(chars, patBuf, symbols);
2681                     break;
2682                 case SETMODE3_PREPARSED: // `nested' already parsed
2683                     nested._toPattern(patBuf, false);
2684                     break;
2685                 }
2686 
2687                 usePat = true;
2688 
2689                 if (mode == MODE0_NONE) {
2690                     // Entire pattern is a category; leave parse loop
2691                     set(nested);
2692                     mode = MODE2_OUTBRACKET;
2693                     break;
2694                 }
2695 
2696                 switch (op) {
2697                 case '-':
2698                     removeAll(nested);
2699                     break;
2700                 case '&':
2701                     retainAll(nested);
2702                     break;
2703                 case 0:
2704                     addAll(nested);
2705                     break;
2706                 }
2707 
2708                 op = 0;
2709                 lastItem = LAST2_SET;
2710 
2711                 continue;
2712             }
2713 
2714             if (mode == MODE0_NONE) {
2715                 syntaxError(chars, "Missing '['");
2716             }
2717 
2718             // -------- Parse special (syntax) characters.  If the
2719             // current character is not special, or if it is escaped,
2720             // then fall through and handle it below.
2721 
2722             if (!literal) {
2723                 switch (c) {
2724                 case ']':
2725                     if (lastItem == LAST1_RANGE) {
2726                         add_unchecked(lastChar, lastChar);
2727                         _appendToPat(patBuf, lastChar, false);
2728                     }
2729                     // Treat final trailing '-' as a literal
2730                     if (op == '-') {
2731                         add_unchecked(op, op);
2732                         patBuf.append(op);
2733                     } else if (op == '&') {
2734                         syntaxError(chars, "Trailing '&'");
2735                     }
2736                     patBuf.append(']');
2737                     mode = MODE2_OUTBRACKET;
2738                     continue;
2739                 case '-':
2740                     if (op == 0) {
2741                         if (lastItem != LAST0_START) {
2742                             op = (char) c;
2743                             continue;
2744                         } else if (lastString != null) {
2745                             op = (char) c;
2746                             continue;
2747                         } else {
2748                             // Treat final trailing '-' as a literal
2749                             add_unchecked(c, c);
2750                             c = chars.next(opts);
2751                             literal = chars.isEscaped();
2752                             if (c == ']' && !literal) {
2753                                 patBuf.append("-]");
2754                                 mode = MODE2_OUTBRACKET;
2755                                 continue;
2756                             }
2757                         }
2758                     }
2759                     syntaxError(chars, "'-' not after char, string, or set");
2760                     break;
2761                 case '&':
2762                     if (lastItem == LAST2_SET && op == 0) {
2763                         op = (char) c;
2764                         continue;
2765                     }
2766                     syntaxError(chars, "'&' not after set");
2767                     break;
2768                 case '^':
2769                     syntaxError(chars, "'^' not after '['");
2770                     break;
2771                 case '{':
2772                     if (op != 0 && op != '-') {
2773                         syntaxError(chars, "Missing operand after operator");
2774                     }
2775                     if (lastItem == LAST1_RANGE) {
2776                         add_unchecked(lastChar, lastChar);
2777                         _appendToPat(patBuf, lastChar, false);
2778                     }
2779                     lastItem = LAST0_START;
2780                     if (buf == null) {
2781                         buf = new StringBuilder();
2782                     } else {
2783                         buf.setLength(0);
2784                     }
2785                     boolean ok = false;
2786                     while (!chars.atEnd()) {
2787                         c = chars.next(opts);
2788                         literal = chars.isEscaped();
2789                         if (c == '}' && !literal) {
2790                             ok = true;
2791                             break;
2792                         }
2793                         appendCodePoint(buf, c);
2794                     }
2795                     if (!ok) {
2796                         syntaxError(chars, "Invalid multicharacter string");
2797                     }
2798                     // We have new string. Add it to set and continue;
2799                     // we don't need to drop through to the further
2800                     // processing
2801                     String curString = buf.toString();
2802                     if (op == '-') {
2803                         int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString);
2804                         int curSingle = CharSequences.getSingleCodePoint(curString);
2805                         if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) {
2806                             add(lastSingle,curSingle);
2807                         } else {
2808                             if (strings == EMPTY_STRINGS) {
2809                                 strings = new TreeSet<>();
2810                             }
2811                             try {
2812                                 StringRange.expand(lastString, curString, true, strings);
2813                             } catch (Exception e) {
2814                                 syntaxError(chars, e.getMessage());
2815                             }
2816                         }
2817                         lastString = null;
2818                         op = 0;
2819                     } else {
2820                         add(curString);
2821                         lastString = curString;
2822                     }
2823                     patBuf.append('{');
2824                     _appendToPat(patBuf, curString, false);
2825                     patBuf.append('}');
2826                     continue;
2827                 case SymbolTable.SYMBOL_REF:
2828                     //         symbols  nosymbols
2829                     // [a-$]   error    error (ambiguous)
2830                     // [a$]    anchor   anchor
2831                     // [a-$x]  var "x"* literal '$'
2832                     // [a-$.]  error    literal '$'
2833                     // *We won't get here in the case of var "x"
2834                     backup = chars.getPos(backup);
2835                     c = chars.next(opts);
2836                     literal = chars.isEscaped();
2837                     boolean anchor = (c == ']' && !literal);
2838                     if (symbols == null && !anchor) {
2839                         c = SymbolTable.SYMBOL_REF;
2840                         chars.setPos(backup);
2841                         break; // literal '$'
2842                     }
2843                     if (anchor && op == 0) {
2844                         if (lastItem == LAST1_RANGE) {
2845                             add_unchecked(lastChar, lastChar);
2846                             _appendToPat(patBuf, lastChar, false);
2847                         }
2848                         add_unchecked(UnicodeMatcher.ETHER);
2849                         usePat = true;
2850                         patBuf.append(SymbolTable.SYMBOL_REF).append(']');
2851                         mode = MODE2_OUTBRACKET;
2852                         continue;
2853                     }
2854                     syntaxError(chars, "Unquoted '$'");
2855                     break;
2856                 default:
2857                     break;
2858                 }
2859             }
2860 
2861             // -------- Parse literal characters.  This includes both
2862             // escaped chars ("\u4E01") and non-syntax characters
2863             // ("a").
2864 
2865             switch (lastItem) {
2866             case LAST0_START:
2867                 if (op == '-' && lastString != null) {
2868                     syntaxError(chars, "Invalid range");
2869                 }
2870                 lastItem = LAST1_RANGE;
2871                 lastChar = c;
2872                 lastString = null;
2873                 break;
2874             case LAST1_RANGE:
2875                 if (op == '-') {
2876                     if (lastString != null) {
2877                         syntaxError(chars, "Invalid range");
2878                     }
2879                     if (lastChar >= c) {
2880                         // Don't allow redundant (a-a) or empty (b-a) ranges;
2881                         // these are most likely typos.
2882                         syntaxError(chars, "Invalid range");
2883                     }
2884                     add_unchecked(lastChar, c);
2885                     _appendToPat(patBuf, lastChar, false);
2886                     patBuf.append(op);
2887                     _appendToPat(patBuf, c, false);
2888                     lastItem = LAST0_START;
2889                     op = 0;
2890                 } else {
2891                     add_unchecked(lastChar, lastChar);
2892                     _appendToPat(patBuf, lastChar, false);
2893                     lastChar = c;
2894                 }
2895                 break;
2896             case LAST2_SET:
2897                 if (op != 0) {
2898                     syntaxError(chars, "Set expected after operator");
2899                 }
2900                 lastChar = c;
2901                 lastItem = LAST1_RANGE;
2902                 break;
2903             }
2904         }
2905 
2906         if (mode != MODE2_OUTBRACKET) {
2907             syntaxError(chars, "Missing ']'");
2908         }
2909 
2910         chars.skipIgnored(opts);
2911 
2912         /**
2913          * Handle global flags (invert, case insensitivity).  If this
2914          * pattern should be compiled case-insensitive, then we need
2915          * to close over case BEFORE COMPLEMENTING.  This makes
2916          * patterns like /[^abc]/i work.
2917          */
2918         if ((options & CASE_MASK) != 0) {
2919             closeOver(options);
2920         }
2921         if (invert) {
2922             complement().removeAllStrings();  // code point complement
2923         }
2924 
2925         // Use the rebuilt pattern (pat) only if necessary.  Prefer the
2926         // generated pattern.
2927         if (usePat) {
2928             append(rebuiltPat, patBuf.toString());
2929         } else {
2930             appendNewPattern(rebuiltPat, false, true);
2931         }
2932     }
2933 
syntaxError(RuleCharacterIterator chars, String msg)2934     private static void syntaxError(RuleCharacterIterator chars, String msg) {
2935         throw new IllegalArgumentException("Error: " + msg + " at \"" +
2936                 Utility.escape(chars.toString()) +
2937                 '"');
2938     }
2939 
2940     /**
2941      * Add the contents of the UnicodeSet (as strings) into a collection.
2942      * @param target collection to add into
2943      */
addAllTo(T target)2944     public <T extends Collection<String>> T addAllTo(T target) {
2945         return addAllTo(this, target);
2946     }
2947 
2948 
2949     /**
2950      * Add the contents of the UnicodeSet (as strings) into a collection.
2951      * @param target collection to add into
2952      * @hide unsupported on Android
2953      */
addAllTo(String[] target)2954     public String[] addAllTo(String[] target) {
2955         return addAllTo(this, target);
2956     }
2957 
2958     /**
2959      * Add the contents of the UnicodeSet (as strings) into an array.
2960      * @hide unsupported on Android
2961      */
toArray(UnicodeSet set)2962     public static String[] toArray(UnicodeSet set) {
2963         return addAllTo(set, new String[set.size()]);
2964     }
2965 
2966     /**
2967      * Add the contents of the collection (as strings) into this UnicodeSet.
2968      * The collection must not contain null.
2969      * @param source the collection to add
2970      * @return a reference to this object
2971      */
add(Iterable<?> source)2972     public UnicodeSet add(Iterable<?> source) {
2973         return addAll(source);
2974     }
2975 
2976     /**
2977      * Add a collection (as strings) into this UnicodeSet.
2978      * Uses standard naming convention.
2979      * @param source collection to add into
2980      * @return a reference to this object
2981      */
addAll(Iterable<?> source)2982     public UnicodeSet addAll(Iterable<?> source) {
2983         checkFrozen();
2984         for (Object o : source) {
2985             add(o.toString());
2986         }
2987         return this;
2988     }
2989 
2990     //----------------------------------------------------------------
2991     // Implementation: Utility methods
2992     //----------------------------------------------------------------
2993 
nextCapacity(int minCapacity)2994     private int nextCapacity(int minCapacity) {
2995         // Grow exponentially to reduce the frequency of allocations.
2996         if (minCapacity < INITIAL_CAPACITY) {
2997             return minCapacity + INITIAL_CAPACITY;
2998         } else if (minCapacity <= 2500) {
2999             return 5 * minCapacity;
3000         } else {
3001             int newCapacity = 2 * minCapacity;
3002             if (newCapacity > MAX_LENGTH) {
3003                 newCapacity = MAX_LENGTH;
3004             }
3005             return newCapacity;
3006         }
3007     }
3008 
ensureCapacity(int newLen)3009     private void ensureCapacity(int newLen) {
3010         if (newLen > MAX_LENGTH) {
3011             newLen = MAX_LENGTH;
3012         }
3013         if (newLen <= list.length) return;
3014         int newCapacity = nextCapacity(newLen);
3015         int[] temp = new int[newCapacity];
3016         // Copy only the actual contents.
3017         System.arraycopy(list, 0, temp, 0, len);
3018         list = temp;
3019     }
3020 
ensureBufferCapacity(int newLen)3021     private void ensureBufferCapacity(int newLen) {
3022         if (newLen > MAX_LENGTH) {
3023             newLen = MAX_LENGTH;
3024         }
3025         if (buffer != null && newLen <= buffer.length) return;
3026         int newCapacity = nextCapacity(newLen);
3027         buffer = new int[newCapacity];
3028         // The buffer has no contents to be copied.
3029         // It is always filled from scratch after this call.
3030     }
3031 
3032     /**
3033      * Assumes start <= end.
3034      */
range(int start, int end)3035     private int[] range(int start, int end) {
3036         if (rangeList == null) {
3037             rangeList = new int[] { start, end+1, HIGH };
3038         } else {
3039             rangeList[0] = start;
3040             rangeList[1] = end+1;
3041         }
3042         return rangeList;
3043     }
3044 
3045     //----------------------------------------------------------------
3046     // Implementation: Fundamental operations
3047     //----------------------------------------------------------------
3048 
3049     // polarity = 0, 3 is normal: x xor y
3050     // polarity = 1, 2: x xor ~y == x === y
3051 
xor(int[] other, int otherLen, int polarity)3052     private UnicodeSet xor(int[] other, int otherLen, int polarity) {
3053         ensureBufferCapacity(len + otherLen);
3054         int i = 0, j = 0, k = 0;
3055         int a = list[i++];
3056         int b;
3057         // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used
3058         //      so the following if statement will not be called.
3059         ///CLOVER:OFF
3060         if (polarity == 1 || polarity == 2) {
3061             b = LOW;
3062             if (other[j] == LOW) { // skip base if already LOW
3063                 ++j;
3064                 b = other[j];
3065             }
3066             ///CLOVER:ON
3067         } else {
3068             b = other[j++];
3069         }
3070         // simplest of all the routines
3071         // sort the values, discarding identicals!
3072         while (true) {
3073             if (a < b) {
3074                 buffer[k++] = a;
3075                 a = list[i++];
3076             } else if (b < a) {
3077                 buffer[k++] = b;
3078                 b = other[j++];
3079             } else if (a != HIGH) { // at this point, a == b
3080                 // discard both values!
3081                 a = list[i++];
3082                 b = other[j++];
3083             } else { // DONE!
3084                 buffer[k++] = HIGH;
3085                 len = k;
3086                 break;
3087             }
3088         }
3089         // swap list and buffer
3090         int[] temp = list;
3091         list = buffer;
3092         buffer = temp;
3093         pat = null;
3094         return this;
3095     }
3096 
3097     // polarity = 0 is normal: x union y
3098     // polarity = 2: x union ~y
3099     // polarity = 1: ~x union y
3100     // polarity = 3: ~x union ~y
3101 
add(int[] other, int otherLen, int polarity)3102     private UnicodeSet add(int[] other, int otherLen, int polarity) {
3103         ensureBufferCapacity(len + otherLen);
3104         int i = 0, j = 0, k = 0;
3105         int a = list[i++];
3106         int b = other[j++];
3107         // change from xor is that we have to check overlapping pairs
3108         // polarity bit 1 means a is second, bit 2 means b is.
3109         main:
3110             while (true) {
3111                 switch (polarity) {
3112                 case 0: // both first; take lower if unequal
3113                     if (a < b) { // take a
3114                         // Back up over overlapping ranges in buffer[]
3115                         if (k > 0 && a <= buffer[k-1]) {
3116                             // Pick latter end value in buffer[] vs. list[]
3117                             a = max(list[i], buffer[--k]);
3118                         } else {
3119                             // No overlap
3120                             buffer[k++] = a;
3121                             a = list[i];
3122                         }
3123                         i++; // Common if/else code factored out
3124                         polarity ^= 1;
3125                     } else if (b < a) { // take b
3126                         if (k > 0 && b <= buffer[k-1]) {
3127                             b = max(other[j], buffer[--k]);
3128                         } else {
3129                             buffer[k++] = b;
3130                             b = other[j];
3131                         }
3132                         j++;
3133                         polarity ^= 2;
3134                     } else { // a == b, take a, drop b
3135                         if (a == HIGH) break main;
3136                         // This is symmetrical; it doesn't matter if
3137                         // we backtrack with a or b. - liu
3138                         if (k > 0 && a <= buffer[k-1]) {
3139                             a = max(list[i], buffer[--k]);
3140                         } else {
3141                             // No overlap
3142                             buffer[k++] = a;
3143                             a = list[i];
3144                         }
3145                         i++;
3146                         polarity ^= 1;
3147                         b = other[j++]; polarity ^= 2;
3148                     }
3149                     break;
3150                 case 3: // both second; take higher if unequal, and drop other
3151                     if (b <= a) { // take a
3152                         if (a == HIGH) break main;
3153                         buffer[k++] = a;
3154                     } else { // take b
3155                         if (b == HIGH) break main;
3156                         buffer[k++] = b;
3157                     }
3158                     a = list[i++]; polarity ^= 1;   // factored common code
3159                     b = other[j++]; polarity ^= 2;
3160                     break;
3161                 case 1: // a second, b first; if b < a, overlap
3162                     if (a < b) { // no overlap, take a
3163                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3164                     } else if (b < a) { // OVERLAP, drop b
3165                         b = other[j++]; polarity ^= 2;
3166                     } else { // a == b, drop both!
3167                         if (a == HIGH) break main;
3168                         a = list[i++]; polarity ^= 1;
3169                         b = other[j++]; polarity ^= 2;
3170                     }
3171                     break;
3172                 case 2: // a first, b second; if a < b, overlap
3173                     if (b < a) { // no overlap, take b
3174                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
3175                     } else  if (a < b) { // OVERLAP, drop a
3176                         a = list[i++]; polarity ^= 1;
3177                     } else { // a == b, drop both!
3178                         if (a == HIGH) break main;
3179                         a = list[i++]; polarity ^= 1;
3180                         b = other[j++]; polarity ^= 2;
3181                     }
3182                     break;
3183                 }
3184             }
3185         buffer[k++] = HIGH;    // terminate
3186         len = k;
3187         // swap list and buffer
3188         int[] temp = list;
3189         list = buffer;
3190         buffer = temp;
3191         pat = null;
3192         return this;
3193     }
3194 
3195     // polarity = 0 is normal: x intersect y
3196     // polarity = 2: x intersect ~y == set-minus
3197     // polarity = 1: ~x intersect y
3198     // polarity = 3: ~x intersect ~y
3199 
retain(int[] other, int otherLen, int polarity)3200     private UnicodeSet retain(int[] other, int otherLen, int polarity) {
3201         ensureBufferCapacity(len + otherLen);
3202         int i = 0, j = 0, k = 0;
3203         int a = list[i++];
3204         int b = other[j++];
3205         // change from xor is that we have to check overlapping pairs
3206         // polarity bit 1 means a is second, bit 2 means b is.
3207         main:
3208             while (true) {
3209                 switch (polarity) {
3210                 case 0: // both first; drop the smaller
3211                     if (a < b) { // drop a
3212                         a = list[i++]; polarity ^= 1;
3213                     } else if (b < a) { // drop b
3214                         b = other[j++]; polarity ^= 2;
3215                     } else { // a == b, take one, drop other
3216                         if (a == HIGH) break main;
3217                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3218                         b = other[j++]; polarity ^= 2;
3219                     }
3220                     break;
3221                 case 3: // both second; take lower if unequal
3222                     if (a < b) { // take a
3223                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3224                     } else if (b < a) { // take b
3225                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
3226                     } else { // a == b, take one, drop other
3227                         if (a == HIGH) break main;
3228                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3229                         b = other[j++]; polarity ^= 2;
3230                     }
3231                     break;
3232                 case 1: // a second, b first;
3233                     if (a < b) { // NO OVERLAP, drop a
3234                         a = list[i++]; polarity ^= 1;
3235                     } else if (b < a) { // OVERLAP, take b
3236                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
3237                     } else { // a == b, drop both!
3238                         if (a == HIGH) break main;
3239                         a = list[i++]; polarity ^= 1;
3240                         b = other[j++]; polarity ^= 2;
3241                     }
3242                     break;
3243                 case 2: // a first, b second; if a < b, overlap
3244                     if (b < a) { // no overlap, drop b
3245                         b = other[j++]; polarity ^= 2;
3246                     } else  if (a < b) { // OVERLAP, take a
3247                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
3248                     } else { // a == b, drop both!
3249                         if (a == HIGH) break main;
3250                         a = list[i++]; polarity ^= 1;
3251                         b = other[j++]; polarity ^= 2;
3252                     }
3253                     break;
3254                 }
3255             }
3256         buffer[k++] = HIGH;    // terminate
3257         len = k;
3258         // swap list and buffer
3259         int[] temp = list;
3260         list = buffer;
3261         buffer = temp;
3262         pat = null;
3263         return this;
3264     }
3265 
max(int a, int b)3266     private static final int max(int a, int b) {
3267         return (a > b) ? a : b;
3268     }
3269 
3270     //----------------------------------------------------------------
3271     // Generic filter-based scanning code
3272     //----------------------------------------------------------------
3273 
3274     private static interface Filter {
contains(int codePoint)3275         boolean contains(int codePoint);
3276     }
3277 
3278     private static final class NumericValueFilter implements Filter {
3279         double value;
NumericValueFilter(double value)3280         NumericValueFilter(double value) { this.value = value; }
3281         @Override
contains(int ch)3282         public boolean contains(int ch) {
3283             return UCharacter.getUnicodeNumericValue(ch) == value;
3284         }
3285     }
3286 
3287     private static final class GeneralCategoryMaskFilter implements Filter {
3288         int mask;
GeneralCategoryMaskFilter(int mask)3289         GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
3290         @Override
contains(int ch)3291         public boolean contains(int ch) {
3292             return ((1 << UCharacter.getType(ch)) & mask) != 0;
3293         }
3294     }
3295 
3296     private static final class IntPropertyFilter implements Filter {
3297         int prop;
3298         int value;
IntPropertyFilter(int prop, int value)3299         IntPropertyFilter(int prop, int value) {
3300             this.prop = prop;
3301             this.value = value;
3302         }
3303         @Override
contains(int ch)3304         public boolean contains(int ch) {
3305             return UCharacter.getIntPropertyValue(ch, prop) == value;
3306         }
3307     }
3308 
3309     private static final class ScriptExtensionsFilter implements Filter {
3310         int script;
ScriptExtensionsFilter(int script)3311         ScriptExtensionsFilter(int script) { this.script = script; }
3312         @Override
contains(int c)3313         public boolean contains(int c) {
3314             return UScript.hasScript(c, script);
3315         }
3316     }
3317 
3318     private static final class IdentifierTypeFilter implements Filter {
3319         int idType;
IdentifierTypeFilter(int idType)3320         IdentifierTypeFilter(int idType) { this.idType = idType; }
3321         @Override
contains(int c)3322         public boolean contains(int c) {
3323             return UCharacterProperty.INSTANCE.hasIDType(c, idType);
3324         }
3325     }
3326 
3327     // VersionInfo for unassigned characters
3328     private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
3329 
3330     private static final class VersionFilter implements Filter {
3331         VersionInfo version;
VersionFilter(VersionInfo version)3332         VersionFilter(VersionInfo version) { this.version = version; }
3333         @Override
contains(int ch)3334         public boolean contains(int ch) {
3335             VersionInfo v = UCharacter.getAge(ch);
3336             // Reference comparison ok; VersionInfo caches and reuses
3337             // unique objects.
3338             return !Utility.sameObjects(v, NO_VERSION) &&
3339                     v.compareTo(version) <= 0;
3340         }
3341     }
3342 
3343     /**
3344      * Generic filter-based scanning code for UCD property UnicodeSets.
3345      */
applyFilter(Filter filter, UnicodeSet inclusions)3346     private void applyFilter(Filter filter, UnicodeSet inclusions) {
3347         // Logically, walk through all Unicode characters, noting the start
3348         // and end of each range for which filter.contain(c) is
3349         // true.  Add each range to a set.
3350         //
3351         // To improve performance, use an inclusions set which
3352         // encodes information about character ranges that are known
3353         // to have identical properties.
3354         // inclusions contains the first characters of
3355         // same-value ranges for the given property.
3356 
3357         clear();
3358 
3359         int startHasProperty = -1;
3360         int limitRange = inclusions.getRangeCount();
3361 
3362         for (int j=0; j<limitRange; ++j) {
3363             // get current range
3364             int start = inclusions.getRangeStart(j);
3365             int end = inclusions.getRangeEnd(j);
3366 
3367             // for all the code points in the range, process
3368             for (int ch = start; ch <= end; ++ch) {
3369                 // only add to the unicodeset on inflection points --
3370                 // where the hasProperty value changes to false
3371                 if (filter.contains(ch)) {
3372                     if (startHasProperty < 0) {
3373                         startHasProperty = ch;
3374                     }
3375                 } else if (startHasProperty >= 0) {
3376                     add_unchecked(startHasProperty, ch-1);
3377                     startHasProperty = -1;
3378                 }
3379             }
3380         }
3381         if (startHasProperty >= 0) {
3382             add_unchecked(startHasProperty, 0x10FFFF);
3383         }
3384     }
3385 
3386     /**
3387      * Remove leading and trailing Pattern_White_Space and compress
3388      * internal Pattern_White_Space to a single space character.
3389      */
mungeCharName(String source)3390     private static String mungeCharName(String source) {
3391         source = PatternProps.trimWhiteSpace(source);
3392         StringBuilder buf = null;
3393         for (int i=0; i<source.length(); ++i) {
3394             char ch = source.charAt(i);
3395             if (PatternProps.isWhiteSpace(ch)) {
3396                 if (buf == null) {
3397                     buf = new StringBuilder().append(source, 0, i);
3398                 } else if (buf.charAt(buf.length() - 1) == ' ') {
3399                     continue;
3400                 }
3401                 ch = ' '; // convert to ' '
3402             }
3403             if (buf != null) {
3404                 buf.append(ch);
3405             }
3406         }
3407         return buf == null ? source : buf.toString();
3408     }
3409 
3410     //----------------------------------------------------------------
3411     // Property set API
3412     //----------------------------------------------------------------
3413 
3414     /**
3415      * Modifies this set to contain those code points which have the
3416      * given value for the given binary or enumerated property, as
3417      * returned by UCharacter.getIntPropertyValue.  Prior contents of
3418      * this set are lost.
3419      *
3420      * @param prop a property in the range
3421      * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or
3422      * UProperty.INT_START..UProperty.INT_LIMIT-1 or.
3423      * UProperty.MASK_START..UProperty.MASK_LIMIT-1.
3424      *
3425      * @param value a value in the range
3426      * UCharacter.getIntPropertyMinValue(prop)..
3427      * UCharacter.getIntPropertyMaxValue(prop), with one exception.
3428      * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be
3429      * a UCharacter.getType() result, but rather a mask value produced
3430      * by logically ORing (1 &lt;&lt; UCharacter.getType()) values together.
3431      * This allows grouped categories such as [:L:] to be represented.
3432      *
3433      * @return a reference to this set
3434      */
applyIntPropertyValue(int prop, int value)3435     public UnicodeSet applyIntPropertyValue(int prop, int value) {
3436         // All of the following include checkFrozen() before modifying this set.
3437         if (prop == UProperty.GENERAL_CATEGORY_MASK) {
3438             UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
3439             applyFilter(new GeneralCategoryMaskFilter(value), inclusions);
3440         } else if (prop == UProperty.SCRIPT_EXTENSIONS) {
3441             UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
3442             applyFilter(new ScriptExtensionsFilter(value), inclusions);
3443         } else if (prop == UProperty.IDENTIFIER_TYPE) {
3444             UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
3445             applyFilter(new IdentifierTypeFilter(value), inclusions);
3446         } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) {
3447             if (value == 0 || value == 1) {
3448                 set(CharacterProperties.getBinaryPropertySet(prop));
3449                 if (value == 0) {
3450                     complement().removeAllStrings();  // code point complement
3451                 }
3452             } else {
3453                 clear();
3454             }
3455         } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) {
3456             UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
3457             applyFilter(new IntPropertyFilter(prop, value), inclusions);
3458         } else {
3459             throw new IllegalArgumentException("unsupported property " + prop);
3460         }
3461         return this;
3462     }
3463 
3464 
3465 
3466     /**
3467      * Modifies this set to contain those code points which have the
3468      * given value for the given property.  Prior contents of this
3469      * set are lost.
3470      *
3471      * @param propertyAlias a property alias, either short or long.
3472      * The name is matched loosely.  See PropertyAliases.txt for names
3473      * and a description of loose matching.  If the value string is
3474      * empty, then this string is interpreted as either a
3475      * General_Category value alias, a Script value alias, a binary
3476      * property alias, or a special ID.  Special IDs are matched
3477      * loosely and correspond to the following sets:
3478      *
3479      * "ANY" = [\\u0000-\\U0010FFFF],
3480      * "ASCII" = [\\u0000-\\u007F].
3481      *
3482      * @param valueAlias a value alias, either short or long.  The
3483      * name is matched loosely.  See PropertyValueAliases.txt for
3484      * names and a description of loose matching.  In addition to
3485      * aliases listed, numeric values and canonical combining classes
3486      * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc",
3487      * "220").  The value string may also be empty.
3488      *
3489      * @return a reference to this set
3490      */
applyPropertyAlias(String propertyAlias, String valueAlias)3491     public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) {
3492         return applyPropertyAlias(propertyAlias, valueAlias, null);
3493     }
3494 
3495     /**
3496      * Modifies this set to contain those code points which have the
3497      * given value for the given property.  Prior contents of this
3498      * set are lost.
3499      * @param propertyAlias A string of the property alias.
3500      * @param valueAlias A string of the value alias.
3501      * @param symbols if not null, then symbols are first called to see if a property
3502      * is available. If true, then everything else is skipped.
3503      * @return this set
3504      */
applyPropertyAlias(String propertyAlias, String valueAlias, SymbolTable symbols)3505     public UnicodeSet applyPropertyAlias(String propertyAlias,
3506             String valueAlias, SymbolTable symbols) {
3507         checkFrozen();
3508         int p;
3509         int v;
3510         boolean invert = false;
3511 
3512         if (symbols != null
3513                 && (symbols instanceof XSymbolTable)
3514                 && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
3515             return this;
3516         }
3517 
3518         if (XSYMBOL_TABLE != null) {
3519             if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) {
3520                 return this;
3521             }
3522         }
3523 
3524         if (valueAlias.length() > 0) {
3525             p = UCharacter.getPropertyEnum(propertyAlias);
3526 
3527             // Treat gc as gcm
3528             if (p == UProperty.GENERAL_CATEGORY) {
3529                 p = UProperty.GENERAL_CATEGORY_MASK;
3530             }
3531 
3532             if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) ||
3533                     (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) ||
3534                     (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
3535                 try {
3536                     v = UCharacter.getPropertyValueEnum(p, valueAlias);
3537                 } catch (IllegalArgumentException e) {
3538                     // Handle numeric CCC
3539                     if (p == UProperty.CANONICAL_COMBINING_CLASS ||
3540                             p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
3541                             p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
3542                         v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias));
3543                         // Anything between 0 and 255 is valid even if unused.
3544                         if (v < 0 || v > 255) throw e;
3545                     } else {
3546                         throw e;
3547                     }
3548                 }
3549             }
3550 
3551             else {
3552                 switch (p) {
3553                 case UProperty.NUMERIC_VALUE:
3554                 {
3555                     double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
3556                     applyFilter(new NumericValueFilter(value),
3557                             CharacterPropertiesImpl.getInclusionsForProperty(p));
3558                     return this;
3559                 }
3560                 case UProperty.NAME:
3561                 {
3562                     // Must munge name, since
3563                     // UCharacter.charFromName() does not do
3564                     // 'loose' matching.
3565                     String buf = mungeCharName(valueAlias);
3566                     int ch = UCharacter.getCharFromExtendedName(buf);
3567                     if (ch == -1) {
3568                         throw new IllegalArgumentException("Invalid character name");
3569                     }
3570                     clear();
3571                     add_unchecked(ch);
3572                     return this;
3573                 }
3574                 case UProperty.UNICODE_1_NAME:
3575                     // ICU 49 deprecates the Unicode_1_Name property APIs.
3576                     throw new IllegalArgumentException("Unicode_1_Name (na1) not supported");
3577                 case UProperty.AGE:
3578                 {
3579                     // Must munge name, since
3580                     // VersionInfo.getInstance() does not do
3581                     // 'loose' matching.
3582                     VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
3583                     applyFilter(new VersionFilter(version),
3584                             CharacterPropertiesImpl.getInclusionsForProperty(p));
3585                     return this;
3586                 }
3587                 case UProperty.SCRIPT_EXTENSIONS:
3588                     v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias);
3589                     // fall through to calling applyIntPropertyValue()
3590                     break;
3591                 case UProperty.IDENTIFIER_TYPE:
3592                     v = UCharacter.getPropertyValueEnum(p, valueAlias);
3593                     // fall through to calling applyIntPropertyValue()
3594                     break;
3595                 default:
3596                     // p is a non-binary, non-enumerated property that we
3597                     // don't support (yet).
3598                     throw new IllegalArgumentException("Unsupported property");
3599                 }
3600             }
3601         }
3602 
3603         else {
3604             // valueAlias is empty.  Interpret as General Category, Script,
3605             // Binary property, or ANY or ASCII.  Upon success, p and v will
3606             // be set.
3607             UPropertyAliases pnames = UPropertyAliases.INSTANCE;
3608             p = UProperty.GENERAL_CATEGORY_MASK;
3609             v = pnames.getPropertyValueEnum(p, propertyAlias);
3610             if (v == UProperty.UNDEFINED) {
3611                 p = UProperty.SCRIPT;
3612                 v = pnames.getPropertyValueEnum(p, propertyAlias);
3613                 if (v == UProperty.UNDEFINED) {
3614                     p = pnames.getPropertyEnum(propertyAlias);
3615                     if (p == UProperty.UNDEFINED) {
3616                         p = -1;
3617                     }
3618                     if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) {
3619                         v = 1;
3620                     } else if (p == -1) {
3621                         if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) {
3622                             set(MIN_VALUE, MAX_VALUE);
3623                             return this;
3624                         } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) {
3625                             set(0, 0x7F);
3626                             return this;
3627                         } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) {
3628                             // [:Assigned:]=[:^Cn:]
3629                             p = UProperty.GENERAL_CATEGORY_MASK;
3630                             v = (1<<UCharacter.UNASSIGNED);
3631                             invert = true;
3632                         } else {
3633                             // Property name was never matched.
3634                             throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias);
3635                         }
3636                     } else {
3637                         // Valid property name, but it isn't binary, so the value
3638                         // must be supplied.
3639                         throw new IllegalArgumentException("Missing property value");
3640                     }
3641                 }
3642             }
3643         }
3644 
3645         applyIntPropertyValue(p, v);
3646         if(invert) {
3647             complement().removeAllStrings();  // code point complement
3648         }
3649 
3650         return this;
3651     }
3652 
3653     //----------------------------------------------------------------
3654     // Property set patterns
3655     //----------------------------------------------------------------
3656 
3657     /**
3658      * Return true if the given position, in the given pattern, appears
3659      * to be the start of a property set pattern.
3660      */
resemblesPropertyPattern(String pattern, int pos)3661     private static boolean resemblesPropertyPattern(String pattern, int pos) {
3662         // Patterns are at least 5 characters long
3663         if ((pos+5) > pattern.length()) {
3664             return false;
3665         }
3666 
3667         // Look for an opening [:, [:^, \p, or \P
3668         return pattern.regionMatches(pos, "[:", 0, 2) ||
3669                 pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3670                 pattern.regionMatches(pos, "\\N", 0, 2);
3671     }
3672 
3673     /**
3674      * Return true if the given iterator appears to point at a
3675      * property pattern.  Regardless of the result, return with the
3676      * iterator unchanged.
3677      * @param chars iterator over the pattern characters.  Upon return
3678      * it will be unchanged.
3679      * @param iterOpts RuleCharacterIterator options
3680      */
resemblesPropertyPattern(RuleCharacterIterator chars, int iterOpts)3681     private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
3682             int iterOpts) {
3683         boolean result = false;
3684         iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
3685         RuleCharacterIterator.Position pos = chars.getPos(null);
3686         int c = chars.next(iterOpts);
3687         if (c == '[' || c == '\\') {
3688             int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
3689             result = (c == '[') ? (d == ':') :
3690                 (d == 'N' || d == 'p' || d == 'P');
3691         }
3692         chars.setPos(pos);
3693         return result;
3694     }
3695 
3696     /**
3697      * Parse the given property pattern at the given parse position.
3698      * @param symbols TODO
3699      */
applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols)3700     private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
3701         int pos = ppos.getIndex();
3702 
3703         // On entry, ppos should point to one of the following locations:
3704 
3705         // Minimum length is 5 characters, e.g. \p{L}
3706         if ((pos+5) > pattern.length()) {
3707             return null;
3708         }
3709 
3710         boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
3711         boolean isName = false; // true for \N{pat}, o/w false
3712         boolean invert = false;
3713 
3714         // Look for an opening [:, [:^, \p, or \P
3715         if (pattern.regionMatches(pos, "[:", 0, 2)) {
3716             posix = true;
3717             pos = PatternProps.skipWhiteSpace(pattern, (pos+2));
3718             if (pos < pattern.length() && pattern.charAt(pos) == '^') {
3719                 ++pos;
3720                 invert = true;
3721             }
3722         } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
3723                 pattern.regionMatches(pos, "\\N", 0, 2)) {
3724             char c = pattern.charAt(pos+1);
3725             invert = (c == 'P');
3726             isName = (c == 'N');
3727             pos = PatternProps.skipWhiteSpace(pattern, (pos+2));
3728             if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
3729                 // Syntax error; "\p" or "\P" not followed by "{"
3730                 return null;
3731             }
3732         } else {
3733             // Open delimiter not seen
3734             return null;
3735         }
3736 
3737         // Look for the matching close delimiter, either :] or }
3738         int close = pattern.indexOf(posix ? ":]" : "}", pos);
3739         if (close < 0) {
3740             // Syntax error; close delimiter missing
3741             return null;
3742         }
3743 
3744         // Look for an '=' sign.  If this is present, we will parse a
3745         // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
3746         // pattern.
3747         int equals = pattern.indexOf('=', pos);
3748         String propName, valueName;
3749         if (equals >= 0 && equals < close && !isName) {
3750             // Equals seen; parse medium/long pattern
3751             propName = pattern.substring(pos, equals);
3752             valueName = pattern.substring(equals+1, close);
3753         }
3754 
3755         else {
3756             // Handle case where no '=' is seen, and \N{}
3757             propName = pattern.substring(pos, close);
3758             valueName = "";
3759 
3760             // Handle \N{name}
3761             if (isName) {
3762                 // This is a little inefficient since it means we have to
3763                 // parse "na" back to UProperty.NAME even though we already
3764                 // know it's UProperty.NAME.  If we refactor the API to
3765                 // support args of (int, String) then we can remove
3766                 // "na" and make this a little more efficient.
3767                 valueName = propName;
3768                 propName = "na";
3769             }
3770         }
3771 
3772         applyPropertyAlias(propName, valueName, symbols);
3773 
3774         if (invert) {
3775             complement().removeAllStrings();  // code point complement
3776         }
3777 
3778         // Move to the limit position after the close delimiter
3779         ppos.setIndex(close + (posix ? 2 : 1));
3780 
3781         return this;
3782     }
3783 
3784     /**
3785      * Parse a property pattern.
3786      * @param chars iterator over the pattern characters.  Upon return
3787      * it will be advanced to the first character after the parsed
3788      * pattern, or the end of the iteration if all characters are
3789      * parsed.
3790      * @param rebuiltPat the pattern that was parsed, rebuilt or
3791      * copied from the input pattern, as appropriate.
3792      * @param symbols TODO
3793      */
applyPropertyPattern(RuleCharacterIterator chars, Appendable rebuiltPat, SymbolTable symbols)3794     private void applyPropertyPattern(RuleCharacterIterator chars,
3795             Appendable rebuiltPat, SymbolTable symbols) {
3796         String patStr = chars.getCurrentBuffer();
3797         int start = chars.getCurrentBufferPos();
3798         ParsePosition pos = new ParsePosition(start);
3799         applyPropertyPattern(patStr, pos, symbols);
3800         int length = pos.getIndex() - start;
3801         if (length == 0) {
3802             syntaxError(chars, "Invalid property pattern");
3803         }
3804         chars.jumpahead(length);
3805         append(rebuiltPat, patStr.substring(start, pos.getIndex()));
3806     }
3807 
3808     //----------------------------------------------------------------
3809     // Case folding API
3810     //----------------------------------------------------------------
3811 
3812     /**
3813      * Bitmask for constructor and applyPattern() indicating that
3814      * white space should be ignored.  If set, ignore Unicode Pattern_White_Space characters,
3815      * unless they are quoted or escaped.  This may be ORed together
3816      * with other selectors.
3817      */
3818     public static final int IGNORE_SPACE = 1;
3819 
3820     /**
3821      * Alias for {@link #CASE_INSENSITIVE}.
3822      *
3823      * @deprecated ICU 73 Use {@link #CASE_INSENSITIVE} instead.
3824      */
3825     @Deprecated
3826     public static final int CASE = 2;
3827 
3828     /**
3829      * Enable case insensitive matching.  E.g., "[ab]" with this flag
3830      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
3831      * match all except 'a', 'A', 'b', and 'B'. This performs a full
3832      * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
3833      *
3834      * <p>This value is an options bit set value for some
3835      * constructors, applyPattern(), and closeOver().
3836      * It can be ORed together with other, unrelated options.
3837      *
3838      * <p>The resulting set is a superset of the input for the code points but
3839      * not for the strings.
3840      * It performs a case mapping closure of the code points and adds
3841      * full case folding strings for the code points, and reduces strings of
3842      * the original set to their full case folding equivalents.
3843      *
3844      * <p>This is designed for case-insensitive matches, for example
3845      * in regular expressions. The full code point case closure allows checking of
3846      * an input character directly against the closure set.
3847      * Strings are matched by comparing the case-folded form from the closure
3848      * set with an incremental case folding of the string in question.
3849      *
3850      * <p>The closure set will also contain single code points if the original
3851      * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
3852      * This is not necessary (that is, redundant) for the above matching method
3853      * but results in the same closure sets regardless of whether the original
3854      * set contained the code point or a string.
3855      */
3856     public static final int CASE_INSENSITIVE = 2;
3857 
3858     /**
3859      * Adds all case mappings for each element in the set.
3860      * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
3861      * of each existing element in the set.
3862      *
3863      * <p>This value is an options bit set value for some
3864      * constructors, applyPattern(), and closeOver().
3865      * It can be ORed together with other, unrelated options.
3866      *
3867      * <p>Unlike the “case insensitive” options, this does not perform a closure.
3868      * For example, it does not add 'ſ' (U+017F long s) for 's',
3869      * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
3870      */
3871     public static final int ADD_CASE_MAPPINGS = 4;
3872 
3873     /**
3874      * Enable case insensitive matching.
3875      * Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings,
3876      * which map each code point to one code point,
3877      * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
3878      *
3879      * <p>This is designed for case-insensitive matches, for example in certain
3880      * regular expression implementations where only Simple_Case_Folding mappings are used,
3881      * such as in ECMAScript (JavaScript) regular expressions.
3882      *
3883      * <p>This value is an options bit set value for some
3884      * constructors, applyPattern(), and closeOver().
3885      * It can be ORed together with other, unrelated options.
3886      */
3887     @android.annotation.FlaggedApi(com.android.icu.Flags.FLAG_ICU_25Q2_API)
3888     public static final int SIMPLE_CASE_INSENSITIVE = 6;
3889 
3890     private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS;
3891 
3892     //  add the result of a full case mapping to the set
3893     //  use str as a temporary string to avoid constructing one
addCaseMapping(UnicodeSet set, int result, StringBuilder full)3894     private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
3895         if(result >= 0) {
3896             if(result > UCaseProps.MAX_STRING_LENGTH) {
3897                 // add a single-code point case mapping
3898                 set.add(result);
3899             } else {
3900                 // add a string case mapping from full with length result
3901                 set.add(full.toString());
3902                 full.setLength(0);
3903             }
3904         }
3905         // result < 0: the code point mapped to itself, no need to add it
3906         // see UCaseProps
3907     }
3908 
3909     /** For case closure on a large set, look only at code points with relevant properties. */
maybeOnlyCaseSensitive(UnicodeSet src)3910     UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) {
3911         if (src.size() < 30) {
3912             return src;
3913         }
3914         // Return the intersection of the src code points with Case_Sensitive ones.
3915         UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE);
3916         // Start by cloning the "smaller" set. Try not to copy the strings, if there are any in src.
3917         if (src.hasStrings() || src.getRangeCount() > sensitive.getRangeCount()) {
3918             return sensitive.cloneAsThawed().retainAll(src);
3919         } else {
3920             return ((UnicodeSet) src.clone()).retainAll(sensitive);
3921         }
3922     }
3923 
3924     // Per-character scf = Simple_Case_Folding of a string.
3925     // (Normally when we case-fold a string we use full case foldings.)
scfString(CharSequence s, StringBuilder scf)3926     private static final boolean scfString(CharSequence s, StringBuilder scf) {
3927         int length = s.length();
3928         // Loop while not needing modification.
3929         for (int i = 0; i < length;) {
3930             int c = Character.codePointAt(s, i);
3931             int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT);
3932             if (scfChar != c) {
3933                 // Copy the characters before c.
3934                 scf.setLength(0);
3935                 scf.append(s, 0, i);
3936                 // Loop over the rest of the string and keep case-folding.
3937                 for (;;) {
3938                     scf.appendCodePoint(scfChar);
3939                     i += Character.charCount(c);
3940                     if (i == length) {
3941                         return true;
3942                     }
3943                     c = Character.codePointAt(s, i);
3944                     scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT);
3945                 }
3946             }
3947             i += Character.charCount(c);
3948         }
3949         return false;
3950     }
3951 
3952     /**
3953      * Close this set over the given attribute.  For the attribute
3954      * {@link #CASE_INSENSITIVE}, the result is to modify this set so that:
3955      *
3956      * <ol>
3957      * <li>For each character or string 'a' in this set, all strings
3958      * 'b' such that foldCase(a) == foldCase(b) are added to this set.
3959      * (For most 'a' that are single characters, 'b' will have
3960      * b.length() == 1.)
3961      *
3962      * <li>For each string 'e' in the resulting set, if e !=
3963      * foldCase(e), 'e' will be removed.
3964      * </ol>
3965      *
3966      * <p>Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
3967      *
3968      * <p>(Here foldCase(x) refers to the operation
3969      * UCharacter.foldCase(x, true), and a == b actually denotes
3970      * a.equals(b), not pointer comparison.)
3971      *
3972      * @param attribute bitmask for attributes to close over.
3973      * Valid options:
3974      * At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS},
3975      * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive.
3976      * Unrelated options bits are ignored.
3977      * @return a reference to this set.
3978      */
closeOver(int attribute)3979     public UnicodeSet closeOver(int attribute) {
3980         checkFrozen();
3981         switch (attribute & CASE_MASK) {
3982         case 0:
3983             break;
3984         case CASE_INSENSITIVE:
3985             closeOverCaseInsensitive(/* simple= */ false);
3986             break;
3987         case ADD_CASE_MAPPINGS:
3988             closeOverAddCaseMappings();
3989             break;
3990         case SIMPLE_CASE_INSENSITIVE:
3991             closeOverCaseInsensitive(/* simple= */ true);
3992             break;
3993         default:
3994             // bad option (unreachable)
3995             break;
3996         }
3997         return this;
3998     }
3999 
closeOverCaseInsensitive(boolean simple)4000     private void closeOverCaseInsensitive(boolean simple) {
4001         UCaseProps csp = UCaseProps.INSTANCE;
4002         // Start with input set to guarantee inclusion.
4003         UnicodeSet foldSet = new UnicodeSet(this);
4004 
4005         // Full case mappings closure:
4006         // Remove strings because the strings will actually be reduced (folded);
4007         // therefore, start with no strings and add only those needed.
4008         // Do this before processing code points, because they may add strings.
4009         if (!simple && foldSet.hasStrings()) {
4010             foldSet.strings.clear();
4011         }
4012 
4013         UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
4014 
4015         // Iterate over the ranges of single code points. Nested loop for each code point.
4016         int n = codePoints.getRangeCount();
4017         for (int i=0; i<n; ++i) {
4018             int start = codePoints.getRangeStart(i);
4019             int end   = codePoints.getRangeEnd(i);
4020 
4021             if (simple) {
4022                 for (int cp=start; cp<=end; ++cp) {
4023                     csp.addSimpleCaseClosure(cp, foldSet);
4024                 }
4025             } else {
4026                 for (int cp=start; cp<=end; ++cp) {
4027                     csp.addCaseClosure(cp, foldSet);
4028                 }
4029             }
4030         }
4031         if (hasStrings()) {
4032             StringBuilder sb = simple ? new StringBuilder() : null;
4033             for (String s : strings) {
4034                 if (simple) {
4035                     if (scfString(s, sb)) {
4036                         foldSet.remove(s).add(sb);
4037                     }
4038                 } else {
4039                     String str = UCharacter.foldCase(s, 0);
4040                     if(!csp.addStringCaseClosure(str, foldSet)) {
4041                         foldSet.add(str); // does not map to code points: add the folded string itself
4042                     }
4043                 }
4044             }
4045         }
4046         set(foldSet);
4047     }
4048 
closeOverAddCaseMappings()4049     private void closeOverAddCaseMappings() {
4050         UCaseProps csp = UCaseProps.INSTANCE;
4051         // Start with input set to guarantee inclusion.
4052         UnicodeSet foldSet = new UnicodeSet(this);
4053 
4054         UnicodeSet codePoints = maybeOnlyCaseSensitive(this);
4055 
4056         // Iterate over the ranges of single code points. Nested loop for each code point.
4057         int n = codePoints.getRangeCount();
4058         int result;
4059         StringBuilder full = new StringBuilder();
4060 
4061         for (int i=0; i<n; ++i) {
4062             int start = codePoints.getRangeStart(i);
4063             int end   = codePoints.getRangeEnd(i);
4064 
4065             // add case mappings
4066             // (does not add long s for regular s, or Kelvin for k, for example)
4067             for (int cp=start; cp<=end; ++cp) {
4068                 result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
4069                 addCaseMapping(foldSet, result, full);
4070 
4071                 result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
4072                 addCaseMapping(foldSet, result, full);
4073 
4074                 result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
4075                 addCaseMapping(foldSet, result, full);
4076 
4077                 result = csp.toFullFolding(cp, full, 0);
4078                 addCaseMapping(foldSet, result, full);
4079             }
4080         }
4081         if (hasStrings()) {
4082             ULocale root = ULocale.ROOT;
4083             BreakIterator bi = BreakIterator.getWordInstance(root);
4084             for (String str : strings) {
4085                 // TODO: call lower-level functions
4086                 foldSet.add(UCharacter.toLowerCase(root, str));
4087                 foldSet.add(UCharacter.toTitleCase(root, str, bi));
4088                 foldSet.add(UCharacter.toUpperCase(root, str));
4089                 foldSet.add(UCharacter.foldCase(str, 0));
4090             }
4091         }
4092         set(foldSet);
4093     }
4094 
4095     /**
4096      * Internal class for customizing UnicodeSet parsing of properties.
4097      * TODO: extend to allow customizing of codepoint ranges
4098      * @author medavis
4099      * @hide Only a subset of ICU is exposed in Android
4100      * @hide draft / provisional / internal are hidden on Android
4101      */
4102     abstract public static class XSymbolTable implements SymbolTable {
4103         /**
4104          * Default constructor
4105          * @hide draft / provisional / internal are hidden on Android
4106          */
XSymbolTable()4107         public XSymbolTable(){}
4108         /**
4109          * Supplies default implementation for SymbolTable (no action).
4110          * @hide draft / provisional / internal are hidden on Android
4111          */
4112         @Override
lookupMatcher(int i)4113         public UnicodeMatcher lookupMatcher(int i) {
4114             return null;
4115         }
4116 
4117         /**
4118          * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style
4119          * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be.
4120          * <p>
4121          * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a
4122          * propertyName+propertyValue combination.
4123          *
4124          * @param propertyName
4125          *            the name of the property
4126          * @param propertyValue
4127          *            the name of the property value
4128          * @param result UnicodeSet value to change
4129          *            a set to which the characters having the propertyName+propertyValue are to be added.
4130          * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters
4131          *         with that property have been added to the UnicodeSet, and returns false if the
4132          *         propertyName+propertyValue combination is not recognized (in which case result is unaltered).
4133          * @hide draft / provisional / internal are hidden on Android
4134          */
applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result)4135         public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) {
4136             return false;
4137         }
4138         /**
4139          * Supplies default implementation for SymbolTable (no action).
4140          * @hide draft / provisional / internal are hidden on Android
4141          */
4142         @Override
lookup(String s)4143         public char[] lookup(String s) {
4144             return null;
4145         }
4146         /**
4147          * Supplies default implementation for SymbolTable (no action).
4148          * @hide draft / provisional / internal are hidden on Android
4149          */
4150         @Override
parseReference(String text, ParsePosition pos, int limit)4151         public String parseReference(String text, ParsePosition pos, int limit) {
4152             return null;
4153         }
4154     }
4155 
4156     /**
4157      * Is this frozen, according to the Freezable interface?
4158      *
4159      * @return value
4160      */
4161     @Override
isFrozen()4162     public boolean isFrozen() {
4163         return (bmpSet != null || stringSpan != null);
4164     }
4165 
4166     /**
4167      * Freeze this class, according to the Freezable interface.
4168      *
4169      * @return this
4170      */
4171     @Override
freeze()4172     public UnicodeSet freeze() {
4173         if (!isFrozen()) {
4174             compact();
4175 
4176             // Optimize contains() and span() and similar functions.
4177             if (hasStrings()) {
4178                 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), UnicodeSetStringSpan.ALL);
4179             }
4180             if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
4181                 // Optimize for code point spans.
4182                 // There are no strings, or
4183                 // all strings are irrelevant for span() etc. because
4184                 // all of each string's code points are contained in this set.
4185                 // However, fully contained strings are relevant for spanAndCount(),
4186                 // so we create both objects.
4187                 bmpSet = new BMPSet(list, len);
4188             }
4189         }
4190         return this;
4191     }
4192 
4193     /**
4194      * Span a string using this UnicodeSet.
4195      * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4196      * @param s The string to be spanned
4197      * @param spanCondition The span condition
4198      * @return the length of the span
4199      */
span(CharSequence s, SpanCondition spanCondition)4200     public int span(CharSequence s, SpanCondition spanCondition) {
4201         return span(s, 0, spanCondition);
4202     }
4203 
4204     /**
4205      * Span a string using this UnicodeSet.
4206      *   If the start index is less than 0, span will start from 0.
4207      *   If the start index is greater than the string length, span returns the string length.
4208      * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4209      * @param s The string to be spanned
4210      * @param start The start index that the span begins
4211      * @param spanCondition The span condition
4212      * @return the string index which ends the span (i.e. exclusive)
4213      */
span(CharSequence s, int start, SpanCondition spanCondition)4214     public int span(CharSequence s, int start, SpanCondition spanCondition) {
4215         int end = s.length();
4216         if (start < 0) {
4217             start = 0;
4218         } else if (start >= end) {
4219             return end;
4220         }
4221         if (bmpSet != null) {
4222             // Frozen set without strings, or no string is relevant for span().
4223             return bmpSet.span(s, start, spanCondition, null);
4224         }
4225         if (stringSpan != null) {
4226             return stringSpan.span(s, start, spanCondition);
4227         } else if (hasStrings()) {
4228             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
4229                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
4230             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which);
4231             if (strSpan.needsStringSpanUTF16()) {
4232                 return strSpan.span(s, start, spanCondition);
4233             }
4234         }
4235 
4236         return spanCodePointsAndCount(s, start, spanCondition, null);
4237     }
4238 
4239     /**
4240      * Same as span() but also counts the smallest number of set elements on any path across the span.
4241      * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4242      * @param outCount An output-only object (must not be null) for returning the count.
4243      * @return the limit (exclusive end) of the span
4244      * @deprecated This API is ICU internal only.
4245      * @hide original deprecated declaration
4246      * @hide draft / provisional / internal are hidden on Android
4247      */
4248     @Deprecated
spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4249     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
4250         if (outCount == null) {
4251             throw new IllegalArgumentException("outCount must not be null");
4252         }
4253         int end = s.length();
4254         if (start < 0) {
4255             start = 0;
4256         } else if (start >= end) {
4257             return end;
4258         }
4259         if (stringSpan != null) {
4260             // We might also have bmpSet != null,
4261             // but fully-contained strings are relevant for counting elements.
4262             return stringSpan.spanAndCount(s, start, spanCondition, outCount);
4263         } else if (bmpSet != null) {
4264             return bmpSet.span(s, start, spanCondition, outCount);
4265         } else if (hasStrings()) {
4266             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
4267                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
4268             which |= UnicodeSetStringSpan.WITH_COUNT;
4269             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which);
4270             return strSpan.spanAndCount(s, start, spanCondition, outCount);
4271         }
4272 
4273         return spanCodePointsAndCount(s, start, spanCondition, outCount);
4274     }
4275 
spanCodePointsAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4276     private int spanCodePointsAndCount(CharSequence s, int start,
4277             SpanCondition spanCondition, OutputInt outCount) {
4278         // Pin to 0/1 values.
4279         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
4280 
4281         int c;
4282         int next = start;
4283         int length = s.length();
4284         int count = 0;
4285         do {
4286             c = Character.codePointAt(s, next);
4287             if (spanContained != contains(c)) {
4288                 break;
4289             }
4290             ++count;
4291             next += Character.charCount(c);
4292         } while (next < length);
4293         if (outCount != null) { outCount.value = count; }
4294         return next;
4295     }
4296 
4297     /**
4298      * Span a string backwards (from the end) using this UnicodeSet.
4299      * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4300      * @param s The string to be spanned
4301      * @param spanCondition The span condition
4302      * @return The string index which starts the span (i.e. inclusive).
4303      */
spanBack(CharSequence s, SpanCondition spanCondition)4304     public int spanBack(CharSequence s, SpanCondition spanCondition) {
4305         return spanBack(s, s.length(), spanCondition);
4306     }
4307 
4308     /**
4309      * Span a string backwards (from the fromIndex) using this UnicodeSet.
4310      * If the fromIndex is less than 0, spanBack will return 0.
4311      * If fromIndex is greater than the string length, spanBack will start from the string length.
4312      * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
4313      * @param s The string to be spanned
4314      * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards
4315      * @param spanCondition The span condition
4316      * @return The string index which starts the span (i.e. inclusive).
4317      */
spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition)4318     public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
4319         if (fromIndex <= 0) {
4320             return 0;
4321         }
4322         if (fromIndex > s.length()) {
4323             fromIndex = s.length();
4324         }
4325         if (bmpSet != null) {
4326             // Frozen set without strings, or no string is relevant for spanBack().
4327             return bmpSet.spanBack(s, fromIndex, spanCondition);
4328         }
4329         if (stringSpan != null) {
4330             return stringSpan.spanBack(s, fromIndex, spanCondition);
4331         } else if (hasStrings()) {
4332             int which = (spanCondition == SpanCondition.NOT_CONTAINED)
4333                     ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
4334                             : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
4335             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which);
4336             if (strSpan.needsStringSpanUTF16()) {
4337                 return strSpan.spanBack(s, fromIndex, spanCondition);
4338             }
4339         }
4340 
4341         // Pin to 0/1 values.
4342         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
4343 
4344         int c;
4345         int prev = fromIndex;
4346         do {
4347             c = Character.codePointBefore(s, prev);
4348             if (spanContained != contains(c)) {
4349                 break;
4350             }
4351             prev -= Character.charCount(c);
4352         } while (prev > 0);
4353         return prev;
4354     }
4355 
4356     /**
4357      * Clone a thawed version of this class, according to the Freezable interface.
4358      * @return the clone, not frozen
4359      */
4360     @Override
cloneAsThawed()4361     public UnicodeSet cloneAsThawed() {
4362         UnicodeSet result = new UnicodeSet(this);
4363         assert !result.isFrozen();
4364         return result;
4365     }
4366 
4367     // internal function
checkFrozen()4368     private void checkFrozen() {
4369         if (isFrozen()) {
4370             throw new UnsupportedOperationException("Attempt to modify frozen object");
4371         }
4372     }
4373 
4374     // ************************
4375     // Additional methods for integration with Generics and Collections
4376     // ************************
4377 
4378     /**
4379      * A struct-like class used for iteration through ranges, for faster iteration than by String.
4380      * Read about the restrictions on usage in {@link UnicodeSet#ranges()}.
4381      */
4382     public static class EntryRange {
4383         /**
4384          * The starting code point of the range.
4385          */
4386         public int codepoint;
4387         /**
4388          * The ending code point of the range
4389          */
4390         public int codepointEnd;
4391 
EntryRange()4392         EntryRange() {
4393         }
4394 
4395         /**
4396          * {@inheritDoc}
4397          */
4398         @Override
toString()4399         public String toString() {
4400             StringBuilder b = new StringBuilder();
4401             return (
4402                     codepoint == codepointEnd ? _appendToPat(b, codepoint, false)
4403                             : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false))
4404                             .toString();
4405         }
4406     }
4407 
4408     /**
4409      * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points.
4410      * The UnicodeSet must not be altered during the iteration.
4411      * The EntryRange instance is the same each time; the contents are just reset.
4412      *
4413      * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings.
4414      *
4415      * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
4416      * Do not alter the UnicodeSet while iterating.
4417      *
4418      * <pre>
4419      * // Sample code
4420      * for (EntryRange range : us1.ranges()) {
4421      *     // do something with code points between range.codepoint and range.codepointEnd;
4422      * }
4423      * for (String s : us1.strings()) {
4424      *     // do something with each string;
4425      * }
4426      * </pre>
4427      */
ranges()4428     public Iterable<EntryRange> ranges() {
4429         return new EntryRangeIterable();
4430     }
4431 
4432     private class EntryRangeIterable implements Iterable<EntryRange> {
4433         @Override
iterator()4434         public Iterator<EntryRange> iterator() {
4435             return new EntryRangeIterator();
4436         }
4437     }
4438 
4439     private class EntryRangeIterator implements Iterator<EntryRange> {
4440         int pos;
4441         EntryRange result = new EntryRange();
4442 
4443         @Override
hasNext()4444         public boolean hasNext() {
4445             return pos < len-1;
4446         }
4447         @Override
next()4448         public EntryRange next() {
4449             if (pos < len-1) {
4450                 result.codepoint = list[pos++];
4451                 result.codepointEnd = list[pos++]-1;
4452             } else {
4453                 throw new NoSuchElementException();
4454             }
4455             return result;
4456         }
4457         @Override
remove()4458         public void remove() {
4459             throw new UnsupportedOperationException();
4460         }
4461     }
4462 
4463 
4464     /**
4465      * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}.
4466      * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
4467      * Do not alter the UnicodeSet while iterating.
4468      * @see java.util.Set#iterator()
4469      */
4470     @Override
iterator()4471     public Iterator<String> iterator() {
4472         return new UnicodeSetIterator2(this);
4473     }
4474 
4475     // Cover for string iteration.
4476     private static class UnicodeSetIterator2 implements Iterator<String> {
4477         // Invariants:
4478         // sourceList != null then sourceList[item] is a valid character
4479         // sourceList == null then delegates to stringIterator
4480         private int[] sourceList;
4481         private int len;
4482         private int item;
4483         private int current;
4484         private int limit;
4485         private SortedSet<String> sourceStrings;
4486         private Iterator<String> stringIterator;
4487         private char[] buffer;
4488 
UnicodeSetIterator2(UnicodeSet source)4489         UnicodeSetIterator2(UnicodeSet source) {
4490             // set according to invariants
4491             len = source.len - 1;
4492             if (len > 0) {
4493                 sourceStrings = source.strings;
4494                 sourceList = source.list;
4495                 current = sourceList[item++];
4496                 limit = sourceList[item++];
4497             } else {
4498                 stringIterator = source.strings.iterator();
4499                 sourceList = null;
4500             }
4501         }
4502 
4503         /* (non-Javadoc)
4504          * @see java.util.Iterator#hasNext()
4505          */
4506         @Override
hasNext()4507         public boolean hasNext() {
4508             return sourceList != null || stringIterator.hasNext();
4509         }
4510 
4511         /* (non-Javadoc)
4512          * @see java.util.Iterator#next()
4513          */
4514         @Override
next()4515         public String next() {
4516             if (sourceList == null) {
4517                 return stringIterator.next();
4518             }
4519             int codepoint = current++;
4520             // we have the codepoint we need, but we may need to adjust the state
4521             if (current >= limit) {
4522                 if (item >= len) {
4523                     stringIterator = sourceStrings.iterator();
4524                     sourceList = null;
4525                 } else {
4526                     current = sourceList[item++];
4527                     limit = sourceList[item++];
4528                 }
4529             }
4530             // Now return. Single code point is easy
4531             if (codepoint <= 0xFFFF) {
4532                 return String.valueOf((char)codepoint);
4533             }
4534             // But Java lacks a valueOfCodePoint, so we handle ourselves for speed
4535             // allocate a buffer the first time, to make conversion faster.
4536             if (buffer == null) {
4537                 buffer = new char[2];
4538             }
4539             // compute ourselves, to save tests and calls
4540             int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT;
4541             buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE);
4542             buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE);
4543             return String.valueOf(buffer);
4544         }
4545 
4546         /* (non-Javadoc)
4547          * @see java.util.Iterator#remove()
4548          */
4549         @Override
remove()4550         public void remove() {
4551             throw new UnsupportedOperationException();
4552         }
4553     }
4554 
4555     /**
4556      * @see #containsAll(android.icu.text.UnicodeSet)
4557      */
containsAll(Iterable<T> collection)4558     public <T extends CharSequence> boolean containsAll(Iterable<T> collection) {
4559         for (T o : collection) {
4560             if (!contains(o)) {
4561                 return false;
4562             }
4563         }
4564         return true;
4565     }
4566 
4567     /**
4568      * @see #containsNone(android.icu.text.UnicodeSet)
4569      */
containsNone(Iterable<T> collection)4570     public <T extends CharSequence> boolean containsNone(Iterable<T> collection) {
4571         for (T o : collection) {
4572             if (contains(o)) {
4573                 return false;
4574             }
4575         }
4576         return true;
4577     }
4578 
4579     /**
4580      * @see #containsAll(android.icu.text.UnicodeSet)
4581      */
containsSome(Iterable<T> collection)4582     public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) {
4583         return !containsNone(collection);
4584     }
4585 
4586     /**
4587      * @see #addAll(android.icu.text.UnicodeSet)
4588      */
4589     @SuppressWarnings("unchecked")  // See ticket #11395, this is safe.
addAll(T... collection)4590     public <T extends CharSequence> UnicodeSet addAll(T... collection) {
4591         checkFrozen();
4592         for (T str : collection) {
4593             add(str);
4594         }
4595         return this;
4596     }
4597 
4598 
4599     /**
4600      * @see #removeAll(android.icu.text.UnicodeSet)
4601      */
removeAll(Iterable<T> collection)4602     public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) {
4603         checkFrozen();
4604         for (T o : collection) {
4605             remove(o);
4606         }
4607         return this;
4608     }
4609 
4610     /**
4611      * @see #retainAll(android.icu.text.UnicodeSet)
4612      */
retainAll(Iterable<T> collection)4613     public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) {
4614         checkFrozen();
4615         // TODO optimize
4616         UnicodeSet toRetain = new UnicodeSet();
4617         toRetain.addAll(collection);
4618         retainAll(toRetain);
4619         return this;
4620     }
4621 
4622     /**
4623      * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}.
4624      */
4625     public enum ComparisonStyle {
4626         /**
4627          */
4628         SHORTER_FIRST,
4629         /**
4630          */
4631         LEXICOGRAPHIC,
4632         /**
4633          */
4634         LONGER_FIRST
4635     }
4636 
4637     /**
4638      * Compares UnicodeSets, where shorter come first, and otherwise lexicographically
4639      * (according to the comparison of the first characters that differ).
4640      * @see java.lang.Comparable#compareTo(java.lang.Object)
4641      */
4642     @Override
compareTo(UnicodeSet o)4643     public int compareTo(UnicodeSet o) {
4644         return compareTo(o, ComparisonStyle.SHORTER_FIRST);
4645     }
4646     /**
4647      * Compares UnicodeSets, in three different ways.
4648      * @see java.lang.Comparable#compareTo(java.lang.Object)
4649      */
compareTo(UnicodeSet o, ComparisonStyle style)4650     public int compareTo(UnicodeSet o, ComparisonStyle style) {
4651         if (style != ComparisonStyle.LEXICOGRAPHIC) {
4652             int diff = size() - o.size();
4653             if (diff != 0) {
4654                 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
4655             }
4656         }
4657         int result;
4658         for (int i = 0; ; ++i) {
4659             if (0 != (result = list[i] - o.list[i])) {
4660                 // if either list ran out, compare to the last string
4661                 if (list[i] == HIGH) {
4662                     if (!hasStrings()) return 1;
4663                     String item = strings.first();
4664                     return compare(item, o.list[i]);
4665                 }
4666                 if (o.list[i] == HIGH) {
4667                     if (!o.hasStrings()) return -1;
4668                     String item = o.strings.first();
4669                     int compareResult = compare(item, list[i]);
4670                     return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order.
4671                 }
4672                 // otherwise return the result if even index, or the reversal if not
4673                 return (i & 1) == 0 ? result : -result;
4674             }
4675             if (list[i] == HIGH) {
4676                 break;
4677             }
4678         }
4679         return compare(strings, o.strings);
4680     }
4681 
4682     /**
4683      */
compareTo(Iterable<String> other)4684     public int compareTo(Iterable<String> other) {
4685         return compare(this, other);
4686     }
4687 
4688     /**
4689      * Utility to compare a string to a code point.
4690      * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString())
4691      * and comparing, but much faster (no object creation).
4692      * Actually, there is one difference; a null compares as less.
4693      * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order.
4694      * @hide unsupported on Android
4695      */
4696 
compare(CharSequence string, int codePoint)4697     public static int compare(CharSequence string, int codePoint) {
4698         return CharSequences.compare(string, codePoint);
4699     }
4700 
4701     /**
4702      * Utility to compare a string to a code point.
4703      * Same results as turning the code point into a string and comparing, but much faster (no object creation).
4704      * Actually, there is one difference; a null compares as less.
4705      * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order.
4706      * @hide unsupported on Android
4707      */
compare(int codePoint, CharSequence string)4708     public static int compare(int codePoint, CharSequence string) {
4709         return -CharSequences.compare(string, codePoint);
4710     }
4711 
4712 
4713     /**
4714      * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered,
4715      * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
4716      * That means that sets can't be compared directly with this method, unless they are TreeSets without
4717      * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
4718      * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
4719      * @hide unsupported on Android
4720      */
compare(Iterable<T> collection1, Iterable<T> collection2)4721     public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) {
4722         return compare(collection1.iterator(), collection2.iterator());
4723     }
4724 
4725     /**
4726      * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered,
4727      * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
4728      * That means that sets can't be compared directly with this method, unless they are TreeSets without
4729      * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
4730      * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
4731      * @deprecated This API is ICU internal only.
4732      * @hide original deprecated declaration
4733      * @hide draft / provisional / internal are hidden on Android
4734      */
4735     @Deprecated
compare(Iterator<T> first, Iterator<T> other)4736     public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) {
4737         while (true) {
4738             if (!first.hasNext()) {
4739                 return other.hasNext() ? -1 : 0;
4740             } else if (!other.hasNext()) {
4741                 return 1;
4742             }
4743             T item1 = first.next();
4744             T item2 = other.next();
4745             int result = item1.compareTo(item2);
4746             if (result != 0) {
4747                 return result;
4748             }
4749         }
4750     }
4751 
4752 
4753     /**
4754      * Utility to compare two collections, optionally by size, and then lexicographically.
4755      * @hide unsupported on Android
4756      */
compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style)4757     public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) {
4758         if (style != ComparisonStyle.LEXICOGRAPHIC) {
4759             int diff = collection1.size() - collection2.size();
4760             if (diff != 0) {
4761                 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
4762             }
4763         }
4764         return compare(collection1, collection2);
4765     }
4766 
4767     /**
4768      * Utility for adding the contents of an iterable to a collection.
4769      * @hide unsupported on Android
4770      */
addAllTo(Iterable<T> source, U target)4771     public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) {
4772         for (T item : source) {
4773             target.add(item);
4774         }
4775         return target;
4776     }
4777 
4778     /**
4779      * Utility for adding the contents of an iterable to a collection.
4780      * @hide unsupported on Android
4781      */
addAllTo(Iterable<T> source, T[] target)4782     public static <T> T[] addAllTo(Iterable<T> source, T[] target) {
4783         int i = 0;
4784         for (T item : source) {
4785             target[i++] = item;
4786         }
4787         return target;
4788     }
4789 
4790     /**
4791      * For iterating through the strings in the set. Example:
4792      * <pre>
4793      * for (String key : myUnicodeSet.strings()) {
4794      *   doSomethingWith(key);
4795      * }
4796      * </pre>
4797      */
strings()4798     public Collection<String> strings() {
4799         if (hasStrings()) {
4800             return Collections.unmodifiableSortedSet(strings);
4801         } else {
4802             return EMPTY_STRINGS;
4803         }
4804     }
4805 
4806     /**
4807      * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE.
4808      * @deprecated This API is ICU internal only.
4809      * @hide original deprecated declaration
4810      * @hide draft / provisional / internal are hidden on Android
4811      */
4812     @Deprecated
getSingleCodePoint(CharSequence s)4813     public static int getSingleCodePoint(CharSequence s) {
4814         return CharSequences.getSingleCodePoint(s);
4815     }
4816 
4817     /**
4818      * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set.
4819      * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E
4820      * if the dontCare set includes unassigned characters (for a particular version of Unicode).
4821      * @param dontCare Set with the don't-care characters for spanning
4822      * @return the input set, modified
4823      * @deprecated This API is ICU internal only.
4824      * @hide original deprecated declaration
4825      * @hide draft / provisional / internal are hidden on Android
4826      */
4827     @Deprecated
addBridges(UnicodeSet dontCare)4828     public UnicodeSet addBridges(UnicodeSet dontCare) {
4829         UnicodeSet notInInput = new UnicodeSet(this).complement().removeAllStrings();
4830         for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) {
4831             if (it.codepoint != 0 && it.codepointEnd != 0x10FFFF &&
4832                     dontCare.contains(it.codepoint, it.codepointEnd)) {
4833                 add(it.codepoint,it.codepointEnd);
4834             }
4835         }
4836         return this;
4837     }
4838 
4839     /**
4840      * Find the first index at or after fromIndex where the UnicodeSet matches at that index.
4841      * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match.
4842      * If there is no match, length is returned.
4843      * @deprecated This API is ICU internal only. Use span instead.
4844      * @hide original deprecated declaration
4845      * @hide draft / provisional / internal are hidden on Android
4846      */
4847     @Deprecated
findIn(CharSequence value, int fromIndex, boolean findNot)4848     public int findIn(CharSequence value, int fromIndex, boolean findNot) {
4849         //TODO add strings, optimize, using ICU4C algorithms
4850         int cp;
4851         for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) {
4852             cp = UTF16.charAt(value, fromIndex);
4853             if (contains(cp) != findNot) {
4854                 break;
4855             }
4856         }
4857         return fromIndex;
4858     }
4859 
4860     /**
4861      * Find the last index before fromIndex where the UnicodeSet matches at that index.
4862      * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match.
4863      * If there is no match, -1 is returned.
4864      * BEFORE index is not in the UnicodeSet.
4865      * @deprecated This API is ICU internal only. Use spanBack instead.
4866      * @hide original deprecated declaration
4867      * @hide draft / provisional / internal are hidden on Android
4868      */
4869     @Deprecated
findLastIn(CharSequence value, int fromIndex, boolean findNot)4870     public int findLastIn(CharSequence value, int fromIndex, boolean findNot) {
4871         //TODO add strings, optimize, using ICU4C algorithms
4872         int cp;
4873         fromIndex -= 1;
4874         for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) {
4875             cp = UTF16.charAt(value, fromIndex);
4876             if (contains(cp) != findNot) {
4877                 break;
4878             }
4879         }
4880         return fromIndex < 0 ? -1 : fromIndex;
4881     }
4882 
4883     /**
4884      * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match.
4885      * @param source The source of the CharSequence to strip from.
4886      * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object.
4887      * @return The string after it has been stripped.
4888      * @deprecated This API is ICU internal only. Use replaceFrom.
4889      * @hide original deprecated declaration
4890      * @hide draft / provisional / internal are hidden on Android
4891      */
4892     @Deprecated
stripFrom(CharSequence source, boolean matches)4893     public String stripFrom(CharSequence source, boolean matches) {
4894         StringBuilder result = new StringBuilder();
4895         for (int pos = 0; pos < source.length();) {
4896             int inside = findIn(source, pos, !matches);
4897             result.append(source.subSequence(pos, inside));
4898             pos = findIn(source, inside, matches); // get next start
4899         }
4900         return result.toString();
4901     }
4902 
4903     /**
4904      * Argument values for whether span() and similar functions continue while the current character is contained vs.
4905      * not contained in the set.
4906      * <p>
4907      * The functionality is straightforward for sets with only single code points, without strings (which is the common
4908      * case):
4909      * <ul>
4910      * <li>CONTAINED and SIMPLE work the same.
4911      * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
4912      * <li>span() and spanBack() partition any string the
4913      * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
4914      * <li>Using a
4915      * complemented (inverted) set and the opposite span conditions yields the same results.
4916      * </ul>
4917      * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
4918      * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
4919      * strings:
4920      * <ul>
4921      * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
4922      * Therefore, complementing both the set and the span conditions may yield different results.
4923      * <li>When starting spans
4924      * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
4925      * because a set string may start before the later position.
4926      * <li>span(SIMPLE) may be shorter than
4927      * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
4928      * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
4929      * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
4930      * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
4931      * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
4932      * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
4933      * </ul>
4934      * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
4935      * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
4936      * be used.
4937      * <p>
4938      * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
4939      * boundaries, never in the middle of a surrogate pair.
4940      */
4941     public enum SpanCondition {
4942         /**
4943          * Continues a span() while there is no set element at the current position.
4944          * Increments by one code point at a time.
4945          * Stops before the first set element (character or string).
4946          * (For code points only, this is like while contains(current)==false).
4947          * <p>
4948          * When span() returns, the substring between where it started and the position it returned consists only of
4949          * characters that are not in the set, and none of its strings overlap with the span.
4950          */
4951         NOT_CONTAINED,
4952 
4953         /**
4954          * Spans the longest substring that is a concatenation of set elements (characters or strings).
4955          * (For characters only, this is like while contains(current)==true).
4956          * <p>
4957          * When span() returns, the substring between where it started and the position it returned consists only of set
4958          * elements (characters or strings) that are in the set.
4959          * <p>
4960          * If a set contains strings, then the span will be the longest substring for which there
4961          * exists at least one non-overlapping concatenation of set elements (characters or strings).
4962          * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
4963          * (Java/ICU/Perl regex stops at the first match of an OR.)
4964          */
4965         CONTAINED,
4966 
4967         /**
4968          * Continues a span() while there is a set element at the current position.
4969          * Increments by the longest matching element at each position.
4970          * (For characters only, this is like while contains(current)==true).
4971          * <p>
4972          * When span() returns, the substring between where it started and the position it returned consists only of set
4973          * elements (characters or strings) that are in the set.
4974          * <p>
4975          * If a set only contains single characters, then this is the same as CONTAINED.
4976          * <p>
4977          * If a set contains strings, then the span will be the longest substring with a match at each position with the
4978          * longest single set element (character or string).
4979          * <p>
4980          * Use this span condition together with other longest-match algorithms, such as ICU converters
4981          * (ucnv_getUnicodeSet()).
4982          */
4983         SIMPLE,
4984 
4985         /**
4986          * One more than the last span condition.
4987          */
4988         CONDITION_COUNT
4989     }
4990 
4991     /**
4992      * Get the default symbol table. Null means ordinary processing. For internal use only.
4993      * @return the symbol table
4994      * @deprecated This API is ICU internal only.
4995      * @hide original deprecated declaration
4996      * @hide draft / provisional / internal are hidden on Android
4997      */
4998     @Deprecated
getDefaultXSymbolTable()4999     public static XSymbolTable getDefaultXSymbolTable() {
5000         return XSYMBOL_TABLE;
5001     }
5002 
5003     /**
5004      * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing
5005      * of UnicodeSets.
5006      * <p>
5007      * WARNING: If this function is used with a UnicodeProperty, and the
5008      * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
5009      * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
5010      * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
5011      *
5012      * @param xSymbolTable the new default symbol table.
5013      * @deprecated This API is ICU internal only.
5014      * @hide original deprecated declaration
5015      * @hide draft / provisional / internal are hidden on Android
5016      */
5017     @Deprecated
setDefaultXSymbolTable(XSymbolTable xSymbolTable)5018     public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
5019         // If the properties override inclusions, these have to be regenerated.
5020         // TODO: Check if the Unicode Tools or Unicode Utilities really need this.
5021         CharacterPropertiesImpl.clear();
5022         XSYMBOL_TABLE = xSymbolTable;
5023     }
5024 
5025     /**
5026      * Returns a {@link Stream} of {@link EntryRange} values from this {@code UnicodeSet}.
5027      *
5028      * <p><b>Warnings:</b>
5029      * <ul>
5030      *   <li>The {@link EntryRange} instance is the same each time; the contents are just reset.
5031      *   <li>To iterate over the full contents, you have to also iterate over the strings.
5032      *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
5033      *       Do not alter the {@code UnicodeSet} while iterating.
5034      * </ul>
5035      *
5036      * @return a {@link Stream} of {@link EntryRange}
5037      *
5038      * @hide draft / provisional / internal are hidden on Android
5039      */
rangeStream()5040     public Stream<EntryRange> rangeStream() {
5041         // Must use false to never make this parallel because the iterator always returns the same EntryRange object.
5042         return StreamSupport.stream(ranges().spliterator(), false);
5043     }
5044 
5045     /**
5046      * Returns a {@link Stream} of {@code String} values from this {@code UnicodeSet}.
5047      *
5048      * <p><b>Warnings:</b>
5049      * <ul>
5050      *   <li>To iterate over the full contents, you have to also iterate over the ranges or code points.
5051      *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
5052      *       Do not alter the {@code UnicodeSet} while iterating.
5053      * </ul>
5054      *
5055      * @return a {@link Stream} of {@code String}
5056      *
5057      * @hide draft / provisional / internal are hidden on Android
5058      */
stringStream()5059     public Stream<String> stringStream() {
5060         return strings().stream();
5061     }
5062 
5063     /**
5064      * Returns an {@link IntStream} of Unicode code point values from this {@code UnicodeSet}.
5065      *
5066      * <p><b>Warnings:</b>
5067      * <ul>
5068      *   <li>To iterate over the full contents, you have to also iterate over the strings.
5069      *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
5070      *       Do not alter the {@code UnicodeSet} while iterating.
5071      * </ul>
5072      *
5073      * @return an {@link IntStream} of Unicode code point values
5074      *
5075      * @hide draft / provisional / internal are hidden on Android
5076      */
codePointStream()5077     public IntStream codePointStream() {
5078         return StreamSupport.intStream(new CodePointSpliterator(this), false);
5079     }
5080 
5081     /**
5082      * Returns a stream of {@code String} values from this {@code UnicodeSet}.
5083      *
5084      * <p><b>Warnings:</b>
5085      * <ul>
5086      *   <li>To iterate over the full contents, you have to also iterate over the strings.
5087      *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
5088      *       Do not alter the {@code UnicodeSet} while iterating.
5089      * </ul>
5090      *
5091      * @return a {@link Stream} of {@code String}
5092      *
5093      * @hide draft / provisional / internal are hidden on Android
5094      */
stream()5095     public Stream<String> stream() {
5096         return StreamSupport.stream(spliterator(), false);
5097     }
5098 
5099     /**
5100      * Returns an {@link Iterable} for iteration over all the code points in this set.
5101      *
5102      * <p><b>Warnings:</b>
5103      * <ul>
5104      *   <li>This is a convenience method, but comes with a performance penalty
5105      *       because it boxes {@code int} into {@code Integer}.<br>
5106      *       For an efficient but old alternative use {@link UnicodeSetIterator#next()}.
5107      *   <li>To iterate over the full contents, you have to also iterate over the strings.
5108      *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
5109      *       Do not alter the {@code UnicodeSet} while iterating.
5110      * </ul>
5111      *
5112      * @return an {@link Iterable} over all the code points
5113      *
5114      * @hide draft / provisional / internal are hidden on Android
5115      */
codePoints()5116     public Iterable<Integer> codePoints() {
5117         return new CodePointIterable(this);
5118     }
5119 
5120     private class CodePointIterable implements Iterable<Integer> {
5121         private final UnicodeSet unicodeSet;
5122 
CodePointIterable(UnicodeSet unicodeSet)5123         CodePointIterable(UnicodeSet unicodeSet) {
5124             this.unicodeSet = unicodeSet;
5125         }
5126 
5127         @Override
iterator()5128         public Iterator<Integer> iterator() {
5129             return new CodePointIterator(unicodeSet);
5130         }
5131     }
5132 
5133     private class CodePointIterator implements Iterator<Integer> {
5134         private final CodePointIteratorInt cpi;
5135 
CodePointIterator(UnicodeSet unicodeSet)5136         CodePointIterator(UnicodeSet unicodeSet) {
5137             cpi = new CodePointIteratorInt(unicodeSet);
5138         }
5139 
5140         @Override
hasNext()5141         public boolean hasNext() {
5142             return cpi.hasNext();
5143         }
5144 
5145         @Override
next()5146         public Integer next() {
5147             return cpi.next();
5148         }
5149 
5150         @Override
remove()5151         public void remove() {
5152             throw new UnsupportedOperationException();
5153         }
5154     }
5155 
5156     private static class CodePointSpliterator implements Spliterator.OfInt {
5157         private final static int CHARACTERISTICS = Spliterator.SIZED | Spliterator.ORDERED | Spliterator.DISTINCT | Spliterator.NONNULL;
5158 
5159         private final UnicodeSet unicodeSet;
5160         private final CodePointIteratorInt cpi;
5161 
CodePointSpliterator(UnicodeSet unicodeSet)5162         CodePointSpliterator(UnicodeSet unicodeSet) {
5163             this.unicodeSet = unicodeSet;
5164             cpi = new CodePointIteratorInt(unicodeSet);
5165         }
5166 
5167         @Override
estimateSize()5168         public long estimateSize() {
5169             return unicodeSet.size() - unicodeSet.strings.size();
5170         }
5171 
5172         @Override
characteristics()5173         public int characteristics() {
5174             return unicodeSet.isFrozen() ? Spliterator.IMMUTABLE | CHARACTERISTICS : CHARACTERISTICS;
5175         }
5176 
5177         @Override
trySplit()5178         public Spliterator.OfInt trySplit() {
5179             /* From the doc:
5180              *   > This method may return null for any reason, including emptiness, inability to split after
5181              *   > traversal has commenced, data structure constraints, and efficiency considerations.
5182              */
5183             return null;
5184         }
5185 
5186         @Override
tryAdvance(IntConsumer action)5187         public boolean tryAdvance(IntConsumer action) {
5188             if (action == null) {
5189                 throw new NullPointerException();
5190             }
5191 
5192             if (cpi.hasNext()) {
5193                 action.accept(cpi.next());
5194                 return true;
5195             }
5196             return false;
5197         }
5198     }
5199 
5200     /**
5201      * This class is optimized to iterate on code points and will be used to implement both
5202      * the Iterator<Integer> (Integer, boxed value) and the Spliterator.OfInt (int primitive).
5203      * It looks exactly like an Iterator<Integer>, but works on the primitive int,
5204      * so it can't implement Iterator.
5205      */
5206     static private class CodePointIteratorInt {
5207         private final int[] list;
5208         private final int lastRange;
5209         private int currentRange = 0;
5210         private int rangeStart;
5211         private int rangeLimit;
5212 
CodePointIteratorInt(UnicodeSet unicodeSet)5213         public CodePointIteratorInt(UnicodeSet unicodeSet) {
5214             this.list = unicodeSet.list;
5215             lastRange = unicodeSet.len - 1;
5216             currentRange = 0;
5217             rangeStart = list[currentRange++];
5218             if (lastRange > 0) { // not an empty list
5219                 rangeLimit = list[currentRange++];
5220             } else {
5221                 rangeLimit = rangeStart; // should be HIGH, the guard value
5222             }
5223         }
5224 
hasNext()5225         public boolean hasNext() {
5226             return rangeStart < rangeLimit || currentRange < lastRange;
5227         }
5228 
next()5229         public int next() {
5230             if (rangeStart >= rangeLimit) {
5231                 if (currentRange >= lastRange) {
5232                   throw new NoSuchElementException();
5233                 }
5234                 rangeStart = list[currentRange++];
5235                 rangeLimit = list[currentRange++];
5236             }
5237             return rangeStart++;
5238         }
5239     }
5240 }
5241 //eof
5242