1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package android.icu.text; 11 12 import java.io.IOException; 13 import java.text.ParsePosition; 14 import java.util.ArrayList; 15 import java.util.Arrays; 16 import java.util.Collection; 17 import java.util.Collections; 18 import java.util.Iterator; 19 import java.util.NoSuchElementException; 20 import java.util.SortedSet; 21 import java.util.Spliterator; 22 import java.util.TreeSet; 23 import java.util.function.IntConsumer; 24 import java.util.stream.IntStream; 25 import java.util.stream.Stream; 26 import java.util.stream.StreamSupport; 27 28 import android.icu.impl.BMPSet; 29 import android.icu.impl.CharacterPropertiesImpl; 30 import android.icu.impl.PatternProps; 31 import android.icu.impl.RuleCharacterIterator; 32 import android.icu.impl.SortedSetRelation; 33 import android.icu.impl.StringRange; 34 import android.icu.impl.UCaseProps; 35 import android.icu.impl.UCharacterProperty; 36 import android.icu.impl.UPropertyAliases; 37 import android.icu.impl.UnicodeSetStringSpan; 38 import android.icu.impl.Utility; 39 import android.icu.lang.CharSequences; 40 import android.icu.lang.CharacterProperties; 41 import android.icu.lang.UCharacter; 42 import android.icu.lang.UProperty; 43 import android.icu.lang.UScript; 44 import android.icu.util.Freezable; 45 import android.icu.util.ICUUncheckedIOException; 46 import android.icu.util.OutputInt; 47 import android.icu.util.ULocale; 48 import android.icu.util.VersionInfo; 49 50 /** 51 * A mutable set of Unicode characters and multicharacter strings. 52 * Objects of this class represent <em>character classes</em> used 53 * in regular expressions. A character specifies a subset of Unicode 54 * code points. Legal code points are U+0000 to U+10FFFF, inclusive. 55 * 56 * Note: method freeze() will not only make the set immutable, but 57 * also makes important methods much higher performance: 58 * contains(c), containsNone(...), span(...), spanBack(...) etc. 59 * After the object is frozen, any subsequent call that wants to change 60 * the object will throw UnsupportedOperationException. 61 * 62 * <p>The UnicodeSet class is not designed to be subclassed. 63 * 64 * <p><code>UnicodeSet</code> supports two APIs. The first is the 65 * <em>operand</em> API that allows the caller to modify the value of 66 * a <code>UnicodeSet</code> object. It conforms to Java 2's 67 * <code>java.util.Set</code> interface, although 68 * <code>UnicodeSet</code> does not actually implement that 69 * interface. All methods of <code>Set</code> are supported, with the 70 * modification that they take a character range or single character 71 * instead of an <code>Object</code>, and they take a 72 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 73 * operand API may be thought of in terms of boolean logic: a boolean 74 * OR is implemented by <code>add</code>, a boolean AND is implemented 75 * by <code>retain</code>, a boolean XOR is implemented by 76 * <code>complement</code> taking an argument, and a boolean NOT is 77 * implemented by <code>complement</code> with no argument. In terms 78 * of traditional set theory function names, <code>add</code> is a 79 * union, <code>retain</code> is an intersection, <code>remove</code> 80 * is an asymmetric difference, and <code>complement</code> with no 81 * argument is a set complement with respect to the superset range 82 * <code>MIN_VALUE-MAX_VALUE</code> 83 * 84 * <p>The second API is the 85 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 86 * <code>java.text.Format</code>-derived classes. Unlike the 87 * methods that add characters, add categories, and control the logic 88 * of the set, the method <code>applyPattern()</code> sets all 89 * attributes of a <code>UnicodeSet</code> at once, based on a 90 * string pattern. 91 * 92 * <p><b>Pattern syntax</b></p> 93 * 94 * Patterns are accepted by the constructors and the 95 * <code>applyPattern()</code> methods and returned by the 96 * <code>toPattern()</code> method. These patterns follow a syntax 97 * similar to that employed by version 8 regular expression character 98 * classes. Here are some simple examples: 99 * 100 * <blockquote> 101 * <table> 102 * <tr style="vertical-align: top"> 103 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td> 104 * <td style="vertical-align: top;">No characters</td> 105 * </tr><tr style="vertical-align: top"> 106 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td> 107 * <td style="vertical-align: top;">The character 'a'</td> 108 * </tr><tr style="vertical-align: top"> 109 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td> 110 * <td style="vertical-align: top;">The characters 'a' and 'e'</td> 111 * </tr> 112 * <tr> 113 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td> 114 * <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code 115 * point order</td> 116 * </tr> 117 * <tr> 118 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td> 119 * <td style="vertical-align: top;">The character U+4E01</td> 120 * </tr> 121 * <tr> 122 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td> 123 * <td style="vertical-align: top;">The character 'a' and the multicharacter strings "ab" and 124 * "ac"</td> 125 * </tr> 126 * <tr> 127 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td> 128 * <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td> 129 * </tr> 130 * </table> 131 * </blockquote> 132 * 133 * Any character may be preceded by a backslash in order to remove any special 134 * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are 135 * ignored, unless they are escaped. 136 * 137 * <p>Property patterns specify a set of characters having a certain 138 * property as defined by the Unicode standard. Both the POSIX-like 139 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a 140 * complete list of supported property patterns, see the User's Guide 141 * for UnicodeSet at 142 * <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset"> 143 * https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>. 144 * Actual determination of property data is defined by the underlying 145 * Unicode database as implemented by UCharacter. 146 * 147 * <p>Patterns specify individual characters, ranges of characters, and 148 * Unicode property sets. When elements are concatenated, they 149 * specify their union. To complement a set, place a '^' immediately 150 * after the opening '['. Property patterns are inverted by modifying 151 * their delimiters; "[:^foo]" and "\P{foo}". In any other location, 152 * '^' has no special meaning. 153 * 154 * <p>Since ICU 70, "[^...]", "[:^foo]", "\P{foo}", and "[:binaryProperty=No:]" 155 * perform a “code point complement” (all code points minus the original set), 156 * removing all multicharacter strings, 157 * equivalent to .{@link #complement()}.{@link #removeAllStrings()} . 158 * The {@link #complement()} API function continues to perform a 159 * symmetric difference with all code points and thus retains all multicharacter strings. 160 * 161 * <p>Ranges are indicated by placing two a '-' between two 162 * characters, as in "a-z". This specifies the range of all 163 * characters from the left to the right, in Unicode order. If the 164 * left character is greater than or equal to the 165 * right character it is a syntax error. If a '-' occurs as the first 166 * character after the opening '[' or '[^', or if it occurs as the 167 * last character before the closing ']', then it is taken as a 168 * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same 169 * set of three characters, 'a', 'b', and '-'. 170 * 171 * <p>Sets may be intersected using the '&' operator or the asymmetric 172 * set difference may be taken using the '-' operator, for example, 173 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters 174 * with values less than 4096. Operators ('&' and '|') have equal 175 * precedence and bind left-to-right. Thus 176 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to 177 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for 178 * difference; intersection is commutative. 179 * 180 * <table> 181 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a' 182 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a' 183 * through 'z' and all letters in between, in Unicode order 184 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing 185 * all characters but 'a' through 'z', 186 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 187 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 188 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 189 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 190 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 191 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 192 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 193 * <em>pat2</em> 194 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code> 195 * <td>The set of characters having the specified 196 * Unicode property; in 197 * this case, Unicode uppercase letters 198 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code> 199 * <td>The set of characters <em>not</em> having the given 200 * Unicode property 201 * </table> 202 * 203 * <p><b>Formal syntax</b></p> 204 * 205 * <blockquote> 206 * <table> 207 * <tr style="vertical-align: top"> 208 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern := </code></td> 209 * <td style="vertical-align: top;"><code>('[' '^'? item* ']') | 210 * property</code></td> 211 * </tr> 212 * <tr style="vertical-align: top"> 213 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item := </code></td> 214 * <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br> 215 * </code></td> 216 * </tr> 217 * <tr style="vertical-align: top"> 218 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr := </code></td> 219 * <td style="vertical-align: top;"><code>pattern | pattern-expr pattern | 220 * pattern-expr op pattern<br> 221 * </code></td> 222 * </tr> 223 * <tr style="vertical-align: top"> 224 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op := </code></td> 225 * <td style="vertical-align: top;"><code>'&' | '-'<br> 226 * </code></td> 227 * </tr> 228 * <tr style="vertical-align: top"> 229 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special := </code></td> 230 * <td style="vertical-align: top;"><code>'[' | ']' | '-'<br> 231 * </code></td> 232 * </tr> 233 * <tr style="vertical-align: top"> 234 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char := </code></td> 235 * <td style="vertical-align: top;"><em>any character that is not</em><code> special<br> 236 * | ('\\' </code><em>any character</em><code>)<br> 237 * | ('\u' hex hex hex hex)<br> 238 * </code></td> 239 * </tr> 240 * <tr style="vertical-align: top"> 241 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex := </code></td> 242 * <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br> 243 * 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td> 244 * </tr> 245 * <tr> 246 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property := </code></td> 247 * <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td> 248 * </tr> 249 * </table> 250 * <br> 251 * <table border="1"> 252 * <tr> 253 * <td>Legend: <table> 254 * <tr> 255 * <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td> 256 * <td style="width: 20; vertical-align: top;"> </td> 257 * <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td> 258 * </tr> 259 * <tr> 260 * <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td> 261 * <td style="vertical-align: top;"></td> 262 * <td style="vertical-align: top;">zero or one instance of <code>a</code><br> 263 * </td> 264 * </tr> 265 * <tr> 266 * <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td> 267 * <td style="vertical-align: top;"></td> 268 * <td style="vertical-align: top;">one or more instances of <code>a</code><br> 269 * </td> 270 * </tr> 271 * <tr> 272 * <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td> 273 * <td style="vertical-align: top;"></td> 274 * <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br> 275 * </td> 276 * </tr> 277 * <tr> 278 * <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td> 279 * <td style="vertical-align: top;"></td> 280 * <td style="vertical-align: top;">the literal string between the quotes </td> 281 * </tr> 282 * </table> 283 * </td> 284 * </tr> 285 * </table> 286 * </blockquote> 287 * 288 * <p>To iterate over contents of {@code UnicodeSet}, the following are available: 289 * <ul> 290 * <li>to iterate over the ranges: {@link #ranges()}, {@link #rangeStream()}</li> 291 * <li>to iterate over the strings: {@link #strings()}, {@link #stringStream()}</li> 292 * <li>to iterate over the code points: {@link #codePoints()}, {@link #codePointStream()}</li> 293 * <li>to iterate over the entire contents in a single loop: this class itself is {@link Iterable}, 294 * or use {@link #stream()}.<br> 295 * All of these method are, however, not particularly efficient, 296 * since they convert each individual code point to a {@code String}. 297 * </ul> 298 * 299 * <p>The iterators and streams methods work as expected in idiomatic Java usage.<br> 300 * The {@link UnicodeSetIterator} cannot be used in <b>for</b> loops, and it is not very Java-idiomatic, because it is old. 301 * But it might be faster in certain use cases. We recommend that you measure in performance sensitive code.<br> 302 * 303 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 304 * 305 * @author Alan Liu 306 * @see UnicodeSetIterator 307 * @see UnicodeSetSpanner 308 */ 309 public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> { 310 private static final SortedSet<String> EMPTY_STRINGS = 311 Collections.unmodifiableSortedSet(new TreeSet<String>()); 312 313 /** 314 * Constant for the empty set. 315 */ 316 public static final UnicodeSet EMPTY = new UnicodeSet().freeze(); 317 /** 318 * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.) 319 */ 320 public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze(); 321 322 private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the the function processing 323 324 private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints 325 private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. 326 // 110000 for codepoints 327 328 /** 329 * Enough for sets with few ranges. 330 * For example, White_Space has 10 ranges, list length 21. 331 */ 332 private static final int INITIAL_CAPACITY = 25; 333 334 /** Max list [0, 1, 2, ..., max code point, HIGH] */ 335 private static final int MAX_LENGTH = HIGH + 1; 336 337 /** 338 * Minimum value that can be stored in a UnicodeSet. 339 */ 340 public static final int MIN_VALUE = LOW; 341 342 /** 343 * Maximum value that can be stored in a UnicodeSet. 344 */ 345 public static final int MAX_VALUE = HIGH - 1; 346 347 private int len; // length used; list may be longer to minimize reallocs 348 private int[] list; // MUST be terminated with HIGH 349 private int[] rangeList; // internal buffer 350 private int[] buffer; // internal buffer 351 352 // is not private so that UnicodeSetIterator can get access 353 SortedSet<String> strings = EMPTY_STRINGS; 354 355 /** 356 * The pattern representation of this set. This may not be the 357 * most economical pattern. It is the pattern supplied to 358 * applyPattern(), with variables substituted and whitespace 359 * removed. For sets constructed without applyPattern(), or 360 * modified using the non-pattern API, this string will be null, 361 * indicating that toPattern() must generate a pattern 362 * representation from the inversion list. 363 */ 364 private String pat = null; 365 366 // Special property set IDs 367 private static final String ANY_ID = "ANY"; // [\u0000-\U0010FFFF] 368 private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F] 369 private static final String ASSIGNED = "Assigned"; // [:^Cn:] 370 371 private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. 372 private volatile UnicodeSetStringSpan stringSpan; 373 //---------------------------------------------------------------- 374 // Public API 375 //---------------------------------------------------------------- 376 377 /** 378 * Constructs an empty set. 379 */ UnicodeSet()380 public UnicodeSet() { 381 list = new int[INITIAL_CAPACITY]; 382 list[0] = HIGH; 383 len = 1; 384 } 385 386 /** 387 * Constructs a copy of an existing set. 388 */ UnicodeSet(UnicodeSet other)389 public UnicodeSet(UnicodeSet other) { 390 set(other); 391 } 392 393 /** 394 * Constructs a set containing the given range. If <code>end > 395 * start</code> then an empty set is created. 396 * 397 * @param start first character, inclusive, of range 398 * @param end last character, inclusive, of range 399 */ UnicodeSet(int start, int end)400 public UnicodeSet(int start, int end) { 401 this(); 402 add(start, end); 403 } 404 405 /** 406 * Quickly constructs a set from a set of ranges <s0, e0, s1, e1, s2, e2, ..., sn, en>. 407 * There must be an even number of integers, and they must be all greater than zero, 408 * all less than or equal to Character.MAX_CODE_POINT. 409 * In each pair (..., si, ei, ...) it must be true that si <= ei 410 * Between adjacent pairs (...ei, sj...), it must be true that ei+1 < sj 411 * @param pairs pairs of character representing ranges 412 */ UnicodeSet(int... pairs)413 public UnicodeSet(int... pairs) { 414 if ((pairs.length & 1) != 0) { 415 throw new IllegalArgumentException("Must have even number of integers"); 416 } 417 list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set. 418 len = list.length; 419 int last = -1; // used to ensure that the results are monotonically increasing. 420 int i = 0; 421 while (i < pairs.length) { 422 int start = pairs[i]; 423 if (last >= start) { 424 throw new IllegalArgumentException("Must be monotonically increasing."); 425 } 426 list[i++] = start; 427 int limit = pairs[i] + 1; 428 if (start >= limit) { 429 throw new IllegalArgumentException("Must be monotonically increasing."); 430 } 431 list[i++] = last = limit; 432 } 433 list[i] = HIGH; // terminate 434 } 435 436 /** 437 * Constructs a set from the given pattern. See the class description 438 * for the syntax of the pattern language. Whitespace is ignored. 439 * @param pattern a string specifying what characters are in the set 440 * @exception java.lang.IllegalArgumentException if the pattern contains 441 * a syntax error. 442 */ UnicodeSet(String pattern)443 public UnicodeSet(String pattern) { 444 this(); 445 applyPattern(pattern, null, null, IGNORE_SPACE); 446 } 447 448 /** 449 * Constructs a set from the given pattern. See the class description 450 * for the syntax of the pattern language. 451 * @param pattern a string specifying what characters are in the set 452 * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters 453 * @exception java.lang.IllegalArgumentException if the pattern contains 454 * a syntax error. 455 */ UnicodeSet(String pattern, boolean ignoreWhitespace)456 public UnicodeSet(String pattern, boolean ignoreWhitespace) { 457 this(); 458 applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 459 } 460 461 /** 462 * Constructs a set from the given pattern. See the class description 463 * for the syntax of the pattern language. 464 * @param pattern a string specifying what characters are in the set 465 * @param options a bitmask indicating which options to apply. 466 * Valid options are {@link #IGNORE_SPACE} and 467 * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 468 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 469 * @exception java.lang.IllegalArgumentException if the pattern contains 470 * a syntax error. 471 */ UnicodeSet(String pattern, int options)472 public UnicodeSet(String pattern, int options) { 473 this(); 474 applyPattern(pattern, null, null, options); 475 } 476 477 /** 478 * Constructs a set from the given pattern. See the class description 479 * for the syntax of the pattern language. 480 * @param pattern a string specifying what characters are in the set 481 * @param pos on input, the position in pattern at which to start parsing. 482 * On output, the position after the last character parsed. 483 * @param symbols a symbol table mapping variables to char[] arrays 484 * and chars to UnicodeSets 485 * @exception java.lang.IllegalArgumentException if the pattern 486 * contains a syntax error. 487 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols)488 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) { 489 this(); 490 applyPattern(pattern, pos, symbols, IGNORE_SPACE); 491 } 492 493 /** 494 * Constructs a set from the given pattern. See the class description 495 * for the syntax of the pattern language. 496 * @param pattern a string specifying what characters are in the set 497 * @param pos on input, the position in pattern at which to start parsing. 498 * On output, the position after the last character parsed. 499 * @param symbols a symbol table mapping variables to char[] arrays 500 * and chars to UnicodeSets 501 * @param options a bitmask indicating which options to apply. 502 * Valid options are {@link #IGNORE_SPACE} and 503 * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 504 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 505 * @exception java.lang.IllegalArgumentException if the pattern 506 * contains a syntax error. 507 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options)508 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) { 509 this(); 510 applyPattern(pattern, pos, symbols, options); 511 } 512 513 514 /** 515 * Return a new set that is equivalent to this one. 516 */ 517 @Override clone()518 public Object clone() { 519 if (isFrozen()) { 520 return this; 521 } 522 return new UnicodeSet(this); 523 } 524 525 /** 526 * Make this object represent the range <code>start - end</code>. 527 * If <code>start > end</code> then this object is set to an empty range. 528 * 529 * @param start first character in the set, inclusive 530 * @param end last character in the set, inclusive 531 */ set(int start, int end)532 public UnicodeSet set(int start, int end) { 533 checkFrozen(); 534 clear(); 535 complement(start, end); 536 return this; 537 } 538 539 /** 540 * Make this object represent the same set as <code>other</code>. 541 * @param other a <code>UnicodeSet</code> whose value will be 542 * copied to this object 543 */ set(UnicodeSet other)544 public UnicodeSet set(UnicodeSet other) { 545 checkFrozen(); 546 list = Arrays.copyOf(other.list, other.len); 547 len = other.len; 548 pat = other.pat; 549 if (other.hasStrings()) { 550 strings = new TreeSet<>(other.strings); 551 } else { 552 strings = EMPTY_STRINGS; 553 } 554 return this; 555 } 556 557 /** 558 * Modifies this set to represent the set specified by the given pattern. 559 * See the class description for the syntax of the pattern language. 560 * Whitespace is ignored. 561 * @param pattern a string specifying what characters are in the set 562 * @exception java.lang.IllegalArgumentException if the pattern 563 * contains a syntax error. 564 */ applyPattern(String pattern)565 public final UnicodeSet applyPattern(String pattern) { 566 checkFrozen(); 567 return applyPattern(pattern, null, null, IGNORE_SPACE); 568 } 569 570 /** 571 * Modifies this set to represent the set specified by the given pattern, 572 * optionally ignoring whitespace. 573 * See the class description for the syntax of the pattern language. 574 * @param pattern a string specifying what characters are in the set 575 * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored 576 * @exception java.lang.IllegalArgumentException if the pattern 577 * contains a syntax error. 578 */ applyPattern(String pattern, boolean ignoreWhitespace)579 public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) { 580 checkFrozen(); 581 return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 582 } 583 584 /** 585 * Modifies this set to represent the set specified by the given pattern, 586 * optionally ignoring whitespace. 587 * See the class description for the syntax of the pattern language. 588 * @param pattern a string specifying what characters are in the set 589 * @param options a bitmask indicating which options to apply. 590 * Valid options are {@link #IGNORE_SPACE} and 591 * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 592 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 593 * @exception java.lang.IllegalArgumentException if the pattern 594 * contains a syntax error. 595 */ applyPattern(String pattern, int options)596 public UnicodeSet applyPattern(String pattern, int options) { 597 checkFrozen(); 598 return applyPattern(pattern, null, null, options); 599 } 600 601 /** 602 * Return true if the given position, in the given pattern, appears 603 * to be the start of a UnicodeSet pattern. 604 * @hide unsupported on Android 605 */ resemblesPattern(String pattern, int pos)606 public static boolean resemblesPattern(String pattern, int pos) { 607 return ((pos+1) < pattern.length() && 608 pattern.charAt(pos) == '[') || 609 resemblesPropertyPattern(pattern, pos); 610 } 611 612 /** 613 * TODO: create Appendable version of UTF16.append(buf, c), 614 * maybe in new class Appendables? 615 * @throws IOException 616 */ appendCodePoint(Appendable app, int c)617 private static void appendCodePoint(Appendable app, int c) { 618 assert 0 <= c && c <= 0x10ffff; 619 try { 620 if (c <= 0xffff) { 621 app.append((char) c); 622 } else { 623 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c)); 624 } 625 } catch (IOException e) { 626 throw new ICUUncheckedIOException(e); 627 } 628 } 629 630 /** 631 * TODO: create class Appendables? 632 * @throws IOException 633 */ append(Appendable app, CharSequence s)634 private static void append(Appendable app, CharSequence s) { 635 try { 636 app.append(s); 637 } catch (IOException e) { 638 throw new ICUUncheckedIOException(e); 639 } 640 } 641 642 /** 643 * Append the <code>toPattern()</code> representation of a 644 * string to the given <code>Appendable</code>. 645 */ _appendToPat(T buf, String s, boolean escapeUnprintable)646 private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) { 647 int cp; 648 for (int i = 0; i < s.length(); i += Character.charCount(cp)) { 649 cp = s.codePointAt(i); 650 _appendToPat(buf, cp, escapeUnprintable); 651 } 652 return buf; 653 } 654 655 /** 656 * Append the <code>toPattern()</code> representation of a 657 * character to the given <code>Appendable</code>. 658 */ _appendToPat(T buf, int c, boolean escapeUnprintable)659 private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) { 660 try { 661 if (escapeUnprintable ? Utility.isUnprintable(c) : Utility.shouldAlwaysBeEscaped(c)) { 662 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything 663 // unprintable 664 return Utility.escape(buf, c); 665 } 666 // Okay to let ':' pass through 667 switch (c) { 668 case '[': // SET_OPEN: 669 case ']': // SET_CLOSE: 670 case '-': // HYPHEN: 671 case '^': // COMPLEMENT: 672 case '&': // INTERSECTION: 673 case '\\': //BACKSLASH: 674 case '{': 675 case '}': 676 case '$': 677 case ':': 678 buf.append('\\'); 679 break; 680 default: 681 // Escape whitespace 682 if (PatternProps.isWhiteSpace(c)) { 683 buf.append('\\'); 684 } 685 break; 686 } 687 appendCodePoint(buf, c); 688 return buf; 689 } catch (IOException e) { 690 throw new ICUUncheckedIOException(e); 691 } 692 } 693 _appendToPat( T result, int start, int end, boolean escapeUnprintable)694 private static <T extends Appendable> T _appendToPat( 695 T result, int start, int end, boolean escapeUnprintable) { 696 _appendToPat(result, start, escapeUnprintable); 697 if (start != end) { 698 if ((start+1) != end || 699 // Avoid writing what looks like a lead+trail surrogate pair. 700 start == 0xdbff) { 701 try { 702 result.append('-'); 703 } catch (IOException e) { 704 throw new ICUUncheckedIOException(e); 705 } 706 } 707 _appendToPat(result, end, escapeUnprintable); 708 } 709 return result; 710 } 711 712 /** 713 * Returns a string representation of this set. If the result of 714 * calling this function is passed to a UnicodeSet constructor, it 715 * will produce another set that is equal to this one. 716 */ 717 @Override toPattern(boolean escapeUnprintable)718 public String toPattern(boolean escapeUnprintable) { 719 if (pat != null && !escapeUnprintable) { 720 return pat; 721 } 722 StringBuilder result = new StringBuilder(); 723 return _toPattern(result, escapeUnprintable).toString(); 724 } 725 726 /** 727 * Append a string representation of this set to result. This will be 728 * a cleaned version of the string passed to applyPattern(), if there 729 * is one. Otherwise it will be generated. 730 */ _toPattern(T result, boolean escapeUnprintable)731 private <T extends Appendable> T _toPattern(T result, 732 boolean escapeUnprintable) { 733 if (pat == null) { 734 return appendNewPattern(result, escapeUnprintable, true); 735 } 736 try { 737 if (!escapeUnprintable) { 738 // TODO: The C++ version does not have this shortcut, and instead 739 // always cleans up the pattern string, 740 // which also escapes Utility.shouldAlwaysBeEscaped(c). 741 // We should sync these implementations. 742 result.append(pat); 743 return result; 744 } 745 boolean oddNumberOfBackslashes = false; 746 for (int i=0; i<pat.length(); ) { 747 int c = pat.codePointAt(i); 748 i += Character.charCount(c); 749 if (Utility.isUnprintable(c)) { 750 // If the unprintable character is preceded by an odd 751 // number of backslashes, then it has been escaped 752 // and we omit the last backslash. 753 Utility.escape(result, c); 754 oddNumberOfBackslashes = false; 755 } else if (!oddNumberOfBackslashes && c == '\\') { 756 // Temporarily withhold an odd-numbered backslash. 757 oddNumberOfBackslashes = true; 758 } else { 759 if (oddNumberOfBackslashes) { 760 result.append('\\'); 761 } 762 appendCodePoint(result, c); 763 oddNumberOfBackslashes = false; 764 } 765 } 766 if (oddNumberOfBackslashes) { 767 result.append('\\'); 768 } 769 return result; 770 } catch (IOException e) { 771 throw new ICUUncheckedIOException(e); 772 } 773 } 774 775 /** 776 * Generate and append a string representation of this set to result. 777 * This does not use this.pat, the cleaned up copy of the string 778 * passed to applyPattern(). 779 * 780 * @param result the buffer into which to generate the pattern 781 * @param escapeUnprintable escape unprintable characters if true 782 */ _generatePattern(StringBuffer result, boolean escapeUnprintable)783 public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) { 784 return _generatePattern(result, escapeUnprintable, true); 785 } 786 787 /** 788 * Generate and append a string representation of this set to result. 789 * This does not use this.pat, the cleaned up copy of the string 790 * passed to applyPattern(). 791 * 792 * @param result the buffer into which to generate the pattern 793 * @param escapeUnprintable escape unprintable characters if true 794 * @param includeStrings if false, doesn't include the strings. 795 */ _generatePattern(StringBuffer result, boolean escapeUnprintable, boolean includeStrings)796 public StringBuffer _generatePattern(StringBuffer result, 797 boolean escapeUnprintable, boolean includeStrings) { 798 return appendNewPattern(result, escapeUnprintable, includeStrings); 799 } 800 801 // Implementation of public _generatePattern(). 802 // Allows other callers to use a StringBuilder while the existing API is stuck with StringBuffer. appendNewPattern( T result, boolean escapeUnprintable, boolean includeStrings)803 private <T extends Appendable> T appendNewPattern( 804 T result, boolean escapeUnprintable, boolean includeStrings) { 805 try { 806 result.append('['); 807 808 int i = 0; 809 int limit = len & ~1; // = 2 * getRangeCount() 810 811 // If the set contains at least 2 intervals and includes both 812 // MIN_VALUE and MAX_VALUE, then the inverse representation will 813 // be more economical. 814 // if (getRangeCount() >= 2 && 815 // getRangeStart(0) == MIN_VALUE && 816 // getRangeEnd(last) == MAX_VALUE) 817 // Invariant: list[len-1] == HIGH == MAX_VALUE + 1 818 // If limit == len then len is even and the last range ends with MAX_VALUE. 819 // 820 // *But* do not write the inverse (complement) if there are strings. 821 // Since ICU 70, the '^' performs a code point complement which removes all strings. 822 if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) { 823 // Emit the inverse 824 result.append('^'); 825 // Offsetting the inversion list index by one lets us 826 // iterate over the ranges of the set complement. 827 i = 1; 828 --limit; 829 } 830 831 // Emit the ranges as pairs. 832 while (i < limit) { 833 int start = list[i]; // getRangeStart() 834 int end = list[i + 1] - 1; // getRangeEnd() = range limit minus one 835 if (!(0xd800 <= end && end <= 0xdbff)) { 836 _appendToPat(result, start, end, escapeUnprintable); 837 i += 2; 838 } else { 839 // The range ends with a lead surrogate. 840 // Avoid writing what looks like a lead+trail surrogate pair. 841 // 1. Postpone ranges that start with a lead surrogate code point. 842 int firstLead = i; 843 while ((i += 2) < limit && list[i] <= 0xdbff) {} 844 int firstAfterLead = i; 845 // 2. Write following ranges that start with a trail surrogate code point. 846 while (i < limit && (start = list[i]) <= 0xdfff) { 847 _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable); 848 i += 2; 849 } 850 // 3. Now write the postponed ranges. 851 for (int j = firstLead; j < firstAfterLead; j += 2) { 852 _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable); 853 } 854 } 855 } 856 857 if (includeStrings && hasStrings()) { 858 for (String s : strings) { 859 result.append('{'); 860 _appendToPat(result, s, escapeUnprintable); 861 result.append('}'); 862 } 863 } 864 result.append(']'); 865 return result; 866 } catch (IOException e) { 867 throw new ICUUncheckedIOException(e); 868 } 869 } 870 871 /** 872 * Returns the number of elements in this set (its cardinality) 873 * Note than the elements of a set may include both individual 874 * codepoints and strings. 875 * 876 * @return the number of elements in this set (its cardinality). 877 */ size()878 public int size() { 879 int n = 0; 880 int count = getRangeCount(); 881 for (int i = 0; i < count; ++i) { 882 n += getRangeEnd(i) - getRangeStart(i) + 1; 883 } 884 return n + strings.size(); 885 } 886 887 /** 888 * Returns <tt>true</tt> if this set contains no elements. 889 * 890 * @return <tt>true</tt> if this set contains no elements. 891 */ isEmpty()892 public boolean isEmpty() { 893 return len == 1 && !hasStrings(); 894 } 895 896 /** 897 * @return true if this set contains multi-character strings or the empty string. 898 */ hasStrings()899 public boolean hasStrings() { 900 return !strings.isEmpty(); 901 } 902 903 /** 904 * Implementation of UnicodeMatcher API. Returns <tt>true</tt> if 905 * this set contains any character whose low byte is the given 906 * value. This is used by <tt>RuleBasedTransliterator</tt> for 907 * indexing. 908 */ 909 @Override matchesIndexValue(int v)910 public boolean matchesIndexValue(int v) { 911 /* The index value v, in the range [0,255], is contained in this set if 912 * it is contained in any pair of this set. Pairs either have the high 913 * bytes equal, or unequal. If the high bytes are equal, then we have 914 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 915 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 916 * Then v is contained if xx <= v || v <= yy. (This is identical to the 917 * time zone month containment logic.) 918 */ 919 for (int i=0; i<getRangeCount(); ++i) { 920 int low = getRangeStart(i); 921 int high = getRangeEnd(i); 922 if ((low & ~0xFF) == (high & ~0xFF)) { 923 if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 924 return true; 925 } 926 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 927 return true; 928 } 929 } 930 if (hasStrings()) { 931 for (String s : strings) { 932 if (s.isEmpty()) { 933 continue; // skip the empty string 934 } 935 int c = UTF16.charAt(s, 0); 936 if ((c & 0xFF) == v) { 937 return true; 938 } 939 } 940 } 941 return false; 942 } 943 944 /** 945 * Implementation of UnicodeMatcher.matches(). Always matches the 946 * longest possible multichar string. 947 */ 948 @Override matches(Replaceable text, int[] offset, int limit, boolean incremental)949 public int matches(Replaceable text, 950 int[] offset, 951 int limit, 952 boolean incremental) { 953 954 if (offset[0] == limit) { 955 if (contains(UnicodeMatcher.ETHER)) { 956 return incremental ? U_PARTIAL_MATCH : U_MATCH; 957 } else { 958 return U_MISMATCH; 959 } 960 } else { 961 if (hasStrings()) { // try strings first 962 963 // might separate forward and backward loops later 964 // for now they are combined 965 966 // TODO Improve efficiency of this, at least in the forward 967 // direction, if not in both. In the forward direction we 968 // can assume the strings are sorted. 969 970 boolean forward = offset[0] < limit; 971 972 // firstChar is the leftmost char to match in the 973 // forward direction or the rightmost char to match in 974 // the reverse direction. 975 char firstChar = text.charAt(offset[0]); 976 977 // If there are multiple strings that can match we 978 // return the longest match. 979 int highWaterLength = 0; 980 981 for (String trial : strings) { 982 if (trial.isEmpty()) { 983 continue; // skip the empty string 984 } 985 986 char c = trial.charAt(forward ? 0 : trial.length() - 1); 987 988 // Strings are sorted, so we can optimize in the 989 // forward direction. 990 if (forward && c > firstChar) break; 991 if (c != firstChar) continue; 992 993 int length = matchRest(text, offset[0], limit, trial); 994 995 if (incremental) { 996 int maxLen = forward ? limit-offset[0] : offset[0]-limit; 997 if (length == maxLen) { 998 // We have successfully matched but only up to limit. 999 return U_PARTIAL_MATCH; 1000 } 1001 } 1002 1003 if (length == trial.length()) { 1004 // We have successfully matched the whole string. 1005 if (length > highWaterLength) { 1006 highWaterLength = length; 1007 } 1008 // In the forward direction we know strings 1009 // are sorted so we can bail early. 1010 if (forward && length < highWaterLength) { 1011 break; 1012 } 1013 continue; 1014 } 1015 } 1016 1017 // We've checked all strings without a partial match. 1018 // If we have full matches, return the longest one. 1019 if (highWaterLength != 0) { 1020 offset[0] += forward ? highWaterLength : -highWaterLength; 1021 return U_MATCH; 1022 } 1023 } 1024 return super.matches(text, offset, limit, incremental); 1025 } 1026 } 1027 1028 /** 1029 * Returns the longest match for s in text at the given position. 1030 * If limit > start then match forward from start+1 to limit 1031 * matching all characters except s.charAt(0). If limit < start, 1032 * go backward starting from start-1 matching all characters 1033 * except s.charAt(s.length()-1). This method assumes that the 1034 * first character, text.charAt(start), matches s, so it does not 1035 * check it. 1036 * @param text the text to match 1037 * @param start the first character to match. In the forward 1038 * direction, text.charAt(start) is matched against s.charAt(0). 1039 * In the reverse direction, it is matched against 1040 * s.charAt(s.length()-1). 1041 * @param limit the limit offset for matching, either last+1 in 1042 * the forward direction, or last-1 in the reverse direction, 1043 * where last is the index of the last character to match. 1044 * @return If part of s matches up to the limit, return |limit - 1045 * start|. If all of s matches before reaching the limit, return 1046 * s.length(). If there is a mismatch between s and text, return 1047 * 0 1048 */ matchRest(Replaceable text, int start, int limit, String s)1049 private static int matchRest (Replaceable text, int start, int limit, String s) { 1050 int maxLen; 1051 int slen = s.length(); 1052 if (start < limit) { 1053 maxLen = limit - start; 1054 if (maxLen > slen) maxLen = slen; 1055 for (int i = 1; i < maxLen; ++i) { 1056 if (text.charAt(start + i) != s.charAt(i)) return 0; 1057 } 1058 } else { 1059 maxLen = start - limit; 1060 if (maxLen > slen) maxLen = slen; 1061 --slen; // <=> slen = s.length() - 1; 1062 for (int i = 1; i < maxLen; ++i) { 1063 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 1064 } 1065 } 1066 return maxLen; 1067 } 1068 1069 /** 1070 * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1. 1071 * @deprecated This API is ICU internal only. 1072 * @hide original deprecated declaration 1073 * @hide draft / provisional / internal are hidden on Android 1074 */ 1075 @Deprecated matchesAt(CharSequence text, int offset)1076 public int matchesAt(CharSequence text, int offset) { 1077 int lastLen = -1; 1078 strings: 1079 if (hasStrings()) { 1080 char firstChar = text.charAt(offset); 1081 String trial = null; 1082 // find the first string starting with firstChar 1083 Iterator<String> it = strings.iterator(); 1084 while (it.hasNext()) { 1085 trial = it.next(); 1086 char firstStringChar = trial.charAt(0); 1087 if (firstStringChar < firstChar) continue; 1088 if (firstStringChar > firstChar) break strings; 1089 } 1090 1091 // now keep checking string until we get the longest one 1092 for (;;) { 1093 int tempLen = matchesAt(text, offset, trial); 1094 if (lastLen > tempLen) break strings; 1095 lastLen = tempLen; 1096 if (!it.hasNext()) break; 1097 trial = it.next(); 1098 } 1099 } 1100 1101 if (lastLen < 2) { 1102 int cp = UTF16.charAt(text, offset); 1103 if (contains(cp)) lastLen = UTF16.getCharCount(cp); 1104 } 1105 1106 return offset+lastLen; 1107 } 1108 1109 /** 1110 * Does one string contain another, starting at a specific offset? 1111 * @param text text to match 1112 * @param offsetInText offset within that text 1113 * @param substring substring to match at offset in text 1114 * @return -1 if match fails, otherwise other.length() 1115 */ 1116 // Note: This method was moved from CollectionUtilities matchesAt(CharSequence text, int offsetInText, CharSequence substring)1117 private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) { 1118 int len = substring.length(); 1119 int textLength = text.length(); 1120 if (textLength + offsetInText > len) { 1121 return -1; 1122 } 1123 int i = 0; 1124 for (int j = offsetInText; i < len; ++i, ++j) { 1125 char pc = substring.charAt(i); 1126 char tc = text.charAt(j); 1127 if (pc != tc) return -1; 1128 } 1129 return i; 1130 } 1131 1132 /** 1133 * Implementation of UnicodeMatcher API. Union the set of all 1134 * characters that may be matched by this object into the given 1135 * set. 1136 * @param toUnionTo the set into which to union the source characters 1137 */ 1138 @Override addMatchSetTo(UnicodeSet toUnionTo)1139 public void addMatchSetTo(UnicodeSet toUnionTo) { 1140 toUnionTo.addAll(this); 1141 } 1142 1143 /** 1144 * Returns the index of the given character within this set, where 1145 * the set is ordered by ascending code point. If the character 1146 * is not in this set, return -1. The inverse of this method is 1147 * <code>charAt()</code>. 1148 * @return an index from 0..size()-1, or -1 1149 */ indexOf(int c)1150 public int indexOf(int c) { 1151 if (c < MIN_VALUE || c > MAX_VALUE) { 1152 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1153 } 1154 int i = 0; 1155 int n = 0; 1156 for (;;) { 1157 int start = list[i++]; 1158 if (c < start) { 1159 return -1; 1160 } 1161 int limit = list[i++]; 1162 if (c < limit) { 1163 return n + c - start; 1164 } 1165 n += limit - start; 1166 } 1167 } 1168 1169 /** 1170 * Returns the character at the given index within this set, where 1171 * the set is ordered by ascending code point. If the index is 1172 * out of range, return -1. The inverse of this method is 1173 * <code>indexOf()</code>. 1174 * @param index an index from 0..size()-1 1175 * @return the character at the given index, or -1. 1176 */ charAt(int index)1177 public int charAt(int index) { 1178 if (index >= 0) { 1179 // len2 is the largest even integer <= len, that is, it is len 1180 // for even values and len-1 for odd values. With odd values 1181 // the last entry is UNICODESET_HIGH. 1182 int len2 = len & ~1; 1183 for (int i=0; i < len2;) { 1184 int start = list[i++]; 1185 int count = list[i++] - start; 1186 if (index < count) { 1187 return start + index; 1188 } 1189 index -= count; 1190 } 1191 } 1192 return -1; 1193 } 1194 1195 /** 1196 * Adds the specified range to this set if it is not already 1197 * present. If this set already contains the specified range, 1198 * the call leaves this set unchanged. If <code>start > end</code> 1199 * then an empty range is added, leaving the set unchanged. 1200 * 1201 * @param start first character, inclusive, of range to be added 1202 * to this set. 1203 * @param end last character, inclusive, of range to be added 1204 * to this set. 1205 */ add(int start, int end)1206 public UnicodeSet add(int start, int end) { 1207 checkFrozen(); 1208 return add_unchecked(start, end); 1209 } 1210 1211 /** 1212 * Adds all characters in range (uses preferred naming convention). 1213 * @param start The index of where to start on adding all characters. 1214 * @param end The index of where to end on adding all characters. 1215 * @return a reference to this object 1216 */ addAll(int start, int end)1217 public UnicodeSet addAll(int start, int end) { 1218 checkFrozen(); 1219 return add_unchecked(start, end); 1220 } 1221 1222 // for internal use, after checkFrozen has been called add_unchecked(int start, int end)1223 private UnicodeSet add_unchecked(int start, int end) { 1224 if (start < MIN_VALUE || start > MAX_VALUE) { 1225 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1226 } 1227 if (end < MIN_VALUE || end > MAX_VALUE) { 1228 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1229 } 1230 if (start < end) { 1231 int limit = end + 1; 1232 // Fast path for adding a new range after the last one. 1233 // Odd list length: [..., lastStart, lastLimit, HIGH] 1234 if ((len & 1) != 0) { 1235 // If the list is empty, set lastLimit low enough to not be adjacent to 0. 1236 int lastLimit = len == 1 ? -2 : list[len - 2]; 1237 if (lastLimit <= start) { 1238 checkFrozen(); 1239 if (lastLimit == start) { 1240 // Extend the last range. 1241 list[len - 2] = limit; 1242 if (limit == HIGH) { 1243 --len; 1244 } 1245 } else { 1246 list[len - 1] = start; 1247 if (limit < HIGH) { 1248 ensureCapacity(len + 2); 1249 list[len++] = limit; 1250 list[len++] = HIGH; 1251 } else { // limit == HIGH 1252 ensureCapacity(len + 1); 1253 list[len++] = HIGH; 1254 } 1255 } 1256 pat = null; 1257 return this; 1258 } 1259 } 1260 // This is slow. Could be much faster using findCodePoint(start) 1261 // and modifying the list, dealing with adjacent & overlapping ranges. 1262 add(range(start, end), 2, 0); 1263 } else if (start == end) { 1264 add(start); 1265 } 1266 return this; 1267 } 1268 1269 // /** 1270 // * Format out the inversion list as a string, for debugging. Uncomment when 1271 // * needed. 1272 // */ 1273 // public final String dump() { 1274 // StringBuffer buf = new StringBuffer("["); 1275 // for (int i=0; i<len; ++i) { 1276 // if (i != 0) buf.append(", "); 1277 // int c = list[i]; 1278 // //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') { 1279 // // buf.append((char) c); 1280 // //} else { 1281 // buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6)); 1282 // //} 1283 // } 1284 // buf.append("]"); 1285 // return buf.toString(); 1286 // } 1287 1288 /** 1289 * Adds the specified character to this set if it is not already 1290 * present. If this set already contains the specified character, 1291 * the call leaves this set unchanged. 1292 */ add(int c)1293 public final UnicodeSet add(int c) { 1294 checkFrozen(); 1295 return add_unchecked(c); 1296 } 1297 1298 // for internal use only, after checkFrozen has been called add_unchecked(int c)1299 private final UnicodeSet add_unchecked(int c) { 1300 if (c < MIN_VALUE || c > MAX_VALUE) { 1301 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1302 } 1303 1304 // find smallest i such that c < list[i] 1305 // if odd, then it is IN the set 1306 // if even, then it is OUT of the set 1307 int i = findCodePoint(c); 1308 1309 // already in set? 1310 if ((i & 1) != 0) return this; 1311 1312 // HIGH is 0x110000 1313 // assert(list[len-1] == HIGH); 1314 1315 // empty = [HIGH] 1316 // [start_0, limit_0, start_1, limit_1, HIGH] 1317 1318 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 1319 // ^ 1320 // list[i] 1321 1322 // i == 0 means c is before the first range 1323 // TODO: Is the "list[i]-1" a typo? Even if you pass MAX_VALUE into 1324 // add_unchecked, the maximum value that "c" will be compared to 1325 // is "MAX_VALUE-1" meaning that "if (c == MAX_VALUE)" will 1326 // never be reached according to this logic. 1327 if (c == list[i]-1) { 1328 // c is before start of next range 1329 list[i] = c; 1330 // if we touched the HIGH mark, then add a new one 1331 if (c == MAX_VALUE) { 1332 ensureCapacity(len+1); 1333 list[len++] = HIGH; 1334 } 1335 if (i > 0 && c == list[i-1]) { 1336 // collapse adjacent ranges 1337 1338 // [..., start_k-1, c, c, limit_k, ..., HIGH] 1339 // ^ 1340 // list[i] 1341 System.arraycopy(list, i+1, list, i-1, len-i-1); 1342 len -= 2; 1343 } 1344 } 1345 1346 else if (i > 0 && c == list[i-1]) { 1347 // c is after end of prior range 1348 list[i-1]++; 1349 // no need to check for collapse here 1350 } 1351 1352 else { 1353 // At this point we know the new char is not adjacent to 1354 // any existing ranges, and it is not 10FFFF. 1355 1356 1357 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 1358 // ^ 1359 // list[i] 1360 1361 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] 1362 // ^ 1363 // list[i] 1364 1365 // Don't use ensureCapacity() to save on copying. 1366 // NOTE: This has no measurable impact on performance, 1367 // but it might help in some usage patterns. 1368 if (len+2 > list.length) { 1369 int[] temp = new int[nextCapacity(len + 2)]; 1370 if (i != 0) System.arraycopy(list, 0, temp, 0, i); 1371 System.arraycopy(list, i, temp, i+2, len-i); 1372 list = temp; 1373 } else { 1374 System.arraycopy(list, i, list, i+2, len-i); 1375 } 1376 1377 list[i] = c; 1378 list[i+1] = c+1; 1379 len += 2; 1380 } 1381 1382 pat = null; 1383 return this; 1384 } 1385 1386 /** 1387 * Adds the specified multicharacter to this set if it is not already 1388 * present. If this set already contains the multicharacter, 1389 * the call leaves this set unchanged. 1390 * Thus "ch" => {"ch"} 1391 * 1392 * @param s the source string 1393 * @return this object, for chaining 1394 */ add(CharSequence s)1395 public final UnicodeSet add(CharSequence s) { 1396 checkFrozen(); 1397 int cp = getSingleCP(s); 1398 if (cp < 0) { 1399 String str = s.toString(); 1400 if (!strings.contains(str)) { 1401 addString(str); 1402 pat = null; 1403 } 1404 } else { 1405 add_unchecked(cp, cp); 1406 } 1407 return this; 1408 } 1409 addString(CharSequence s)1410 private void addString(CharSequence s) { 1411 if (strings == EMPTY_STRINGS) { 1412 strings = new TreeSet<>(); 1413 } 1414 strings.add(s.toString()); 1415 } 1416 1417 /** 1418 * Utility for getting code point from single code point CharSequence. 1419 * See the public UTF16.getSingleCodePoint() (which returns -1 for null rather than throwing NPE). 1420 * 1421 * @return a code point IF the string consists of a single one. 1422 * otherwise returns -1. 1423 * @param s to test 1424 */ getSingleCP(CharSequence s)1425 private static int getSingleCP(CharSequence s) { 1426 if (s.length() == 1) return s.charAt(0); 1427 if (s.length() == 2) { 1428 int cp = Character.codePointAt(s, 0); 1429 if (cp > 0xFFFF) { // is surrogate pair 1430 return cp; 1431 } 1432 } 1433 return -1; 1434 } 1435 1436 /** 1437 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1438 * If this set already any particular character, it has no effect on that character. 1439 * @param s the source string 1440 * @return this object, for chaining 1441 */ addAll(CharSequence s)1442 public final UnicodeSet addAll(CharSequence s) { 1443 checkFrozen(); 1444 int cp; 1445 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1446 cp = UTF16.charAt(s, i); 1447 add_unchecked(cp, cp); 1448 } 1449 return this; 1450 } 1451 1452 /** 1453 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1454 * If this set already any particular character, it has no effect on that character. 1455 * @param s the source string 1456 * @return this object, for chaining 1457 */ retainAll(CharSequence s)1458 public final UnicodeSet retainAll(CharSequence s) { 1459 return retainAll(fromAll(s)); 1460 } 1461 1462 /** 1463 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1464 * If this set already any particular character, it has no effect on that character. 1465 * @param s the source string 1466 * @return this object, for chaining 1467 */ complementAll(CharSequence s)1468 public final UnicodeSet complementAll(CharSequence s) { 1469 return complementAll(fromAll(s)); 1470 } 1471 1472 /** 1473 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1474 * If this set already any particular character, it has no effect on that character. 1475 * @param s the source string 1476 * @return this object, for chaining 1477 */ removeAll(CharSequence s)1478 public final UnicodeSet removeAll(CharSequence s) { 1479 return removeAll(fromAll(s)); 1480 } 1481 1482 /** 1483 * Remove all strings from this UnicodeSet 1484 * @return this object, for chaining 1485 */ removeAllStrings()1486 public final UnicodeSet removeAllStrings() { 1487 checkFrozen(); 1488 if (hasStrings()) { 1489 strings.clear(); 1490 pat = null; 1491 } 1492 return this; 1493 } 1494 1495 /** 1496 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1497 * 1498 * @param s the source string 1499 * @return a newly created set containing the given string 1500 */ from(CharSequence s)1501 public static UnicodeSet from(CharSequence s) { 1502 return new UnicodeSet().add(s); 1503 } 1504 1505 1506 /** 1507 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1508 * @param s the source string 1509 * @return a newly created set containing the given characters 1510 */ fromAll(CharSequence s)1511 public static UnicodeSet fromAll(CharSequence s) { 1512 return new UnicodeSet().addAll(s); 1513 } 1514 1515 1516 /** 1517 * Retain only the elements in this set that are contained in the 1518 * specified range. If <code>start > end</code> then an empty range is 1519 * retained, leaving the set empty. 1520 * 1521 * @param start first character, inclusive, of range 1522 * @param end last character, inclusive, of range 1523 */ retain(int start, int end)1524 public UnicodeSet retain(int start, int end) { 1525 checkFrozen(); 1526 if (start < MIN_VALUE || start > MAX_VALUE) { 1527 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1528 } 1529 if (end < MIN_VALUE || end > MAX_VALUE) { 1530 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1531 } 1532 if (start <= end) { 1533 retain(range(start, end), 2, 0); 1534 } else { 1535 clear(); 1536 } 1537 return this; 1538 } 1539 1540 /** 1541 * Retain the specified character from this set if it is present. 1542 * Upon return this set will be empty if it did not contain c, or 1543 * will only contain c if it did contain c. 1544 * @param c the character to be retained 1545 * @return this object, for chaining 1546 */ retain(int c)1547 public final UnicodeSet retain(int c) { 1548 return retain(c, c); 1549 } 1550 1551 /** 1552 * Retain the specified string in this set if it is present. 1553 * Upon return this set will be empty if it did not contain s, or 1554 * will only contain s if it did contain s. 1555 * @param cs the string to be retained 1556 * @return this object, for chaining 1557 */ retain(CharSequence cs)1558 public final UnicodeSet retain(CharSequence cs) { 1559 int cp = getSingleCP(cs); 1560 if (cp < 0) { 1561 checkFrozen(); 1562 String s = cs.toString(); 1563 boolean isIn = strings.contains(s); 1564 // Check for getRangeCount() first to avoid somewhat-expensive size() 1565 // when there are single code points. 1566 if (isIn && getRangeCount() == 0 && size() == 1) { 1567 return this; 1568 } 1569 clear(); 1570 if (isIn) { 1571 addString(s); 1572 } 1573 pat = null; 1574 } else { 1575 retain(cp, cp); 1576 } 1577 return this; 1578 } 1579 1580 /** 1581 * Removes the specified range from this set if it is present. 1582 * The set will not contain the specified range once the call 1583 * returns. If <code>start > end</code> then an empty range is 1584 * removed, leaving the set unchanged. 1585 * 1586 * @param start first character, inclusive, of range to be removed 1587 * from this set. 1588 * @param end last character, inclusive, of range to be removed 1589 * from this set. 1590 */ remove(int start, int end)1591 public UnicodeSet remove(int start, int end) { 1592 checkFrozen(); 1593 if (start < MIN_VALUE || start > MAX_VALUE) { 1594 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1595 } 1596 if (end < MIN_VALUE || end > MAX_VALUE) { 1597 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1598 } 1599 if (start <= end) { 1600 retain(range(start, end), 2, 2); 1601 } 1602 return this; 1603 } 1604 1605 /** 1606 * Removes the specified character from this set if it is present. 1607 * The set will not contain the specified character once the call 1608 * returns. 1609 * @param c the character to be removed 1610 * @return this object, for chaining 1611 */ remove(int c)1612 public final UnicodeSet remove(int c) { 1613 return remove(c, c); 1614 } 1615 1616 /** 1617 * Removes the specified string from this set if it is present. 1618 * The set will not contain the specified string once the call 1619 * returns. 1620 * @param s the string to be removed 1621 * @return this object, for chaining 1622 */ remove(CharSequence s)1623 public final UnicodeSet remove(CharSequence s) { 1624 int cp = getSingleCP(s); 1625 if (cp < 0) { 1626 checkFrozen(); 1627 String str = s.toString(); 1628 if (strings.contains(str)) { 1629 strings.remove(str); 1630 pat = null; 1631 } 1632 } else { 1633 remove(cp, cp); 1634 } 1635 return this; 1636 } 1637 1638 /** 1639 * Complements the specified range in this set. Any character in 1640 * the range will be removed if it is in this set, or will be 1641 * added if it is not in this set. If <code>start > end</code> 1642 * then an empty range is complemented, leaving the set unchanged. 1643 * 1644 * @param start first character, inclusive, of range 1645 * @param end last character, inclusive, of range 1646 */ complement(int start, int end)1647 public UnicodeSet complement(int start, int end) { 1648 checkFrozen(); 1649 if (start < MIN_VALUE || start > MAX_VALUE) { 1650 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1651 } 1652 if (end < MIN_VALUE || end > MAX_VALUE) { 1653 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1654 } 1655 if (start <= end) { 1656 xor(range(start, end), 2, 0); 1657 } 1658 pat = null; 1659 return this; 1660 } 1661 1662 /** 1663 * Complements the specified character in this set. The character 1664 * will be removed if it is in this set, or will be added if it is 1665 * not in this set. 1666 */ complement(int c)1667 public final UnicodeSet complement(int c) { 1668 return complement(c, c); 1669 } 1670 1671 /** 1672 * This is equivalent to 1673 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1674 * 1675 * <p><strong>Note:</strong> This performs a symmetric difference with all code points 1676 * <em>and thus retains all multicharacter strings</em>. 1677 * In order to achieve a “code point complement” (all code points minus this set), 1678 * the easiest is to .{@link #complement()}.{@link #removeAllStrings()} . 1679 */ complement()1680 public UnicodeSet complement() { 1681 checkFrozen(); 1682 if (list[0] == LOW) { 1683 System.arraycopy(list, 1, list, 0, len-1); 1684 --len; 1685 } else { 1686 ensureCapacity(len+1); 1687 System.arraycopy(list, 0, list, 1, len); 1688 list[0] = LOW; 1689 ++len; 1690 } 1691 pat = null; 1692 return this; 1693 } 1694 1695 /** 1696 * Complement the specified string in this set. 1697 * The set will not contain the specified string once the call 1698 * returns. 1699 * 1700 * @param s the string to complement 1701 * @return this object, for chaining 1702 */ complement(CharSequence s)1703 public final UnicodeSet complement(CharSequence s) { 1704 checkFrozen(); 1705 int cp = getSingleCP(s); 1706 if (cp < 0) { 1707 String s2 = s.toString(); 1708 if (strings.contains(s2)) { 1709 strings.remove(s2); 1710 } else { 1711 addString(s2); 1712 } 1713 pat = null; 1714 } else { 1715 complement(cp, cp); 1716 } 1717 return this; 1718 } 1719 1720 /** 1721 * Returns true if this set contains the given character. 1722 * @param c character to be checked for containment 1723 * @return true if the test condition is met 1724 */ 1725 @Override contains(int c)1726 public boolean contains(int c) { 1727 if (c < MIN_VALUE || c > MAX_VALUE) { 1728 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1729 } 1730 if (bmpSet != null) { 1731 return bmpSet.contains(c); 1732 } 1733 if (stringSpan != null) { 1734 return stringSpan.contains(c); 1735 } 1736 1737 /* 1738 // Set i to the index of the start item greater than ch 1739 // We know we will terminate without length test! 1740 int i = -1; 1741 while (true) { 1742 if (c < list[++i]) break; 1743 } 1744 */ 1745 1746 int i = findCodePoint(c); 1747 1748 return ((i & 1) != 0); // return true if odd 1749 } 1750 1751 /** 1752 * Returns the smallest value i such that c < list[i]. Caller 1753 * must ensure that c is a legal value or this method will enter 1754 * an infinite loop. This method performs a binary search. 1755 * @param c a character in the range MIN_VALUE..MAX_VALUE 1756 * inclusive 1757 * @return the smallest integer i in the range 0..len-1, 1758 * inclusive, such that c < list[i] 1759 */ findCodePoint(int c)1760 private final int findCodePoint(int c) { 1761 /* Examples: 1762 findCodePoint(c) 1763 set list[] c=0 1 3 4 7 8 1764 === ============== =========== 1765 [] [110000] 0 0 0 0 0 0 1766 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 1767 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 1768 [:all:] [0, 110000] 1 1 1 1 1 1 1769 */ 1770 1771 // Return the smallest i such that c < list[i]. Assume 1772 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 1773 if (c < list[0]) return 0; 1774 // High runner test. c is often after the last range, so an 1775 // initial check for this condition pays off. 1776 if (len >= 2 && c >= list[len-2]) return len-1; 1777 int lo = 0; 1778 int hi = len - 1; 1779 // invariant: c >= list[lo] 1780 // invariant: c < list[hi] 1781 for (;;) { 1782 int i = (lo + hi) >>> 1; 1783 if (i == lo) return hi; 1784 if (c < list[i]) { 1785 hi = i; 1786 } else { 1787 lo = i; 1788 } 1789 } 1790 } 1791 1792 // //---------------------------------------------------------------- 1793 // // Unrolled binary search 1794 // //---------------------------------------------------------------- 1795 // 1796 // private int validLen = -1; // validated value of len 1797 // private int topOfLow; 1798 // private int topOfHigh; 1799 // private int power; 1800 // private int deltaStart; 1801 // 1802 // private void validate() { 1803 // if (len <= 1) { 1804 // throw new IllegalArgumentException("list.len==" + len + "; must be >1"); 1805 // } 1806 // 1807 // // find greatest power of 2 less than or equal to len 1808 // for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {} 1809 // 1810 // // assert(exp2[power] <= len); 1811 // 1812 // // determine the starting points 1813 // topOfLow = exp2[power] - 1; 1814 // topOfHigh = len - 1; 1815 // deltaStart = exp2[power-1]; 1816 // validLen = len; 1817 // } 1818 // 1819 // private static final int exp2[] = { 1820 // 0x1, 0x2, 0x4, 0x8, 1821 // 0x10, 0x20, 0x40, 0x80, 1822 // 0x100, 0x200, 0x400, 0x800, 1823 // 0x1000, 0x2000, 0x4000, 0x8000, 1824 // 0x10000, 0x20000, 0x40000, 0x80000, 1825 // 0x100000, 0x200000, 0x400000, 0x800000, 1826 // 0x1000000, 0x2000000, 0x4000000, 0x8000000, 1827 // 0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java 1828 // }; 1829 // 1830 // /** 1831 // * Unrolled lowest index GT. 1832 // */ 1833 // private final int leastIndexGT(int searchValue) { 1834 // 1835 // if (len != validLen) { 1836 // if (len == 1) return 0; 1837 // validate(); 1838 // } 1839 // int temp; 1840 // 1841 // // set up initial range to search. Each subrange is a power of two in length 1842 // int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh; 1843 // 1844 // // Completely unrolled binary search, folhighing "Programming Pearls" 1845 // // Each case deliberately falls through to the next 1846 // // Logically, list[-1] < all_search_values && list[count] > all_search_values 1847 // // although the values -1 and count are never actually touched. 1848 // 1849 // // The bounds at each point are low & high, 1850 // // where low == high - delta*2 1851 // // so high - delta is the midpoint 1852 // 1853 // // The invariant AFTER each line is that list[low] < searchValue <= list[high] 1854 // 1855 // switch (power) { 1856 // //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java 1857 // case 30: if (searchValue < list[temp = high-0x20000000]) high = temp; 1858 // case 29: if (searchValue < list[temp = high-0x10000000]) high = temp; 1859 // 1860 // case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp; 1861 // case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp; 1862 // case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp; 1863 // case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp; 1864 // 1865 // case 24: if (searchValue < list[temp = high- 0x800000]) high = temp; 1866 // case 23: if (searchValue < list[temp = high- 0x400000]) high = temp; 1867 // case 22: if (searchValue < list[temp = high- 0x200000]) high = temp; 1868 // case 21: if (searchValue < list[temp = high- 0x100000]) high = temp; 1869 // 1870 // case 20: if (searchValue < list[temp = high- 0x80000]) high = temp; 1871 // case 19: if (searchValue < list[temp = high- 0x40000]) high = temp; 1872 // case 18: if (searchValue < list[temp = high- 0x20000]) high = temp; 1873 // case 17: if (searchValue < list[temp = high- 0x10000]) high = temp; 1874 // 1875 // case 16: if (searchValue < list[temp = high- 0x8000]) high = temp; 1876 // case 15: if (searchValue < list[temp = high- 0x4000]) high = temp; 1877 // case 14: if (searchValue < list[temp = high- 0x2000]) high = temp; 1878 // case 13: if (searchValue < list[temp = high- 0x1000]) high = temp; 1879 // 1880 // case 12: if (searchValue < list[temp = high- 0x800]) high = temp; 1881 // case 11: if (searchValue < list[temp = high- 0x400]) high = temp; 1882 // case 10: if (searchValue < list[temp = high- 0x200]) high = temp; 1883 // case 9: if (searchValue < list[temp = high- 0x100]) high = temp; 1884 // 1885 // case 8: if (searchValue < list[temp = high- 0x80]) high = temp; 1886 // case 7: if (searchValue < list[temp = high- 0x40]) high = temp; 1887 // case 6: if (searchValue < list[temp = high- 0x20]) high = temp; 1888 // case 5: if (searchValue < list[temp = high- 0x10]) high = temp; 1889 // 1890 // case 4: if (searchValue < list[temp = high- 0x8]) high = temp; 1891 // case 3: if (searchValue < list[temp = high- 0x4]) high = temp; 1892 // case 2: if (searchValue < list[temp = high- 0x2]) high = temp; 1893 // case 1: if (searchValue < list[temp = high- 0x1]) high = temp; 1894 // } 1895 // 1896 // return high; 1897 // } 1898 // 1899 // // For debugging only 1900 // public int len() { 1901 // return len; 1902 // } 1903 // 1904 // //---------------------------------------------------------------- 1905 // //---------------------------------------------------------------- 1906 1907 /** 1908 * Returns true if this set contains every character 1909 * of the given range. 1910 * @param start first character, inclusive, of the range 1911 * @param end last character, inclusive, of the range 1912 * @return true if the test condition is met 1913 */ contains(int start, int end)1914 public boolean contains(int start, int end) { 1915 if (start < MIN_VALUE || start > MAX_VALUE) { 1916 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1917 } 1918 if (end < MIN_VALUE || end > MAX_VALUE) { 1919 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1920 } 1921 //int i = -1; 1922 //while (true) { 1923 // if (start < list[++i]) break; 1924 //} 1925 int i = findCodePoint(start); 1926 return ((i & 1) != 0 && end < list[i]); 1927 } 1928 1929 /** 1930 * Returns <tt>true</tt> if this set contains the given 1931 * multicharacter string. 1932 * @param s string to be checked for containment 1933 * @return <tt>true</tt> if this set contains the specified string 1934 */ contains(CharSequence s)1935 public final boolean contains(CharSequence s) { 1936 1937 int cp = getSingleCP(s); 1938 if (cp < 0) { 1939 return strings.contains(s.toString()); 1940 } else { 1941 return contains(cp); 1942 } 1943 } 1944 1945 /** 1946 * Returns true if this set contains all the characters and strings 1947 * of the given set. 1948 * @param b set to be checked for containment 1949 * @return true if the test condition is met 1950 */ containsAll(UnicodeSet b)1951 public boolean containsAll(UnicodeSet b) { 1952 // The specified set is a subset if all of its pairs are contained in 1953 // this set. This implementation accesses the lists directly for speed. 1954 // TODO: this could be faster if size() were cached. But that would affect building speed 1955 // so it needs investigation. 1956 int[] listB = b.list; 1957 boolean needA = true; 1958 boolean needB = true; 1959 int aPtr = 0; 1960 int bPtr = 0; 1961 int aLen = len - 1; 1962 int bLen = b.len - 1; 1963 int startA = 0, startB = 0, limitA = 0, limitB = 0; 1964 while (true) { 1965 // double iterations are such a pain... 1966 if (needA) { 1967 if (aPtr >= aLen) { 1968 // ran out of A. If B is also exhausted, then break; 1969 if (needB && bPtr >= bLen) { 1970 break; 1971 } 1972 return false; 1973 } 1974 startA = list[aPtr++]; 1975 limitA = list[aPtr++]; 1976 } 1977 if (needB) { 1978 if (bPtr >= bLen) { 1979 // ran out of B. Since we got this far, we have an A and we are ok so far 1980 break; 1981 } 1982 startB = listB[bPtr++]; 1983 limitB = listB[bPtr++]; 1984 } 1985 // if B doesn't overlap and is greater than A, get new A 1986 if (startB >= limitA) { 1987 needA = true; 1988 needB = false; 1989 continue; 1990 } 1991 // if B is wholy contained in A, then get a new B 1992 if (startB >= startA && limitB <= limitA) { 1993 needA = false; 1994 needB = true; 1995 continue; 1996 } 1997 // all other combinations mean we fail 1998 return false; 1999 } 2000 2001 if (!strings.containsAll(b.strings)) return false; 2002 return true; 2003 } 2004 2005 // /** 2006 // * Returns true if this set contains all the characters and strings 2007 // * of the given set. 2008 // * @param c set to be checked for containment 2009 // * @return true if the test condition is met 2010 // * @stable ICU 2.0 2011 // */ 2012 // public boolean containsAllOld(UnicodeSet c) { 2013 // // The specified set is a subset if all of its pairs are contained in 2014 // // this set. It's possible to code this more efficiently in terms of 2015 // // direct manipulation of the inversion lists if the need arises. 2016 // int n = c.getRangeCount(); 2017 // for (int i=0; i<n; ++i) { 2018 // if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 2019 // return false; 2020 // } 2021 // } 2022 // if (!strings.containsAll(c.strings)) return false; 2023 // return true; 2024 // } 2025 2026 /** 2027 * Returns true if there is a partition of the string such that this set contains each of the partitioned strings. 2028 * For example, for the Unicode set [a{bc}{cd}]<br> 2029 * containsAll is true for each of: "a", "bc", ""cdbca"<br> 2030 * containsAll is false for each of: "acb", "bcda", "bcx"<br> 2031 * @param s string containing characters to be checked for containment 2032 * @return true if the test condition is met 2033 */ containsAll(String s)2034 public boolean containsAll(String s) { 2035 int cp; 2036 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 2037 cp = UTF16.charAt(s, i); 2038 if (!contains(cp)) { 2039 if (!hasStrings()) { 2040 return false; 2041 } 2042 return containsAll(s, 0); 2043 } 2044 } 2045 return true; 2046 } 2047 2048 /** 2049 * Recursive routine called if we fail to find a match in containsAll, and there are strings 2050 * @param s source string 2051 * @param i point to match to the end on 2052 * @return true if ok 2053 */ containsAll(String s, int i)2054 private boolean containsAll(String s, int i) { 2055 if (i >= s.length()) { 2056 return true; 2057 } 2058 int cp= UTF16.charAt(s, i); 2059 if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) { 2060 return true; 2061 } 2062 for (String setStr : strings) { 2063 if (!setStr.isEmpty() && // skip the empty string 2064 s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) { 2065 return true; 2066 } 2067 } 2068 return false; 2069 2070 } 2071 2072 /** 2073 * Get the Regex equivalent for this UnicodeSet 2074 * @return regex pattern equivalent to this UnicodeSet 2075 * @deprecated This API is ICU internal only. 2076 * @hide original deprecated declaration 2077 * @hide draft / provisional / internal are hidden on Android 2078 */ 2079 @Deprecated getRegexEquivalent()2080 public String getRegexEquivalent() { 2081 if (!hasStrings()) { 2082 return toString(); 2083 } 2084 StringBuilder result = new StringBuilder("(?:"); 2085 appendNewPattern(result, true, false); 2086 for (String s : strings) { 2087 result.append('|'); 2088 _appendToPat(result, s, true); 2089 } 2090 return result.append(")").toString(); 2091 } 2092 2093 /** 2094 * Returns true if this set contains none of the characters 2095 * of the given range. 2096 * @param start first character, inclusive, of the range 2097 * @param end last character, inclusive, of the range 2098 * @return true if the test condition is met 2099 */ containsNone(int start, int end)2100 public boolean containsNone(int start, int end) { 2101 if (start < MIN_VALUE || start > MAX_VALUE) { 2102 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 2103 } 2104 if (end < MIN_VALUE || end > MAX_VALUE) { 2105 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 2106 } 2107 int i = -1; 2108 while (true) { 2109 if (start < list[++i]) break; 2110 } 2111 return ((i & 1) == 0 && end < list[i]); 2112 } 2113 2114 /** 2115 * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2116 * For example, for the Unicode set [a{bc}{cd}]<br> 2117 * containsNone is true for: "xy", "cb"<br> 2118 * containsNone is false for: "a", "bc", "bcd"<br> 2119 * @param b set to be checked for containment 2120 * @return true if the test condition is met 2121 */ containsNone(UnicodeSet b)2122 public boolean containsNone(UnicodeSet b) { 2123 // The specified set is a subset if some of its pairs overlap with some of this set's pairs. 2124 // This implementation accesses the lists directly for speed. 2125 int[] listB = b.list; 2126 boolean needA = true; 2127 boolean needB = true; 2128 int aPtr = 0; 2129 int bPtr = 0; 2130 int aLen = len - 1; 2131 int bLen = b.len - 1; 2132 int startA = 0, startB = 0, limitA = 0, limitB = 0; 2133 while (true) { 2134 // double iterations are such a pain... 2135 if (needA) { 2136 if (aPtr >= aLen) { 2137 // ran out of A: break so we test strings 2138 break; 2139 } 2140 startA = list[aPtr++]; 2141 limitA = list[aPtr++]; 2142 } 2143 if (needB) { 2144 if (bPtr >= bLen) { 2145 // ran out of B: break so we test strings 2146 break; 2147 } 2148 startB = listB[bPtr++]; 2149 limitB = listB[bPtr++]; 2150 } 2151 // if B is higher than any part of A, get new A 2152 if (startB >= limitA) { 2153 needA = true; 2154 needB = false; 2155 continue; 2156 } 2157 // if A is higher than any part of B, get new B 2158 if (startA >= limitB) { 2159 needA = false; 2160 needB = true; 2161 continue; 2162 } 2163 // all other combinations mean we fail 2164 return false; 2165 } 2166 2167 if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false; 2168 return true; 2169 } 2170 2171 // /** 2172 // * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2173 // * For example, for the Unicode set [a{bc}{cd}]<br> 2174 // * containsNone is true for: "xy", "cb"<br> 2175 // * containsNone is false for: "a", "bc", "bcd"<br> 2176 // * @param c set to be checked for containment 2177 // * @return true if the test condition is met 2178 // * @stable ICU 2.0 2179 // */ 2180 // public boolean containsNoneOld(UnicodeSet c) { 2181 // // The specified set is a subset if all of its pairs are contained in 2182 // // this set. It's possible to code this more efficiently in terms of 2183 // // direct manipulation of the inversion lists if the need arises. 2184 // int n = c.getRangeCount(); 2185 // for (int i=0; i<n; ++i) { 2186 // if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 2187 // return false; 2188 // } 2189 // } 2190 // if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false; 2191 // return true; 2192 // } 2193 2194 /** 2195 * Returns true if this set contains none of the characters 2196 * of the given string. 2197 * @param s string containing characters to be checked for containment 2198 * @return true if the test condition is met 2199 */ containsNone(CharSequence s)2200 public boolean containsNone(CharSequence s) { 2201 return span(s, SpanCondition.NOT_CONTAINED) == s.length(); 2202 } 2203 2204 /** 2205 * Returns true if this set contains one or more of the characters 2206 * in the given range. 2207 * @param start first character, inclusive, of the range 2208 * @param end last character, inclusive, of the range 2209 * @return true if the condition is met 2210 */ containsSome(int start, int end)2211 public final boolean containsSome(int start, int end) { 2212 return !containsNone(start, end); 2213 } 2214 2215 /** 2216 * Returns true if this set contains one or more of the characters 2217 * and strings of the given set. 2218 * @param s set to be checked for containment 2219 * @return true if the condition is met 2220 */ containsSome(UnicodeSet s)2221 public final boolean containsSome(UnicodeSet s) { 2222 return !containsNone(s); 2223 } 2224 2225 /** 2226 * Returns true if this set contains one or more of the characters 2227 * of the given string. 2228 * @param s string containing characters to be checked for containment 2229 * @return true if the condition is met 2230 */ containsSome(CharSequence s)2231 public final boolean containsSome(CharSequence s) { 2232 return !containsNone(s); 2233 } 2234 2235 2236 /** 2237 * Adds all of the elements in the specified set to this set if 2238 * they're not already present. This operation effectively 2239 * modifies this set so that its value is the <i>union</i> of the two 2240 * sets. The behavior of this operation is unspecified if the specified 2241 * collection is modified while the operation is in progress. 2242 * 2243 * @param c set whose elements are to be added to this set. 2244 */ addAll(UnicodeSet c)2245 public UnicodeSet addAll(UnicodeSet c) { 2246 checkFrozen(); 2247 add(c.list, c.len, 0); 2248 if (c.hasStrings()) { 2249 if (strings == EMPTY_STRINGS) { 2250 strings = new TreeSet<>(c.strings); 2251 } else { 2252 strings.addAll(c.strings); 2253 } 2254 } 2255 return this; 2256 } 2257 2258 /** 2259 * Retains only the elements in this set that are contained in the 2260 * specified set. In other words, removes from this set all of 2261 * its elements that are not contained in the specified set. This 2262 * operation effectively modifies this set so that its value is 2263 * the <i>intersection</i> of the two sets. 2264 * 2265 * @param c set that defines which elements this set will retain. 2266 */ retainAll(UnicodeSet c)2267 public UnicodeSet retainAll(UnicodeSet c) { 2268 checkFrozen(); 2269 retain(c.list, c.len, 0); 2270 if (hasStrings()) { 2271 if (!c.hasStrings()) { 2272 strings.clear(); 2273 } else { 2274 strings.retainAll(c.strings); 2275 } 2276 } 2277 return this; 2278 } 2279 2280 /** 2281 * Removes from this set all of its elements that are contained in the 2282 * specified set. This operation effectively modifies this 2283 * set so that its value is the <i>asymmetric set difference</i> of 2284 * the two sets. 2285 * 2286 * @param c set that defines which elements will be removed from 2287 * this set. 2288 */ removeAll(UnicodeSet c)2289 public UnicodeSet removeAll(UnicodeSet c) { 2290 checkFrozen(); 2291 retain(c.list, c.len, 2); 2292 if (hasStrings() && c.hasStrings()) { 2293 strings.removeAll(c.strings); 2294 } 2295 return this; 2296 } 2297 2298 /** 2299 * Complements in this set all elements contained in the specified 2300 * set. Any character in the other set will be removed if it is 2301 * in this set, or will be added if it is not in this set. 2302 * 2303 * @param c set that defines which elements will be complemented from 2304 * this set. 2305 */ complementAll(UnicodeSet c)2306 public UnicodeSet complementAll(UnicodeSet c) { 2307 checkFrozen(); 2308 xor(c.list, c.len, 0); 2309 if (c.hasStrings()) { 2310 if (strings == EMPTY_STRINGS) { 2311 strings = new TreeSet<>(c.strings); 2312 } else { 2313 SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings); 2314 } 2315 } 2316 return this; 2317 } 2318 2319 /** 2320 * Removes all of the elements from this set. This set will be 2321 * empty after this call returns. 2322 */ clear()2323 public UnicodeSet clear() { 2324 checkFrozen(); 2325 list[0] = HIGH; 2326 len = 1; 2327 pat = null; 2328 if (hasStrings()) { 2329 strings.clear(); 2330 } 2331 return this; 2332 } 2333 2334 /** 2335 * Iteration method that returns the number of ranges contained in 2336 * this set. 2337 * @see #getRangeStart 2338 * @see #getRangeEnd 2339 */ getRangeCount()2340 public int getRangeCount() { 2341 return len/2; 2342 } 2343 2344 /** 2345 * Iteration method that returns the first character in the 2346 * specified range of this set. 2347 * @exception ArrayIndexOutOfBoundsException if index is outside 2348 * the range <code>0..getRangeCount()-1</code> 2349 * @see #getRangeCount 2350 * @see #getRangeEnd 2351 */ getRangeStart(int index)2352 public int getRangeStart(int index) { 2353 return list[index*2]; 2354 } 2355 2356 /** 2357 * Iteration method that returns the last character in the 2358 * specified range of this set. 2359 * @exception ArrayIndexOutOfBoundsException if index is outside 2360 * the range <code>0..getRangeCount()-1</code> 2361 * @see #getRangeStart 2362 * @see #getRangeEnd 2363 */ getRangeEnd(int index)2364 public int getRangeEnd(int index) { 2365 return (list[index*2 + 1] - 1); 2366 } 2367 2368 /** 2369 * Reallocate this objects internal structures to take up the least 2370 * possible space, without changing this object's value. 2371 */ compact()2372 public UnicodeSet compact() { 2373 checkFrozen(); 2374 if ((len + 7) < list.length) { 2375 // If we have more than a little unused capacity, shrink it to len. 2376 list = Arrays.copyOf(list, len); 2377 } 2378 rangeList = null; 2379 buffer = null; 2380 if (strings != EMPTY_STRINGS && strings.isEmpty()) { 2381 strings = EMPTY_STRINGS; 2382 } 2383 return this; 2384 } 2385 2386 /** 2387 * Compares the specified object with this set for equality. Returns 2388 * <tt>true</tt> if the specified object is also a set, the two sets 2389 * have the same size, and every member of the specified set is 2390 * contained in this set (or equivalently, every member of this set is 2391 * contained in the specified set). 2392 * 2393 * @param o Object to be compared for equality with this set. 2394 * @return <tt>true</tt> if the specified Object is equal to this set. 2395 */ 2396 @Override equals(Object o)2397 public boolean equals(Object o) { 2398 if (o == null) { 2399 return false; 2400 } 2401 if (this == o) { 2402 return true; 2403 } 2404 try { 2405 UnicodeSet that = (UnicodeSet) o; 2406 if (len != that.len) return false; 2407 for (int i = 0; i < len; ++i) { 2408 if (list[i] != that.list[i]) return false; 2409 } 2410 if (!strings.equals(that.strings)) return false; 2411 } catch (Exception e) { 2412 return false; 2413 } 2414 return true; 2415 } 2416 2417 /** 2418 * Returns the hash code value for this set. 2419 * 2420 * @return the hash code value for this set. 2421 * @see java.lang.Object#hashCode() 2422 */ 2423 @Override hashCode()2424 public int hashCode() { 2425 int result = len; 2426 for (int i = 0; i < len; ++i) { 2427 result *= 1000003; 2428 result += list[i]; 2429 } 2430 return result; 2431 } 2432 2433 /** 2434 * Return a programmer-readable string representation of this object. 2435 */ 2436 @Override toString()2437 public String toString() { 2438 return toPattern(true); 2439 } 2440 2441 //---------------------------------------------------------------- 2442 // Implementation: Pattern parsing 2443 //---------------------------------------------------------------- 2444 2445 /** 2446 * Parses the given pattern, starting at the given position. The character 2447 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. 2448 * Parsing continues until the corresponding closing ']'. If a syntax error 2449 * is encountered between the opening and closing brace, the parse fails. 2450 * Upon return from a successful parse, the ParsePosition is updated to 2451 * point to the character following the closing ']', and an inversion 2452 * list for the parsed pattern is returned. This method 2453 * calls itself recursively to parse embedded subpatterns. 2454 * 2455 * @param pattern the string containing the pattern to be parsed. The 2456 * portion of the string from pos.getIndex(), which must be a '[', to the 2457 * corresponding closing ']', is parsed. 2458 * @param pos upon entry, the position at which to being parsing. The 2459 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return 2460 * from a successful parse, pos.getIndex() is either the character after the 2461 * closing ']' of the parsed pattern, or pattern.length() if the closing ']' 2462 * is the last character of the pattern string. 2463 * @return an inversion list for the parsed substring 2464 * of <code>pattern</code> 2465 * @exception java.lang.IllegalArgumentException if the parse fails. 2466 * @deprecated This API is ICU internal only. 2467 * @hide original deprecated declaration 2468 * @hide draft / provisional / internal are hidden on Android 2469 */ 2470 @Deprecated applyPattern(String pattern, ParsePosition pos, SymbolTable symbols, int options)2471 public UnicodeSet applyPattern(String pattern, 2472 ParsePosition pos, 2473 SymbolTable symbols, 2474 int options) { 2475 2476 // Need to build the pattern in a temporary string because 2477 // _applyPattern calls add() etc., which set pat to empty. 2478 boolean parsePositionWasNull = pos == null; 2479 if (parsePositionWasNull) { 2480 pos = new ParsePosition(0); 2481 } 2482 2483 StringBuilder rebuiltPat = new StringBuilder(); 2484 RuleCharacterIterator chars = 2485 new RuleCharacterIterator(pattern, symbols, pos); 2486 applyPattern(chars, symbols, rebuiltPat, options, 0); 2487 if (chars.inVariable()) { 2488 syntaxError(chars, "Extra chars in variable value"); 2489 } 2490 pat = rebuiltPat.toString(); 2491 if (parsePositionWasNull) { 2492 int i = pos.getIndex(); 2493 2494 // Skip over trailing whitespace 2495 if ((options & IGNORE_SPACE) != 0) { 2496 i = PatternProps.skipWhiteSpace(pattern, i); 2497 } 2498 2499 if (i != pattern.length()) { 2500 throw new IllegalArgumentException("Parse of \"" + pattern + 2501 "\" failed at " + i); 2502 } 2503 } 2504 return this; 2505 } 2506 2507 // Add constants to make the applyPattern() code easier to follow. 2508 2509 private static final int LAST0_START = 0, 2510 LAST1_RANGE = 1, 2511 LAST2_SET = 2; 2512 2513 private static final int MODE0_NONE = 0, 2514 MODE1_INBRACKET = 1, 2515 MODE2_OUTBRACKET = 2; 2516 2517 private static final int SETMODE0_NONE = 0, 2518 SETMODE1_UNICODESET = 1, 2519 SETMODE2_PROPERTYPAT = 2, 2520 SETMODE3_PREPARSED = 3; 2521 2522 private static final int MAX_DEPTH = 100; 2523 2524 /** 2525 * Parse the pattern from the given RuleCharacterIterator. The 2526 * iterator is advanced over the parsed pattern. 2527 * @param chars iterator over the pattern characters. Upon return 2528 * it will be advanced to the first character after the parsed 2529 * pattern, or the end of the iteration if all characters are 2530 * parsed. 2531 * @param symbols symbol table to use to parse and dereference 2532 * variables, or null if none. 2533 * @param rebuiltPat the pattern that was parsed, rebuilt or 2534 * copied from the input pattern, as appropriate. 2535 * @param options a bit mask. 2536 * Valid options are {@link #IGNORE_SPACE} and 2537 * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 2538 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 2539 */ applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options, int depth)2540 private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, 2541 Appendable rebuiltPat, int options, int depth) { 2542 if (depth > MAX_DEPTH) { 2543 syntaxError(chars, "Pattern nested too deeply"); 2544 } 2545 2546 // Syntax characters: [ ] ^ - & { } 2547 2548 // Recognized special forms for chars, sets: c-c s-s s&s 2549 2550 int opts = RuleCharacterIterator.PARSE_VARIABLES | 2551 RuleCharacterIterator.PARSE_ESCAPES; 2552 if ((options & IGNORE_SPACE) != 0) { 2553 opts |= RuleCharacterIterator.SKIP_WHITESPACE; 2554 } 2555 2556 StringBuilder patBuf = new StringBuilder(), buf = null; 2557 boolean usePat = false; 2558 UnicodeSet scratch = null; 2559 RuleCharacterIterator.Position backup = null; 2560 2561 // mode: 0=before [, 1=between [...], 2=after ] 2562 // lastItem: 0=none, 1=char, 2=set 2563 int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE; 2564 char op = 0; 2565 2566 boolean invert = false; 2567 2568 clear(); 2569 String lastString = null; 2570 2571 while (mode != MODE2_OUTBRACKET && !chars.atEnd()) { 2572 //Eclipse stated the following is "dead code" 2573 /* 2574 if (false) { 2575 // Debugging assertion 2576 if (!((lastItem == 0 && op == 0) || 2577 (lastItem == 1 && (op == 0 || op == '-')) || 2578 (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { 2579 throw new IllegalArgumentException(); 2580 } 2581 }*/ 2582 2583 int c = 0; 2584 boolean literal = false; 2585 UnicodeSet nested = null; 2586 2587 // -------- Check for property pattern 2588 2589 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 2590 int setMode = SETMODE0_NONE; 2591 if (resemblesPropertyPattern(chars, opts)) { 2592 setMode = SETMODE2_PROPERTYPAT; 2593 } 2594 2595 // -------- Parse '[' of opening delimiter OR nested set. 2596 // If there is a nested set, use `setMode' to define how 2597 // the set should be parsed. If the '[' is part of the 2598 // opening delimiter for this pattern, parse special 2599 // strings "[", "[^", "[-", and "[^-". Check for stand-in 2600 // characters representing a nested set in the symbol 2601 // table. 2602 2603 else { 2604 // Prepare to backup if necessary 2605 backup = chars.getPos(backup); 2606 c = chars.next(opts); 2607 literal = chars.isEscaped(); 2608 2609 if (c == '[' && !literal) { 2610 if (mode == MODE1_INBRACKET) { 2611 chars.setPos(backup); // backup 2612 setMode = SETMODE1_UNICODESET; 2613 } else { 2614 // Handle opening '[' delimiter 2615 mode = MODE1_INBRACKET; 2616 patBuf.append('['); 2617 backup = chars.getPos(backup); // prepare to backup 2618 c = chars.next(opts); 2619 literal = chars.isEscaped(); 2620 if (c == '^' && !literal) { 2621 invert = true; 2622 patBuf.append('^'); 2623 backup = chars.getPos(backup); // prepare to backup 2624 c = chars.next(opts); 2625 literal = chars.isEscaped(); 2626 } 2627 // Fall through to handle special leading '-'; 2628 // otherwise restart loop for nested [], \p{}, etc. 2629 if (c == '-') { 2630 literal = true; 2631 // Fall through to handle literal '-' below 2632 } else { 2633 chars.setPos(backup); // backup 2634 continue; 2635 } 2636 } 2637 } else if (symbols != null) { 2638 UnicodeMatcher m = symbols.lookupMatcher(c); // may be null 2639 if (m != null) { 2640 try { 2641 nested = (UnicodeSet) m; 2642 setMode = SETMODE3_PREPARSED; 2643 } catch (ClassCastException e) { 2644 syntaxError(chars, "Syntax error"); 2645 } 2646 } 2647 } 2648 } 2649 2650 // -------- Handle a nested set. This either is inline in 2651 // the pattern or represented by a stand-in that has 2652 // previously been parsed and was looked up in the symbol 2653 // table. 2654 2655 if (setMode != SETMODE0_NONE) { 2656 if (lastItem == LAST1_RANGE) { 2657 if (op != 0) { 2658 syntaxError(chars, "Char expected after operator"); 2659 } 2660 add_unchecked(lastChar, lastChar); 2661 _appendToPat(patBuf, lastChar, false); 2662 lastItem = LAST0_START; 2663 op = 0; 2664 } 2665 2666 if (op == '-' || op == '&') { 2667 patBuf.append(op); 2668 } 2669 2670 if (nested == null) { 2671 if (scratch == null) scratch = new UnicodeSet(); 2672 nested = scratch; 2673 } 2674 switch (setMode) { 2675 case SETMODE1_UNICODESET: 2676 nested.applyPattern(chars, symbols, patBuf, options, depth + 1); 2677 break; 2678 case SETMODE2_PROPERTYPAT: 2679 chars.skipIgnored(opts); 2680 nested.applyPropertyPattern(chars, patBuf, symbols); 2681 break; 2682 case SETMODE3_PREPARSED: // `nested' already parsed 2683 nested._toPattern(patBuf, false); 2684 break; 2685 } 2686 2687 usePat = true; 2688 2689 if (mode == MODE0_NONE) { 2690 // Entire pattern is a category; leave parse loop 2691 set(nested); 2692 mode = MODE2_OUTBRACKET; 2693 break; 2694 } 2695 2696 switch (op) { 2697 case '-': 2698 removeAll(nested); 2699 break; 2700 case '&': 2701 retainAll(nested); 2702 break; 2703 case 0: 2704 addAll(nested); 2705 break; 2706 } 2707 2708 op = 0; 2709 lastItem = LAST2_SET; 2710 2711 continue; 2712 } 2713 2714 if (mode == MODE0_NONE) { 2715 syntaxError(chars, "Missing '['"); 2716 } 2717 2718 // -------- Parse special (syntax) characters. If the 2719 // current character is not special, or if it is escaped, 2720 // then fall through and handle it below. 2721 2722 if (!literal) { 2723 switch (c) { 2724 case ']': 2725 if (lastItem == LAST1_RANGE) { 2726 add_unchecked(lastChar, lastChar); 2727 _appendToPat(patBuf, lastChar, false); 2728 } 2729 // Treat final trailing '-' as a literal 2730 if (op == '-') { 2731 add_unchecked(op, op); 2732 patBuf.append(op); 2733 } else if (op == '&') { 2734 syntaxError(chars, "Trailing '&'"); 2735 } 2736 patBuf.append(']'); 2737 mode = MODE2_OUTBRACKET; 2738 continue; 2739 case '-': 2740 if (op == 0) { 2741 if (lastItem != LAST0_START) { 2742 op = (char) c; 2743 continue; 2744 } else if (lastString != null) { 2745 op = (char) c; 2746 continue; 2747 } else { 2748 // Treat final trailing '-' as a literal 2749 add_unchecked(c, c); 2750 c = chars.next(opts); 2751 literal = chars.isEscaped(); 2752 if (c == ']' && !literal) { 2753 patBuf.append("-]"); 2754 mode = MODE2_OUTBRACKET; 2755 continue; 2756 } 2757 } 2758 } 2759 syntaxError(chars, "'-' not after char, string, or set"); 2760 break; 2761 case '&': 2762 if (lastItem == LAST2_SET && op == 0) { 2763 op = (char) c; 2764 continue; 2765 } 2766 syntaxError(chars, "'&' not after set"); 2767 break; 2768 case '^': 2769 syntaxError(chars, "'^' not after '['"); 2770 break; 2771 case '{': 2772 if (op != 0 && op != '-') { 2773 syntaxError(chars, "Missing operand after operator"); 2774 } 2775 if (lastItem == LAST1_RANGE) { 2776 add_unchecked(lastChar, lastChar); 2777 _appendToPat(patBuf, lastChar, false); 2778 } 2779 lastItem = LAST0_START; 2780 if (buf == null) { 2781 buf = new StringBuilder(); 2782 } else { 2783 buf.setLength(0); 2784 } 2785 boolean ok = false; 2786 while (!chars.atEnd()) { 2787 c = chars.next(opts); 2788 literal = chars.isEscaped(); 2789 if (c == '}' && !literal) { 2790 ok = true; 2791 break; 2792 } 2793 appendCodePoint(buf, c); 2794 } 2795 if (!ok) { 2796 syntaxError(chars, "Invalid multicharacter string"); 2797 } 2798 // We have new string. Add it to set and continue; 2799 // we don't need to drop through to the further 2800 // processing 2801 String curString = buf.toString(); 2802 if (op == '-') { 2803 int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString); 2804 int curSingle = CharSequences.getSingleCodePoint(curString); 2805 if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) { 2806 add(lastSingle,curSingle); 2807 } else { 2808 if (strings == EMPTY_STRINGS) { 2809 strings = new TreeSet<>(); 2810 } 2811 try { 2812 StringRange.expand(lastString, curString, true, strings); 2813 } catch (Exception e) { 2814 syntaxError(chars, e.getMessage()); 2815 } 2816 } 2817 lastString = null; 2818 op = 0; 2819 } else { 2820 add(curString); 2821 lastString = curString; 2822 } 2823 patBuf.append('{'); 2824 _appendToPat(patBuf, curString, false); 2825 patBuf.append('}'); 2826 continue; 2827 case SymbolTable.SYMBOL_REF: 2828 // symbols nosymbols 2829 // [a-$] error error (ambiguous) 2830 // [a$] anchor anchor 2831 // [a-$x] var "x"* literal '$' 2832 // [a-$.] error literal '$' 2833 // *We won't get here in the case of var "x" 2834 backup = chars.getPos(backup); 2835 c = chars.next(opts); 2836 literal = chars.isEscaped(); 2837 boolean anchor = (c == ']' && !literal); 2838 if (symbols == null && !anchor) { 2839 c = SymbolTable.SYMBOL_REF; 2840 chars.setPos(backup); 2841 break; // literal '$' 2842 } 2843 if (anchor && op == 0) { 2844 if (lastItem == LAST1_RANGE) { 2845 add_unchecked(lastChar, lastChar); 2846 _appendToPat(patBuf, lastChar, false); 2847 } 2848 add_unchecked(UnicodeMatcher.ETHER); 2849 usePat = true; 2850 patBuf.append(SymbolTable.SYMBOL_REF).append(']'); 2851 mode = MODE2_OUTBRACKET; 2852 continue; 2853 } 2854 syntaxError(chars, "Unquoted '$'"); 2855 break; 2856 default: 2857 break; 2858 } 2859 } 2860 2861 // -------- Parse literal characters. This includes both 2862 // escaped chars ("\u4E01") and non-syntax characters 2863 // ("a"). 2864 2865 switch (lastItem) { 2866 case LAST0_START: 2867 if (op == '-' && lastString != null) { 2868 syntaxError(chars, "Invalid range"); 2869 } 2870 lastItem = LAST1_RANGE; 2871 lastChar = c; 2872 lastString = null; 2873 break; 2874 case LAST1_RANGE: 2875 if (op == '-') { 2876 if (lastString != null) { 2877 syntaxError(chars, "Invalid range"); 2878 } 2879 if (lastChar >= c) { 2880 // Don't allow redundant (a-a) or empty (b-a) ranges; 2881 // these are most likely typos. 2882 syntaxError(chars, "Invalid range"); 2883 } 2884 add_unchecked(lastChar, c); 2885 _appendToPat(patBuf, lastChar, false); 2886 patBuf.append(op); 2887 _appendToPat(patBuf, c, false); 2888 lastItem = LAST0_START; 2889 op = 0; 2890 } else { 2891 add_unchecked(lastChar, lastChar); 2892 _appendToPat(patBuf, lastChar, false); 2893 lastChar = c; 2894 } 2895 break; 2896 case LAST2_SET: 2897 if (op != 0) { 2898 syntaxError(chars, "Set expected after operator"); 2899 } 2900 lastChar = c; 2901 lastItem = LAST1_RANGE; 2902 break; 2903 } 2904 } 2905 2906 if (mode != MODE2_OUTBRACKET) { 2907 syntaxError(chars, "Missing ']'"); 2908 } 2909 2910 chars.skipIgnored(opts); 2911 2912 /** 2913 * Handle global flags (invert, case insensitivity). If this 2914 * pattern should be compiled case-insensitive, then we need 2915 * to close over case BEFORE COMPLEMENTING. This makes 2916 * patterns like /[^abc]/i work. 2917 */ 2918 if ((options & CASE_MASK) != 0) { 2919 closeOver(options); 2920 } 2921 if (invert) { 2922 complement().removeAllStrings(); // code point complement 2923 } 2924 2925 // Use the rebuilt pattern (pat) only if necessary. Prefer the 2926 // generated pattern. 2927 if (usePat) { 2928 append(rebuiltPat, patBuf.toString()); 2929 } else { 2930 appendNewPattern(rebuiltPat, false, true); 2931 } 2932 } 2933 syntaxError(RuleCharacterIterator chars, String msg)2934 private static void syntaxError(RuleCharacterIterator chars, String msg) { 2935 throw new IllegalArgumentException("Error: " + msg + " at \"" + 2936 Utility.escape(chars.toString()) + 2937 '"'); 2938 } 2939 2940 /** 2941 * Add the contents of the UnicodeSet (as strings) into a collection. 2942 * @param target collection to add into 2943 */ addAllTo(T target)2944 public <T extends Collection<String>> T addAllTo(T target) { 2945 return addAllTo(this, target); 2946 } 2947 2948 2949 /** 2950 * Add the contents of the UnicodeSet (as strings) into a collection. 2951 * @param target collection to add into 2952 * @hide unsupported on Android 2953 */ addAllTo(String[] target)2954 public String[] addAllTo(String[] target) { 2955 return addAllTo(this, target); 2956 } 2957 2958 /** 2959 * Add the contents of the UnicodeSet (as strings) into an array. 2960 * @hide unsupported on Android 2961 */ toArray(UnicodeSet set)2962 public static String[] toArray(UnicodeSet set) { 2963 return addAllTo(set, new String[set.size()]); 2964 } 2965 2966 /** 2967 * Add the contents of the collection (as strings) into this UnicodeSet. 2968 * The collection must not contain null. 2969 * @param source the collection to add 2970 * @return a reference to this object 2971 */ add(Iterable<?> source)2972 public UnicodeSet add(Iterable<?> source) { 2973 return addAll(source); 2974 } 2975 2976 /** 2977 * Add a collection (as strings) into this UnicodeSet. 2978 * Uses standard naming convention. 2979 * @param source collection to add into 2980 * @return a reference to this object 2981 */ addAll(Iterable<?> source)2982 public UnicodeSet addAll(Iterable<?> source) { 2983 checkFrozen(); 2984 for (Object o : source) { 2985 add(o.toString()); 2986 } 2987 return this; 2988 } 2989 2990 //---------------------------------------------------------------- 2991 // Implementation: Utility methods 2992 //---------------------------------------------------------------- 2993 nextCapacity(int minCapacity)2994 private int nextCapacity(int minCapacity) { 2995 // Grow exponentially to reduce the frequency of allocations. 2996 if (minCapacity < INITIAL_CAPACITY) { 2997 return minCapacity + INITIAL_CAPACITY; 2998 } else if (minCapacity <= 2500) { 2999 return 5 * minCapacity; 3000 } else { 3001 int newCapacity = 2 * minCapacity; 3002 if (newCapacity > MAX_LENGTH) { 3003 newCapacity = MAX_LENGTH; 3004 } 3005 return newCapacity; 3006 } 3007 } 3008 ensureCapacity(int newLen)3009 private void ensureCapacity(int newLen) { 3010 if (newLen > MAX_LENGTH) { 3011 newLen = MAX_LENGTH; 3012 } 3013 if (newLen <= list.length) return; 3014 int newCapacity = nextCapacity(newLen); 3015 int[] temp = new int[newCapacity]; 3016 // Copy only the actual contents. 3017 System.arraycopy(list, 0, temp, 0, len); 3018 list = temp; 3019 } 3020 ensureBufferCapacity(int newLen)3021 private void ensureBufferCapacity(int newLen) { 3022 if (newLen > MAX_LENGTH) { 3023 newLen = MAX_LENGTH; 3024 } 3025 if (buffer != null && newLen <= buffer.length) return; 3026 int newCapacity = nextCapacity(newLen); 3027 buffer = new int[newCapacity]; 3028 // The buffer has no contents to be copied. 3029 // It is always filled from scratch after this call. 3030 } 3031 3032 /** 3033 * Assumes start <= end. 3034 */ range(int start, int end)3035 private int[] range(int start, int end) { 3036 if (rangeList == null) { 3037 rangeList = new int[] { start, end+1, HIGH }; 3038 } else { 3039 rangeList[0] = start; 3040 rangeList[1] = end+1; 3041 } 3042 return rangeList; 3043 } 3044 3045 //---------------------------------------------------------------- 3046 // Implementation: Fundamental operations 3047 //---------------------------------------------------------------- 3048 3049 // polarity = 0, 3 is normal: x xor y 3050 // polarity = 1, 2: x xor ~y == x === y 3051 xor(int[] other, int otherLen, int polarity)3052 private UnicodeSet xor(int[] other, int otherLen, int polarity) { 3053 ensureBufferCapacity(len + otherLen); 3054 int i = 0, j = 0, k = 0; 3055 int a = list[i++]; 3056 int b; 3057 // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used 3058 // so the following if statement will not be called. 3059 ///CLOVER:OFF 3060 if (polarity == 1 || polarity == 2) { 3061 b = LOW; 3062 if (other[j] == LOW) { // skip base if already LOW 3063 ++j; 3064 b = other[j]; 3065 } 3066 ///CLOVER:ON 3067 } else { 3068 b = other[j++]; 3069 } 3070 // simplest of all the routines 3071 // sort the values, discarding identicals! 3072 while (true) { 3073 if (a < b) { 3074 buffer[k++] = a; 3075 a = list[i++]; 3076 } else if (b < a) { 3077 buffer[k++] = b; 3078 b = other[j++]; 3079 } else if (a != HIGH) { // at this point, a == b 3080 // discard both values! 3081 a = list[i++]; 3082 b = other[j++]; 3083 } else { // DONE! 3084 buffer[k++] = HIGH; 3085 len = k; 3086 break; 3087 } 3088 } 3089 // swap list and buffer 3090 int[] temp = list; 3091 list = buffer; 3092 buffer = temp; 3093 pat = null; 3094 return this; 3095 } 3096 3097 // polarity = 0 is normal: x union y 3098 // polarity = 2: x union ~y 3099 // polarity = 1: ~x union y 3100 // polarity = 3: ~x union ~y 3101 add(int[] other, int otherLen, int polarity)3102 private UnicodeSet add(int[] other, int otherLen, int polarity) { 3103 ensureBufferCapacity(len + otherLen); 3104 int i = 0, j = 0, k = 0; 3105 int a = list[i++]; 3106 int b = other[j++]; 3107 // change from xor is that we have to check overlapping pairs 3108 // polarity bit 1 means a is second, bit 2 means b is. 3109 main: 3110 while (true) { 3111 switch (polarity) { 3112 case 0: // both first; take lower if unequal 3113 if (a < b) { // take a 3114 // Back up over overlapping ranges in buffer[] 3115 if (k > 0 && a <= buffer[k-1]) { 3116 // Pick latter end value in buffer[] vs. list[] 3117 a = max(list[i], buffer[--k]); 3118 } else { 3119 // No overlap 3120 buffer[k++] = a; 3121 a = list[i]; 3122 } 3123 i++; // Common if/else code factored out 3124 polarity ^= 1; 3125 } else if (b < a) { // take b 3126 if (k > 0 && b <= buffer[k-1]) { 3127 b = max(other[j], buffer[--k]); 3128 } else { 3129 buffer[k++] = b; 3130 b = other[j]; 3131 } 3132 j++; 3133 polarity ^= 2; 3134 } else { // a == b, take a, drop b 3135 if (a == HIGH) break main; 3136 // This is symmetrical; it doesn't matter if 3137 // we backtrack with a or b. - liu 3138 if (k > 0 && a <= buffer[k-1]) { 3139 a = max(list[i], buffer[--k]); 3140 } else { 3141 // No overlap 3142 buffer[k++] = a; 3143 a = list[i]; 3144 } 3145 i++; 3146 polarity ^= 1; 3147 b = other[j++]; polarity ^= 2; 3148 } 3149 break; 3150 case 3: // both second; take higher if unequal, and drop other 3151 if (b <= a) { // take a 3152 if (a == HIGH) break main; 3153 buffer[k++] = a; 3154 } else { // take b 3155 if (b == HIGH) break main; 3156 buffer[k++] = b; 3157 } 3158 a = list[i++]; polarity ^= 1; // factored common code 3159 b = other[j++]; polarity ^= 2; 3160 break; 3161 case 1: // a second, b first; if b < a, overlap 3162 if (a < b) { // no overlap, take a 3163 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3164 } else if (b < a) { // OVERLAP, drop b 3165 b = other[j++]; polarity ^= 2; 3166 } else { // a == b, drop both! 3167 if (a == HIGH) break main; 3168 a = list[i++]; polarity ^= 1; 3169 b = other[j++]; polarity ^= 2; 3170 } 3171 break; 3172 case 2: // a first, b second; if a < b, overlap 3173 if (b < a) { // no overlap, take b 3174 buffer[k++] = b; b = other[j++]; polarity ^= 2; 3175 } else if (a < b) { // OVERLAP, drop a 3176 a = list[i++]; polarity ^= 1; 3177 } else { // a == b, drop both! 3178 if (a == HIGH) break main; 3179 a = list[i++]; polarity ^= 1; 3180 b = other[j++]; polarity ^= 2; 3181 } 3182 break; 3183 } 3184 } 3185 buffer[k++] = HIGH; // terminate 3186 len = k; 3187 // swap list and buffer 3188 int[] temp = list; 3189 list = buffer; 3190 buffer = temp; 3191 pat = null; 3192 return this; 3193 } 3194 3195 // polarity = 0 is normal: x intersect y 3196 // polarity = 2: x intersect ~y == set-minus 3197 // polarity = 1: ~x intersect y 3198 // polarity = 3: ~x intersect ~y 3199 retain(int[] other, int otherLen, int polarity)3200 private UnicodeSet retain(int[] other, int otherLen, int polarity) { 3201 ensureBufferCapacity(len + otherLen); 3202 int i = 0, j = 0, k = 0; 3203 int a = list[i++]; 3204 int b = other[j++]; 3205 // change from xor is that we have to check overlapping pairs 3206 // polarity bit 1 means a is second, bit 2 means b is. 3207 main: 3208 while (true) { 3209 switch (polarity) { 3210 case 0: // both first; drop the smaller 3211 if (a < b) { // drop a 3212 a = list[i++]; polarity ^= 1; 3213 } else if (b < a) { // drop b 3214 b = other[j++]; polarity ^= 2; 3215 } else { // a == b, take one, drop other 3216 if (a == HIGH) break main; 3217 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3218 b = other[j++]; polarity ^= 2; 3219 } 3220 break; 3221 case 3: // both second; take lower if unequal 3222 if (a < b) { // take a 3223 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3224 } else if (b < a) { // take b 3225 buffer[k++] = b; b = other[j++]; polarity ^= 2; 3226 } else { // a == b, take one, drop other 3227 if (a == HIGH) break main; 3228 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3229 b = other[j++]; polarity ^= 2; 3230 } 3231 break; 3232 case 1: // a second, b first; 3233 if (a < b) { // NO OVERLAP, drop a 3234 a = list[i++]; polarity ^= 1; 3235 } else if (b < a) { // OVERLAP, take b 3236 buffer[k++] = b; b = other[j++]; polarity ^= 2; 3237 } else { // a == b, drop both! 3238 if (a == HIGH) break main; 3239 a = list[i++]; polarity ^= 1; 3240 b = other[j++]; polarity ^= 2; 3241 } 3242 break; 3243 case 2: // a first, b second; if a < b, overlap 3244 if (b < a) { // no overlap, drop b 3245 b = other[j++]; polarity ^= 2; 3246 } else if (a < b) { // OVERLAP, take a 3247 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3248 } else { // a == b, drop both! 3249 if (a == HIGH) break main; 3250 a = list[i++]; polarity ^= 1; 3251 b = other[j++]; polarity ^= 2; 3252 } 3253 break; 3254 } 3255 } 3256 buffer[k++] = HIGH; // terminate 3257 len = k; 3258 // swap list and buffer 3259 int[] temp = list; 3260 list = buffer; 3261 buffer = temp; 3262 pat = null; 3263 return this; 3264 } 3265 max(int a, int b)3266 private static final int max(int a, int b) { 3267 return (a > b) ? a : b; 3268 } 3269 3270 //---------------------------------------------------------------- 3271 // Generic filter-based scanning code 3272 //---------------------------------------------------------------- 3273 3274 private static interface Filter { contains(int codePoint)3275 boolean contains(int codePoint); 3276 } 3277 3278 private static final class NumericValueFilter implements Filter { 3279 double value; NumericValueFilter(double value)3280 NumericValueFilter(double value) { this.value = value; } 3281 @Override contains(int ch)3282 public boolean contains(int ch) { 3283 return UCharacter.getUnicodeNumericValue(ch) == value; 3284 } 3285 } 3286 3287 private static final class GeneralCategoryMaskFilter implements Filter { 3288 int mask; GeneralCategoryMaskFilter(int mask)3289 GeneralCategoryMaskFilter(int mask) { this.mask = mask; } 3290 @Override contains(int ch)3291 public boolean contains(int ch) { 3292 return ((1 << UCharacter.getType(ch)) & mask) != 0; 3293 } 3294 } 3295 3296 private static final class IntPropertyFilter implements Filter { 3297 int prop; 3298 int value; IntPropertyFilter(int prop, int value)3299 IntPropertyFilter(int prop, int value) { 3300 this.prop = prop; 3301 this.value = value; 3302 } 3303 @Override contains(int ch)3304 public boolean contains(int ch) { 3305 return UCharacter.getIntPropertyValue(ch, prop) == value; 3306 } 3307 } 3308 3309 private static final class ScriptExtensionsFilter implements Filter { 3310 int script; ScriptExtensionsFilter(int script)3311 ScriptExtensionsFilter(int script) { this.script = script; } 3312 @Override contains(int c)3313 public boolean contains(int c) { 3314 return UScript.hasScript(c, script); 3315 } 3316 } 3317 3318 private static final class IdentifierTypeFilter implements Filter { 3319 int idType; IdentifierTypeFilter(int idType)3320 IdentifierTypeFilter(int idType) { this.idType = idType; } 3321 @Override contains(int c)3322 public boolean contains(int c) { 3323 return UCharacterProperty.INSTANCE.hasIDType(c, idType); 3324 } 3325 } 3326 3327 // VersionInfo for unassigned characters 3328 private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); 3329 3330 private static final class VersionFilter implements Filter { 3331 VersionInfo version; VersionFilter(VersionInfo version)3332 VersionFilter(VersionInfo version) { this.version = version; } 3333 @Override contains(int ch)3334 public boolean contains(int ch) { 3335 VersionInfo v = UCharacter.getAge(ch); 3336 // Reference comparison ok; VersionInfo caches and reuses 3337 // unique objects. 3338 return !Utility.sameObjects(v, NO_VERSION) && 3339 v.compareTo(version) <= 0; 3340 } 3341 } 3342 3343 /** 3344 * Generic filter-based scanning code for UCD property UnicodeSets. 3345 */ applyFilter(Filter filter, UnicodeSet inclusions)3346 private void applyFilter(Filter filter, UnicodeSet inclusions) { 3347 // Logically, walk through all Unicode characters, noting the start 3348 // and end of each range for which filter.contain(c) is 3349 // true. Add each range to a set. 3350 // 3351 // To improve performance, use an inclusions set which 3352 // encodes information about character ranges that are known 3353 // to have identical properties. 3354 // inclusions contains the first characters of 3355 // same-value ranges for the given property. 3356 3357 clear(); 3358 3359 int startHasProperty = -1; 3360 int limitRange = inclusions.getRangeCount(); 3361 3362 for (int j=0; j<limitRange; ++j) { 3363 // get current range 3364 int start = inclusions.getRangeStart(j); 3365 int end = inclusions.getRangeEnd(j); 3366 3367 // for all the code points in the range, process 3368 for (int ch = start; ch <= end; ++ch) { 3369 // only add to the unicodeset on inflection points -- 3370 // where the hasProperty value changes to false 3371 if (filter.contains(ch)) { 3372 if (startHasProperty < 0) { 3373 startHasProperty = ch; 3374 } 3375 } else if (startHasProperty >= 0) { 3376 add_unchecked(startHasProperty, ch-1); 3377 startHasProperty = -1; 3378 } 3379 } 3380 } 3381 if (startHasProperty >= 0) { 3382 add_unchecked(startHasProperty, 0x10FFFF); 3383 } 3384 } 3385 3386 /** 3387 * Remove leading and trailing Pattern_White_Space and compress 3388 * internal Pattern_White_Space to a single space character. 3389 */ mungeCharName(String source)3390 private static String mungeCharName(String source) { 3391 source = PatternProps.trimWhiteSpace(source); 3392 StringBuilder buf = null; 3393 for (int i=0; i<source.length(); ++i) { 3394 char ch = source.charAt(i); 3395 if (PatternProps.isWhiteSpace(ch)) { 3396 if (buf == null) { 3397 buf = new StringBuilder().append(source, 0, i); 3398 } else if (buf.charAt(buf.length() - 1) == ' ') { 3399 continue; 3400 } 3401 ch = ' '; // convert to ' ' 3402 } 3403 if (buf != null) { 3404 buf.append(ch); 3405 } 3406 } 3407 return buf == null ? source : buf.toString(); 3408 } 3409 3410 //---------------------------------------------------------------- 3411 // Property set API 3412 //---------------------------------------------------------------- 3413 3414 /** 3415 * Modifies this set to contain those code points which have the 3416 * given value for the given binary or enumerated property, as 3417 * returned by UCharacter.getIntPropertyValue. Prior contents of 3418 * this set are lost. 3419 * 3420 * @param prop a property in the range 3421 * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or 3422 * UProperty.INT_START..UProperty.INT_LIMIT-1 or. 3423 * UProperty.MASK_START..UProperty.MASK_LIMIT-1. 3424 * 3425 * @param value a value in the range 3426 * UCharacter.getIntPropertyMinValue(prop).. 3427 * UCharacter.getIntPropertyMaxValue(prop), with one exception. 3428 * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be 3429 * a UCharacter.getType() result, but rather a mask value produced 3430 * by logically ORing (1 << UCharacter.getType()) values together. 3431 * This allows grouped categories such as [:L:] to be represented. 3432 * 3433 * @return a reference to this set 3434 */ applyIntPropertyValue(int prop, int value)3435 public UnicodeSet applyIntPropertyValue(int prop, int value) { 3436 // All of the following include checkFrozen() before modifying this set. 3437 if (prop == UProperty.GENERAL_CATEGORY_MASK) { 3438 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3439 applyFilter(new GeneralCategoryMaskFilter(value), inclusions); 3440 } else if (prop == UProperty.SCRIPT_EXTENSIONS) { 3441 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3442 applyFilter(new ScriptExtensionsFilter(value), inclusions); 3443 } else if (prop == UProperty.IDENTIFIER_TYPE) { 3444 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3445 applyFilter(new IdentifierTypeFilter(value), inclusions); 3446 } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) { 3447 if (value == 0 || value == 1) { 3448 set(CharacterProperties.getBinaryPropertySet(prop)); 3449 if (value == 0) { 3450 complement().removeAllStrings(); // code point complement 3451 } 3452 } else { 3453 clear(); 3454 } 3455 } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) { 3456 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3457 applyFilter(new IntPropertyFilter(prop, value), inclusions); 3458 } else { 3459 throw new IllegalArgumentException("unsupported property " + prop); 3460 } 3461 return this; 3462 } 3463 3464 3465 3466 /** 3467 * Modifies this set to contain those code points which have the 3468 * given value for the given property. Prior contents of this 3469 * set are lost. 3470 * 3471 * @param propertyAlias a property alias, either short or long. 3472 * The name is matched loosely. See PropertyAliases.txt for names 3473 * and a description of loose matching. If the value string is 3474 * empty, then this string is interpreted as either a 3475 * General_Category value alias, a Script value alias, a binary 3476 * property alias, or a special ID. Special IDs are matched 3477 * loosely and correspond to the following sets: 3478 * 3479 * "ANY" = [\\u0000-\\U0010FFFF], 3480 * "ASCII" = [\\u0000-\\u007F]. 3481 * 3482 * @param valueAlias a value alias, either short or long. The 3483 * name is matched loosely. See PropertyValueAliases.txt for 3484 * names and a description of loose matching. In addition to 3485 * aliases listed, numeric values and canonical combining classes 3486 * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc", 3487 * "220"). The value string may also be empty. 3488 * 3489 * @return a reference to this set 3490 */ applyPropertyAlias(String propertyAlias, String valueAlias)3491 public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) { 3492 return applyPropertyAlias(propertyAlias, valueAlias, null); 3493 } 3494 3495 /** 3496 * Modifies this set to contain those code points which have the 3497 * given value for the given property. Prior contents of this 3498 * set are lost. 3499 * @param propertyAlias A string of the property alias. 3500 * @param valueAlias A string of the value alias. 3501 * @param symbols if not null, then symbols are first called to see if a property 3502 * is available. If true, then everything else is skipped. 3503 * @return this set 3504 */ applyPropertyAlias(String propertyAlias, String valueAlias, SymbolTable symbols)3505 public UnicodeSet applyPropertyAlias(String propertyAlias, 3506 String valueAlias, SymbolTable symbols) { 3507 checkFrozen(); 3508 int p; 3509 int v; 3510 boolean invert = false; 3511 3512 if (symbols != null 3513 && (symbols instanceof XSymbolTable) 3514 && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) { 3515 return this; 3516 } 3517 3518 if (XSYMBOL_TABLE != null) { 3519 if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) { 3520 return this; 3521 } 3522 } 3523 3524 if (valueAlias.length() > 0) { 3525 p = UCharacter.getPropertyEnum(propertyAlias); 3526 3527 // Treat gc as gcm 3528 if (p == UProperty.GENERAL_CATEGORY) { 3529 p = UProperty.GENERAL_CATEGORY_MASK; 3530 } 3531 3532 if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) || 3533 (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) || 3534 (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) { 3535 try { 3536 v = UCharacter.getPropertyValueEnum(p, valueAlias); 3537 } catch (IllegalArgumentException e) { 3538 // Handle numeric CCC 3539 if (p == UProperty.CANONICAL_COMBINING_CLASS || 3540 p == UProperty.LEAD_CANONICAL_COMBINING_CLASS || 3541 p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) { 3542 v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias)); 3543 // Anything between 0 and 255 is valid even if unused. 3544 if (v < 0 || v > 255) throw e; 3545 } else { 3546 throw e; 3547 } 3548 } 3549 } 3550 3551 else { 3552 switch (p) { 3553 case UProperty.NUMERIC_VALUE: 3554 { 3555 double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias)); 3556 applyFilter(new NumericValueFilter(value), 3557 CharacterPropertiesImpl.getInclusionsForProperty(p)); 3558 return this; 3559 } 3560 case UProperty.NAME: 3561 { 3562 // Must munge name, since 3563 // UCharacter.charFromName() does not do 3564 // 'loose' matching. 3565 String buf = mungeCharName(valueAlias); 3566 int ch = UCharacter.getCharFromExtendedName(buf); 3567 if (ch == -1) { 3568 throw new IllegalArgumentException("Invalid character name"); 3569 } 3570 clear(); 3571 add_unchecked(ch); 3572 return this; 3573 } 3574 case UProperty.UNICODE_1_NAME: 3575 // ICU 49 deprecates the Unicode_1_Name property APIs. 3576 throw new IllegalArgumentException("Unicode_1_Name (na1) not supported"); 3577 case UProperty.AGE: 3578 { 3579 // Must munge name, since 3580 // VersionInfo.getInstance() does not do 3581 // 'loose' matching. 3582 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias)); 3583 applyFilter(new VersionFilter(version), 3584 CharacterPropertiesImpl.getInclusionsForProperty(p)); 3585 return this; 3586 } 3587 case UProperty.SCRIPT_EXTENSIONS: 3588 v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias); 3589 // fall through to calling applyIntPropertyValue() 3590 break; 3591 case UProperty.IDENTIFIER_TYPE: 3592 v = UCharacter.getPropertyValueEnum(p, valueAlias); 3593 // fall through to calling applyIntPropertyValue() 3594 break; 3595 default: 3596 // p is a non-binary, non-enumerated property that we 3597 // don't support (yet). 3598 throw new IllegalArgumentException("Unsupported property"); 3599 } 3600 } 3601 } 3602 3603 else { 3604 // valueAlias is empty. Interpret as General Category, Script, 3605 // Binary property, or ANY or ASCII. Upon success, p and v will 3606 // be set. 3607 UPropertyAliases pnames = UPropertyAliases.INSTANCE; 3608 p = UProperty.GENERAL_CATEGORY_MASK; 3609 v = pnames.getPropertyValueEnum(p, propertyAlias); 3610 if (v == UProperty.UNDEFINED) { 3611 p = UProperty.SCRIPT; 3612 v = pnames.getPropertyValueEnum(p, propertyAlias); 3613 if (v == UProperty.UNDEFINED) { 3614 p = pnames.getPropertyEnum(propertyAlias); 3615 if (p == UProperty.UNDEFINED) { 3616 p = -1; 3617 } 3618 if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) { 3619 v = 1; 3620 } else if (p == -1) { 3621 if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) { 3622 set(MIN_VALUE, MAX_VALUE); 3623 return this; 3624 } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) { 3625 set(0, 0x7F); 3626 return this; 3627 } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) { 3628 // [:Assigned:]=[:^Cn:] 3629 p = UProperty.GENERAL_CATEGORY_MASK; 3630 v = (1<<UCharacter.UNASSIGNED); 3631 invert = true; 3632 } else { 3633 // Property name was never matched. 3634 throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias); 3635 } 3636 } else { 3637 // Valid property name, but it isn't binary, so the value 3638 // must be supplied. 3639 throw new IllegalArgumentException("Missing property value"); 3640 } 3641 } 3642 } 3643 } 3644 3645 applyIntPropertyValue(p, v); 3646 if(invert) { 3647 complement().removeAllStrings(); // code point complement 3648 } 3649 3650 return this; 3651 } 3652 3653 //---------------------------------------------------------------- 3654 // Property set patterns 3655 //---------------------------------------------------------------- 3656 3657 /** 3658 * Return true if the given position, in the given pattern, appears 3659 * to be the start of a property set pattern. 3660 */ resemblesPropertyPattern(String pattern, int pos)3661 private static boolean resemblesPropertyPattern(String pattern, int pos) { 3662 // Patterns are at least 5 characters long 3663 if ((pos+5) > pattern.length()) { 3664 return false; 3665 } 3666 3667 // Look for an opening [:, [:^, \p, or \P 3668 return pattern.regionMatches(pos, "[:", 0, 2) || 3669 pattern.regionMatches(true, pos, "\\p", 0, 2) || 3670 pattern.regionMatches(pos, "\\N", 0, 2); 3671 } 3672 3673 /** 3674 * Return true if the given iterator appears to point at a 3675 * property pattern. Regardless of the result, return with the 3676 * iterator unchanged. 3677 * @param chars iterator over the pattern characters. Upon return 3678 * it will be unchanged. 3679 * @param iterOpts RuleCharacterIterator options 3680 */ resemblesPropertyPattern(RuleCharacterIterator chars, int iterOpts)3681 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, 3682 int iterOpts) { 3683 boolean result = false; 3684 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; 3685 RuleCharacterIterator.Position pos = chars.getPos(null); 3686 int c = chars.next(iterOpts); 3687 if (c == '[' || c == '\\') { 3688 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); 3689 result = (c == '[') ? (d == ':') : 3690 (d == 'N' || d == 'p' || d == 'P'); 3691 } 3692 chars.setPos(pos); 3693 return result; 3694 } 3695 3696 /** 3697 * Parse the given property pattern at the given parse position. 3698 * @param symbols TODO 3699 */ applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols)3700 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { 3701 int pos = ppos.getIndex(); 3702 3703 // On entry, ppos should point to one of the following locations: 3704 3705 // Minimum length is 5 characters, e.g. \p{L} 3706 if ((pos+5) > pattern.length()) { 3707 return null; 3708 } 3709 3710 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 3711 boolean isName = false; // true for \N{pat}, o/w false 3712 boolean invert = false; 3713 3714 // Look for an opening [:, [:^, \p, or \P 3715 if (pattern.regionMatches(pos, "[:", 0, 2)) { 3716 posix = true; 3717 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3718 if (pos < pattern.length() && pattern.charAt(pos) == '^') { 3719 ++pos; 3720 invert = true; 3721 } 3722 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || 3723 pattern.regionMatches(pos, "\\N", 0, 2)) { 3724 char c = pattern.charAt(pos+1); 3725 invert = (c == 'P'); 3726 isName = (c == 'N'); 3727 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3728 if (pos == pattern.length() || pattern.charAt(pos++) != '{') { 3729 // Syntax error; "\p" or "\P" not followed by "{" 3730 return null; 3731 } 3732 } else { 3733 // Open delimiter not seen 3734 return null; 3735 } 3736 3737 // Look for the matching close delimiter, either :] or } 3738 int close = pattern.indexOf(posix ? ":]" : "}", pos); 3739 if (close < 0) { 3740 // Syntax error; close delimiter missing 3741 return null; 3742 } 3743 3744 // Look for an '=' sign. If this is present, we will parse a 3745 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 3746 // pattern. 3747 int equals = pattern.indexOf('=', pos); 3748 String propName, valueName; 3749 if (equals >= 0 && equals < close && !isName) { 3750 // Equals seen; parse medium/long pattern 3751 propName = pattern.substring(pos, equals); 3752 valueName = pattern.substring(equals+1, close); 3753 } 3754 3755 else { 3756 // Handle case where no '=' is seen, and \N{} 3757 propName = pattern.substring(pos, close); 3758 valueName = ""; 3759 3760 // Handle \N{name} 3761 if (isName) { 3762 // This is a little inefficient since it means we have to 3763 // parse "na" back to UProperty.NAME even though we already 3764 // know it's UProperty.NAME. If we refactor the API to 3765 // support args of (int, String) then we can remove 3766 // "na" and make this a little more efficient. 3767 valueName = propName; 3768 propName = "na"; 3769 } 3770 } 3771 3772 applyPropertyAlias(propName, valueName, symbols); 3773 3774 if (invert) { 3775 complement().removeAllStrings(); // code point complement 3776 } 3777 3778 // Move to the limit position after the close delimiter 3779 ppos.setIndex(close + (posix ? 2 : 1)); 3780 3781 return this; 3782 } 3783 3784 /** 3785 * Parse a property pattern. 3786 * @param chars iterator over the pattern characters. Upon return 3787 * it will be advanced to the first character after the parsed 3788 * pattern, or the end of the iteration if all characters are 3789 * parsed. 3790 * @param rebuiltPat the pattern that was parsed, rebuilt or 3791 * copied from the input pattern, as appropriate. 3792 * @param symbols TODO 3793 */ applyPropertyPattern(RuleCharacterIterator chars, Appendable rebuiltPat, SymbolTable symbols)3794 private void applyPropertyPattern(RuleCharacterIterator chars, 3795 Appendable rebuiltPat, SymbolTable symbols) { 3796 String patStr = chars.getCurrentBuffer(); 3797 int start = chars.getCurrentBufferPos(); 3798 ParsePosition pos = new ParsePosition(start); 3799 applyPropertyPattern(patStr, pos, symbols); 3800 int length = pos.getIndex() - start; 3801 if (length == 0) { 3802 syntaxError(chars, "Invalid property pattern"); 3803 } 3804 chars.jumpahead(length); 3805 append(rebuiltPat, patStr.substring(start, pos.getIndex())); 3806 } 3807 3808 //---------------------------------------------------------------- 3809 // Case folding API 3810 //---------------------------------------------------------------- 3811 3812 /** 3813 * Bitmask for constructor and applyPattern() indicating that 3814 * white space should be ignored. If set, ignore Unicode Pattern_White_Space characters, 3815 * unless they are quoted or escaped. This may be ORed together 3816 * with other selectors. 3817 */ 3818 public static final int IGNORE_SPACE = 1; 3819 3820 /** 3821 * Alias for {@link #CASE_INSENSITIVE}. 3822 * 3823 * @deprecated ICU 73 Use {@link #CASE_INSENSITIVE} instead. 3824 */ 3825 @Deprecated 3826 public static final int CASE = 2; 3827 3828 /** 3829 * Enable case insensitive matching. E.g., "[ab]" with this flag 3830 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3831 * match all except 'a', 'A', 'b', and 'B'. This performs a full 3832 * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'. 3833 * 3834 * <p>This value is an options bit set value for some 3835 * constructors, applyPattern(), and closeOver(). 3836 * It can be ORed together with other, unrelated options. 3837 * 3838 * <p>The resulting set is a superset of the input for the code points but 3839 * not for the strings. 3840 * It performs a case mapping closure of the code points and adds 3841 * full case folding strings for the code points, and reduces strings of 3842 * the original set to their full case folding equivalents. 3843 * 3844 * <p>This is designed for case-insensitive matches, for example 3845 * in regular expressions. The full code point case closure allows checking of 3846 * an input character directly against the closure set. 3847 * Strings are matched by comparing the case-folded form from the closure 3848 * set with an incremental case folding of the string in question. 3849 * 3850 * <p>The closure set will also contain single code points if the original 3851 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 3852 * This is not necessary (that is, redundant) for the above matching method 3853 * but results in the same closure sets regardless of whether the original 3854 * set contained the code point or a string. 3855 */ 3856 public static final int CASE_INSENSITIVE = 2; 3857 3858 /** 3859 * Adds all case mappings for each element in the set. 3860 * This adds the full lower-, title-, and uppercase mappings as well as the full case folding 3861 * of each existing element in the set. 3862 * 3863 * <p>This value is an options bit set value for some 3864 * constructors, applyPattern(), and closeOver(). 3865 * It can be ORed together with other, unrelated options. 3866 * 3867 * <p>Unlike the “case insensitive” options, this does not perform a closure. 3868 * For example, it does not add 'ſ' (U+017F long s) for 's', 3869 * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions. 3870 */ 3871 public static final int ADD_CASE_MAPPINGS = 4; 3872 3873 /** 3874 * Enable case insensitive matching. 3875 * Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings, 3876 * which map each code point to one code point, 3877 * not full Case_Folding (cf) mappings, which map some code points to multiple code points. 3878 * 3879 * <p>This is designed for case-insensitive matches, for example in certain 3880 * regular expression implementations where only Simple_Case_Folding mappings are used, 3881 * such as in ECMAScript (JavaScript) regular expressions. 3882 * 3883 * <p>This value is an options bit set value for some 3884 * constructors, applyPattern(), and closeOver(). 3885 * It can be ORed together with other, unrelated options. 3886 */ 3887 @android.annotation.FlaggedApi(com.android.icu.Flags.FLAG_ICU_25Q2_API) 3888 public static final int SIMPLE_CASE_INSENSITIVE = 6; 3889 3890 private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS; 3891 3892 // add the result of a full case mapping to the set 3893 // use str as a temporary string to avoid constructing one addCaseMapping(UnicodeSet set, int result, StringBuilder full)3894 private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) { 3895 if(result >= 0) { 3896 if(result > UCaseProps.MAX_STRING_LENGTH) { 3897 // add a single-code point case mapping 3898 set.add(result); 3899 } else { 3900 // add a string case mapping from full with length result 3901 set.add(full.toString()); 3902 full.setLength(0); 3903 } 3904 } 3905 // result < 0: the code point mapped to itself, no need to add it 3906 // see UCaseProps 3907 } 3908 3909 /** For case closure on a large set, look only at code points with relevant properties. */ maybeOnlyCaseSensitive(UnicodeSet src)3910 UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) { 3911 if (src.size() < 30) { 3912 return src; 3913 } 3914 // Return the intersection of the src code points with Case_Sensitive ones. 3915 UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE); 3916 // Start by cloning the "smaller" set. Try not to copy the strings, if there are any in src. 3917 if (src.hasStrings() || src.getRangeCount() > sensitive.getRangeCount()) { 3918 return sensitive.cloneAsThawed().retainAll(src); 3919 } else { 3920 return ((UnicodeSet) src.clone()).retainAll(sensitive); 3921 } 3922 } 3923 3924 // Per-character scf = Simple_Case_Folding of a string. 3925 // (Normally when we case-fold a string we use full case foldings.) scfString(CharSequence s, StringBuilder scf)3926 private static final boolean scfString(CharSequence s, StringBuilder scf) { 3927 int length = s.length(); 3928 // Loop while not needing modification. 3929 for (int i = 0; i < length;) { 3930 int c = Character.codePointAt(s, i); 3931 int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT); 3932 if (scfChar != c) { 3933 // Copy the characters before c. 3934 scf.setLength(0); 3935 scf.append(s, 0, i); 3936 // Loop over the rest of the string and keep case-folding. 3937 for (;;) { 3938 scf.appendCodePoint(scfChar); 3939 i += Character.charCount(c); 3940 if (i == length) { 3941 return true; 3942 } 3943 c = Character.codePointAt(s, i); 3944 scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT); 3945 } 3946 } 3947 i += Character.charCount(c); 3948 } 3949 return false; 3950 } 3951 3952 /** 3953 * Close this set over the given attribute. For the attribute 3954 * {@link #CASE_INSENSITIVE}, the result is to modify this set so that: 3955 * 3956 * <ol> 3957 * <li>For each character or string 'a' in this set, all strings 3958 * 'b' such that foldCase(a) == foldCase(b) are added to this set. 3959 * (For most 'a' that are single characters, 'b' will have 3960 * b.length() == 1.) 3961 * 3962 * <li>For each string 'e' in the resulting set, if e != 3963 * foldCase(e), 'e' will be removed. 3964 * </ol> 3965 * 3966 * <p>Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] 3967 * 3968 * <p>(Here foldCase(x) refers to the operation 3969 * UCharacter.foldCase(x, true), and a == b actually denotes 3970 * a.equals(b), not pointer comparison.) 3971 * 3972 * @param attribute bitmask for attributes to close over. 3973 * Valid options: 3974 * At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 3975 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 3976 * Unrelated options bits are ignored. 3977 * @return a reference to this set. 3978 */ closeOver(int attribute)3979 public UnicodeSet closeOver(int attribute) { 3980 checkFrozen(); 3981 switch (attribute & CASE_MASK) { 3982 case 0: 3983 break; 3984 case CASE_INSENSITIVE: 3985 closeOverCaseInsensitive(/* simple= */ false); 3986 break; 3987 case ADD_CASE_MAPPINGS: 3988 closeOverAddCaseMappings(); 3989 break; 3990 case SIMPLE_CASE_INSENSITIVE: 3991 closeOverCaseInsensitive(/* simple= */ true); 3992 break; 3993 default: 3994 // bad option (unreachable) 3995 break; 3996 } 3997 return this; 3998 } 3999 closeOverCaseInsensitive(boolean simple)4000 private void closeOverCaseInsensitive(boolean simple) { 4001 UCaseProps csp = UCaseProps.INSTANCE; 4002 // Start with input set to guarantee inclusion. 4003 UnicodeSet foldSet = new UnicodeSet(this); 4004 4005 // Full case mappings closure: 4006 // Remove strings because the strings will actually be reduced (folded); 4007 // therefore, start with no strings and add only those needed. 4008 // Do this before processing code points, because they may add strings. 4009 if (!simple && foldSet.hasStrings()) { 4010 foldSet.strings.clear(); 4011 } 4012 4013 UnicodeSet codePoints = maybeOnlyCaseSensitive(this); 4014 4015 // Iterate over the ranges of single code points. Nested loop for each code point. 4016 int n = codePoints.getRangeCount(); 4017 for (int i=0; i<n; ++i) { 4018 int start = codePoints.getRangeStart(i); 4019 int end = codePoints.getRangeEnd(i); 4020 4021 if (simple) { 4022 for (int cp=start; cp<=end; ++cp) { 4023 csp.addSimpleCaseClosure(cp, foldSet); 4024 } 4025 } else { 4026 for (int cp=start; cp<=end; ++cp) { 4027 csp.addCaseClosure(cp, foldSet); 4028 } 4029 } 4030 } 4031 if (hasStrings()) { 4032 StringBuilder sb = simple ? new StringBuilder() : null; 4033 for (String s : strings) { 4034 if (simple) { 4035 if (scfString(s, sb)) { 4036 foldSet.remove(s).add(sb); 4037 } 4038 } else { 4039 String str = UCharacter.foldCase(s, 0); 4040 if(!csp.addStringCaseClosure(str, foldSet)) { 4041 foldSet.add(str); // does not map to code points: add the folded string itself 4042 } 4043 } 4044 } 4045 } 4046 set(foldSet); 4047 } 4048 closeOverAddCaseMappings()4049 private void closeOverAddCaseMappings() { 4050 UCaseProps csp = UCaseProps.INSTANCE; 4051 // Start with input set to guarantee inclusion. 4052 UnicodeSet foldSet = new UnicodeSet(this); 4053 4054 UnicodeSet codePoints = maybeOnlyCaseSensitive(this); 4055 4056 // Iterate over the ranges of single code points. Nested loop for each code point. 4057 int n = codePoints.getRangeCount(); 4058 int result; 4059 StringBuilder full = new StringBuilder(); 4060 4061 for (int i=0; i<n; ++i) { 4062 int start = codePoints.getRangeStart(i); 4063 int end = codePoints.getRangeEnd(i); 4064 4065 // add case mappings 4066 // (does not add long s for regular s, or Kelvin for k, for example) 4067 for (int cp=start; cp<=end; ++cp) { 4068 result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); 4069 addCaseMapping(foldSet, result, full); 4070 4071 result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); 4072 addCaseMapping(foldSet, result, full); 4073 4074 result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); 4075 addCaseMapping(foldSet, result, full); 4076 4077 result = csp.toFullFolding(cp, full, 0); 4078 addCaseMapping(foldSet, result, full); 4079 } 4080 } 4081 if (hasStrings()) { 4082 ULocale root = ULocale.ROOT; 4083 BreakIterator bi = BreakIterator.getWordInstance(root); 4084 for (String str : strings) { 4085 // TODO: call lower-level functions 4086 foldSet.add(UCharacter.toLowerCase(root, str)); 4087 foldSet.add(UCharacter.toTitleCase(root, str, bi)); 4088 foldSet.add(UCharacter.toUpperCase(root, str)); 4089 foldSet.add(UCharacter.foldCase(str, 0)); 4090 } 4091 } 4092 set(foldSet); 4093 } 4094 4095 /** 4096 * Internal class for customizing UnicodeSet parsing of properties. 4097 * TODO: extend to allow customizing of codepoint ranges 4098 * @author medavis 4099 * @hide Only a subset of ICU is exposed in Android 4100 * @hide draft / provisional / internal are hidden on Android 4101 */ 4102 abstract public static class XSymbolTable implements SymbolTable { 4103 /** 4104 * Default constructor 4105 * @hide draft / provisional / internal are hidden on Android 4106 */ XSymbolTable()4107 public XSymbolTable(){} 4108 /** 4109 * Supplies default implementation for SymbolTable (no action). 4110 * @hide draft / provisional / internal are hidden on Android 4111 */ 4112 @Override lookupMatcher(int i)4113 public UnicodeMatcher lookupMatcher(int i) { 4114 return null; 4115 } 4116 4117 /** 4118 * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style 4119 * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be. 4120 * <p> 4121 * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a 4122 * propertyName+propertyValue combination. 4123 * 4124 * @param propertyName 4125 * the name of the property 4126 * @param propertyValue 4127 * the name of the property value 4128 * @param result UnicodeSet value to change 4129 * a set to which the characters having the propertyName+propertyValue are to be added. 4130 * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters 4131 * with that property have been added to the UnicodeSet, and returns false if the 4132 * propertyName+propertyValue combination is not recognized (in which case result is unaltered). 4133 * @hide draft / provisional / internal are hidden on Android 4134 */ applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result)4135 public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { 4136 return false; 4137 } 4138 /** 4139 * Supplies default implementation for SymbolTable (no action). 4140 * @hide draft / provisional / internal are hidden on Android 4141 */ 4142 @Override lookup(String s)4143 public char[] lookup(String s) { 4144 return null; 4145 } 4146 /** 4147 * Supplies default implementation for SymbolTable (no action). 4148 * @hide draft / provisional / internal are hidden on Android 4149 */ 4150 @Override parseReference(String text, ParsePosition pos, int limit)4151 public String parseReference(String text, ParsePosition pos, int limit) { 4152 return null; 4153 } 4154 } 4155 4156 /** 4157 * Is this frozen, according to the Freezable interface? 4158 * 4159 * @return value 4160 */ 4161 @Override isFrozen()4162 public boolean isFrozen() { 4163 return (bmpSet != null || stringSpan != null); 4164 } 4165 4166 /** 4167 * Freeze this class, according to the Freezable interface. 4168 * 4169 * @return this 4170 */ 4171 @Override freeze()4172 public UnicodeSet freeze() { 4173 if (!isFrozen()) { 4174 compact(); 4175 4176 // Optimize contains() and span() and similar functions. 4177 if (hasStrings()) { 4178 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), UnicodeSetStringSpan.ALL); 4179 } 4180 if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { 4181 // Optimize for code point spans. 4182 // There are no strings, or 4183 // all strings are irrelevant for span() etc. because 4184 // all of each string's code points are contained in this set. 4185 // However, fully contained strings are relevant for spanAndCount(), 4186 // so we create both objects. 4187 bmpSet = new BMPSet(list, len); 4188 } 4189 } 4190 return this; 4191 } 4192 4193 /** 4194 * Span a string using this UnicodeSet. 4195 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4196 * @param s The string to be spanned 4197 * @param spanCondition The span condition 4198 * @return the length of the span 4199 */ span(CharSequence s, SpanCondition spanCondition)4200 public int span(CharSequence s, SpanCondition spanCondition) { 4201 return span(s, 0, spanCondition); 4202 } 4203 4204 /** 4205 * Span a string using this UnicodeSet. 4206 * If the start index is less than 0, span will start from 0. 4207 * If the start index is greater than the string length, span returns the string length. 4208 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4209 * @param s The string to be spanned 4210 * @param start The start index that the span begins 4211 * @param spanCondition The span condition 4212 * @return the string index which ends the span (i.e. exclusive) 4213 */ span(CharSequence s, int start, SpanCondition spanCondition)4214 public int span(CharSequence s, int start, SpanCondition spanCondition) { 4215 int end = s.length(); 4216 if (start < 0) { 4217 start = 0; 4218 } else if (start >= end) { 4219 return end; 4220 } 4221 if (bmpSet != null) { 4222 // Frozen set without strings, or no string is relevant for span(). 4223 return bmpSet.span(s, start, spanCondition, null); 4224 } 4225 if (stringSpan != null) { 4226 return stringSpan.span(s, start, spanCondition); 4227 } else if (hasStrings()) { 4228 int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4229 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4230 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4231 if (strSpan.needsStringSpanUTF16()) { 4232 return strSpan.span(s, start, spanCondition); 4233 } 4234 } 4235 4236 return spanCodePointsAndCount(s, start, spanCondition, null); 4237 } 4238 4239 /** 4240 * Same as span() but also counts the smallest number of set elements on any path across the span. 4241 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4242 * @param outCount An output-only object (must not be null) for returning the count. 4243 * @return the limit (exclusive end) of the span 4244 * @deprecated This API is ICU internal only. 4245 * @hide original deprecated declaration 4246 * @hide draft / provisional / internal are hidden on Android 4247 */ 4248 @Deprecated spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4249 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { 4250 if (outCount == null) { 4251 throw new IllegalArgumentException("outCount must not be null"); 4252 } 4253 int end = s.length(); 4254 if (start < 0) { 4255 start = 0; 4256 } else if (start >= end) { 4257 return end; 4258 } 4259 if (stringSpan != null) { 4260 // We might also have bmpSet != null, 4261 // but fully-contained strings are relevant for counting elements. 4262 return stringSpan.spanAndCount(s, start, spanCondition, outCount); 4263 } else if (bmpSet != null) { 4264 return bmpSet.span(s, start, spanCondition, outCount); 4265 } else if (hasStrings()) { 4266 int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4267 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4268 which |= UnicodeSetStringSpan.WITH_COUNT; 4269 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4270 return strSpan.spanAndCount(s, start, spanCondition, outCount); 4271 } 4272 4273 return spanCodePointsAndCount(s, start, spanCondition, outCount); 4274 } 4275 spanCodePointsAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4276 private int spanCodePointsAndCount(CharSequence s, int start, 4277 SpanCondition spanCondition, OutputInt outCount) { 4278 // Pin to 0/1 values. 4279 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4280 4281 int c; 4282 int next = start; 4283 int length = s.length(); 4284 int count = 0; 4285 do { 4286 c = Character.codePointAt(s, next); 4287 if (spanContained != contains(c)) { 4288 break; 4289 } 4290 ++count; 4291 next += Character.charCount(c); 4292 } while (next < length); 4293 if (outCount != null) { outCount.value = count; } 4294 return next; 4295 } 4296 4297 /** 4298 * Span a string backwards (from the end) using this UnicodeSet. 4299 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4300 * @param s The string to be spanned 4301 * @param spanCondition The span condition 4302 * @return The string index which starts the span (i.e. inclusive). 4303 */ spanBack(CharSequence s, SpanCondition spanCondition)4304 public int spanBack(CharSequence s, SpanCondition spanCondition) { 4305 return spanBack(s, s.length(), spanCondition); 4306 } 4307 4308 /** 4309 * Span a string backwards (from the fromIndex) using this UnicodeSet. 4310 * If the fromIndex is less than 0, spanBack will return 0. 4311 * If fromIndex is greater than the string length, spanBack will start from the string length. 4312 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4313 * @param s The string to be spanned 4314 * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards 4315 * @param spanCondition The span condition 4316 * @return The string index which starts the span (i.e. inclusive). 4317 */ spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition)4318 public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { 4319 if (fromIndex <= 0) { 4320 return 0; 4321 } 4322 if (fromIndex > s.length()) { 4323 fromIndex = s.length(); 4324 } 4325 if (bmpSet != null) { 4326 // Frozen set without strings, or no string is relevant for spanBack(). 4327 return bmpSet.spanBack(s, fromIndex, spanCondition); 4328 } 4329 if (stringSpan != null) { 4330 return stringSpan.spanBack(s, fromIndex, spanCondition); 4331 } else if (hasStrings()) { 4332 int which = (spanCondition == SpanCondition.NOT_CONTAINED) 4333 ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED 4334 : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; 4335 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4336 if (strSpan.needsStringSpanUTF16()) { 4337 return strSpan.spanBack(s, fromIndex, spanCondition); 4338 } 4339 } 4340 4341 // Pin to 0/1 values. 4342 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4343 4344 int c; 4345 int prev = fromIndex; 4346 do { 4347 c = Character.codePointBefore(s, prev); 4348 if (spanContained != contains(c)) { 4349 break; 4350 } 4351 prev -= Character.charCount(c); 4352 } while (prev > 0); 4353 return prev; 4354 } 4355 4356 /** 4357 * Clone a thawed version of this class, according to the Freezable interface. 4358 * @return the clone, not frozen 4359 */ 4360 @Override cloneAsThawed()4361 public UnicodeSet cloneAsThawed() { 4362 UnicodeSet result = new UnicodeSet(this); 4363 assert !result.isFrozen(); 4364 return result; 4365 } 4366 4367 // internal function checkFrozen()4368 private void checkFrozen() { 4369 if (isFrozen()) { 4370 throw new UnsupportedOperationException("Attempt to modify frozen object"); 4371 } 4372 } 4373 4374 // ************************ 4375 // Additional methods for integration with Generics and Collections 4376 // ************************ 4377 4378 /** 4379 * A struct-like class used for iteration through ranges, for faster iteration than by String. 4380 * Read about the restrictions on usage in {@link UnicodeSet#ranges()}. 4381 */ 4382 public static class EntryRange { 4383 /** 4384 * The starting code point of the range. 4385 */ 4386 public int codepoint; 4387 /** 4388 * The ending code point of the range 4389 */ 4390 public int codepointEnd; 4391 EntryRange()4392 EntryRange() { 4393 } 4394 4395 /** 4396 * {@inheritDoc} 4397 */ 4398 @Override toString()4399 public String toString() { 4400 StringBuilder b = new StringBuilder(); 4401 return ( 4402 codepoint == codepointEnd ? _appendToPat(b, codepoint, false) 4403 : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false)) 4404 .toString(); 4405 } 4406 } 4407 4408 /** 4409 * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points. 4410 * The UnicodeSet must not be altered during the iteration. 4411 * The EntryRange instance is the same each time; the contents are just reset. 4412 * 4413 * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings. 4414 * 4415 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4416 * Do not alter the UnicodeSet while iterating. 4417 * 4418 * <pre> 4419 * // Sample code 4420 * for (EntryRange range : us1.ranges()) { 4421 * // do something with code points between range.codepoint and range.codepointEnd; 4422 * } 4423 * for (String s : us1.strings()) { 4424 * // do something with each string; 4425 * } 4426 * </pre> 4427 */ ranges()4428 public Iterable<EntryRange> ranges() { 4429 return new EntryRangeIterable(); 4430 } 4431 4432 private class EntryRangeIterable implements Iterable<EntryRange> { 4433 @Override iterator()4434 public Iterator<EntryRange> iterator() { 4435 return new EntryRangeIterator(); 4436 } 4437 } 4438 4439 private class EntryRangeIterator implements Iterator<EntryRange> { 4440 int pos; 4441 EntryRange result = new EntryRange(); 4442 4443 @Override hasNext()4444 public boolean hasNext() { 4445 return pos < len-1; 4446 } 4447 @Override next()4448 public EntryRange next() { 4449 if (pos < len-1) { 4450 result.codepoint = list[pos++]; 4451 result.codepointEnd = list[pos++]-1; 4452 } else { 4453 throw new NoSuchElementException(); 4454 } 4455 return result; 4456 } 4457 @Override remove()4458 public void remove() { 4459 throw new UnsupportedOperationException(); 4460 } 4461 } 4462 4463 4464 /** 4465 * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}. 4466 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4467 * Do not alter the UnicodeSet while iterating. 4468 * @see java.util.Set#iterator() 4469 */ 4470 @Override iterator()4471 public Iterator<String> iterator() { 4472 return new UnicodeSetIterator2(this); 4473 } 4474 4475 // Cover for string iteration. 4476 private static class UnicodeSetIterator2 implements Iterator<String> { 4477 // Invariants: 4478 // sourceList != null then sourceList[item] is a valid character 4479 // sourceList == null then delegates to stringIterator 4480 private int[] sourceList; 4481 private int len; 4482 private int item; 4483 private int current; 4484 private int limit; 4485 private SortedSet<String> sourceStrings; 4486 private Iterator<String> stringIterator; 4487 private char[] buffer; 4488 UnicodeSetIterator2(UnicodeSet source)4489 UnicodeSetIterator2(UnicodeSet source) { 4490 // set according to invariants 4491 len = source.len - 1; 4492 if (len > 0) { 4493 sourceStrings = source.strings; 4494 sourceList = source.list; 4495 current = sourceList[item++]; 4496 limit = sourceList[item++]; 4497 } else { 4498 stringIterator = source.strings.iterator(); 4499 sourceList = null; 4500 } 4501 } 4502 4503 /* (non-Javadoc) 4504 * @see java.util.Iterator#hasNext() 4505 */ 4506 @Override hasNext()4507 public boolean hasNext() { 4508 return sourceList != null || stringIterator.hasNext(); 4509 } 4510 4511 /* (non-Javadoc) 4512 * @see java.util.Iterator#next() 4513 */ 4514 @Override next()4515 public String next() { 4516 if (sourceList == null) { 4517 return stringIterator.next(); 4518 } 4519 int codepoint = current++; 4520 // we have the codepoint we need, but we may need to adjust the state 4521 if (current >= limit) { 4522 if (item >= len) { 4523 stringIterator = sourceStrings.iterator(); 4524 sourceList = null; 4525 } else { 4526 current = sourceList[item++]; 4527 limit = sourceList[item++]; 4528 } 4529 } 4530 // Now return. Single code point is easy 4531 if (codepoint <= 0xFFFF) { 4532 return String.valueOf((char)codepoint); 4533 } 4534 // But Java lacks a valueOfCodePoint, so we handle ourselves for speed 4535 // allocate a buffer the first time, to make conversion faster. 4536 if (buffer == null) { 4537 buffer = new char[2]; 4538 } 4539 // compute ourselves, to save tests and calls 4540 int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT; 4541 buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE); 4542 buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE); 4543 return String.valueOf(buffer); 4544 } 4545 4546 /* (non-Javadoc) 4547 * @see java.util.Iterator#remove() 4548 */ 4549 @Override remove()4550 public void remove() { 4551 throw new UnsupportedOperationException(); 4552 } 4553 } 4554 4555 /** 4556 * @see #containsAll(android.icu.text.UnicodeSet) 4557 */ containsAll(Iterable<T> collection)4558 public <T extends CharSequence> boolean containsAll(Iterable<T> collection) { 4559 for (T o : collection) { 4560 if (!contains(o)) { 4561 return false; 4562 } 4563 } 4564 return true; 4565 } 4566 4567 /** 4568 * @see #containsNone(android.icu.text.UnicodeSet) 4569 */ containsNone(Iterable<T> collection)4570 public <T extends CharSequence> boolean containsNone(Iterable<T> collection) { 4571 for (T o : collection) { 4572 if (contains(o)) { 4573 return false; 4574 } 4575 } 4576 return true; 4577 } 4578 4579 /** 4580 * @see #containsAll(android.icu.text.UnicodeSet) 4581 */ containsSome(Iterable<T> collection)4582 public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) { 4583 return !containsNone(collection); 4584 } 4585 4586 /** 4587 * @see #addAll(android.icu.text.UnicodeSet) 4588 */ 4589 @SuppressWarnings("unchecked") // See ticket #11395, this is safe. addAll(T... collection)4590 public <T extends CharSequence> UnicodeSet addAll(T... collection) { 4591 checkFrozen(); 4592 for (T str : collection) { 4593 add(str); 4594 } 4595 return this; 4596 } 4597 4598 4599 /** 4600 * @see #removeAll(android.icu.text.UnicodeSet) 4601 */ removeAll(Iterable<T> collection)4602 public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) { 4603 checkFrozen(); 4604 for (T o : collection) { 4605 remove(o); 4606 } 4607 return this; 4608 } 4609 4610 /** 4611 * @see #retainAll(android.icu.text.UnicodeSet) 4612 */ retainAll(Iterable<T> collection)4613 public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) { 4614 checkFrozen(); 4615 // TODO optimize 4616 UnicodeSet toRetain = new UnicodeSet(); 4617 toRetain.addAll(collection); 4618 retainAll(toRetain); 4619 return this; 4620 } 4621 4622 /** 4623 * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}. 4624 */ 4625 public enum ComparisonStyle { 4626 /** 4627 */ 4628 SHORTER_FIRST, 4629 /** 4630 */ 4631 LEXICOGRAPHIC, 4632 /** 4633 */ 4634 LONGER_FIRST 4635 } 4636 4637 /** 4638 * Compares UnicodeSets, where shorter come first, and otherwise lexicographically 4639 * (according to the comparison of the first characters that differ). 4640 * @see java.lang.Comparable#compareTo(java.lang.Object) 4641 */ 4642 @Override compareTo(UnicodeSet o)4643 public int compareTo(UnicodeSet o) { 4644 return compareTo(o, ComparisonStyle.SHORTER_FIRST); 4645 } 4646 /** 4647 * Compares UnicodeSets, in three different ways. 4648 * @see java.lang.Comparable#compareTo(java.lang.Object) 4649 */ compareTo(UnicodeSet o, ComparisonStyle style)4650 public int compareTo(UnicodeSet o, ComparisonStyle style) { 4651 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4652 int diff = size() - o.size(); 4653 if (diff != 0) { 4654 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1; 4655 } 4656 } 4657 int result; 4658 for (int i = 0; ; ++i) { 4659 if (0 != (result = list[i] - o.list[i])) { 4660 // if either list ran out, compare to the last string 4661 if (list[i] == HIGH) { 4662 if (!hasStrings()) return 1; 4663 String item = strings.first(); 4664 return compare(item, o.list[i]); 4665 } 4666 if (o.list[i] == HIGH) { 4667 if (!o.hasStrings()) return -1; 4668 String item = o.strings.first(); 4669 int compareResult = compare(item, list[i]); 4670 return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order. 4671 } 4672 // otherwise return the result if even index, or the reversal if not 4673 return (i & 1) == 0 ? result : -result; 4674 } 4675 if (list[i] == HIGH) { 4676 break; 4677 } 4678 } 4679 return compare(strings, o.strings); 4680 } 4681 4682 /** 4683 */ compareTo(Iterable<String> other)4684 public int compareTo(Iterable<String> other) { 4685 return compare(this, other); 4686 } 4687 4688 /** 4689 * Utility to compare a string to a code point. 4690 * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString()) 4691 * and comparing, but much faster (no object creation). 4692 * Actually, there is one difference; a null compares as less. 4693 * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order. 4694 * @hide unsupported on Android 4695 */ 4696 compare(CharSequence string, int codePoint)4697 public static int compare(CharSequence string, int codePoint) { 4698 return CharSequences.compare(string, codePoint); 4699 } 4700 4701 /** 4702 * Utility to compare a string to a code point. 4703 * Same results as turning the code point into a string and comparing, but much faster (no object creation). 4704 * Actually, there is one difference; a null compares as less. 4705 * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order. 4706 * @hide unsupported on Android 4707 */ compare(int codePoint, CharSequence string)4708 public static int compare(int codePoint, CharSequence string) { 4709 return -CharSequences.compare(string, codePoint); 4710 } 4711 4712 4713 /** 4714 * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered, 4715 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4716 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4717 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4718 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4719 * @hide unsupported on Android 4720 */ compare(Iterable<T> collection1, Iterable<T> collection2)4721 public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) { 4722 return compare(collection1.iterator(), collection2.iterator()); 4723 } 4724 4725 /** 4726 * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered, 4727 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4728 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4729 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4730 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4731 * @deprecated This API is ICU internal only. 4732 * @hide original deprecated declaration 4733 * @hide draft / provisional / internal are hidden on Android 4734 */ 4735 @Deprecated compare(Iterator<T> first, Iterator<T> other)4736 public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) { 4737 while (true) { 4738 if (!first.hasNext()) { 4739 return other.hasNext() ? -1 : 0; 4740 } else if (!other.hasNext()) { 4741 return 1; 4742 } 4743 T item1 = first.next(); 4744 T item2 = other.next(); 4745 int result = item1.compareTo(item2); 4746 if (result != 0) { 4747 return result; 4748 } 4749 } 4750 } 4751 4752 4753 /** 4754 * Utility to compare two collections, optionally by size, and then lexicographically. 4755 * @hide unsupported on Android 4756 */ compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style)4757 public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) { 4758 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4759 int diff = collection1.size() - collection2.size(); 4760 if (diff != 0) { 4761 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1; 4762 } 4763 } 4764 return compare(collection1, collection2); 4765 } 4766 4767 /** 4768 * Utility for adding the contents of an iterable to a collection. 4769 * @hide unsupported on Android 4770 */ addAllTo(Iterable<T> source, U target)4771 public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) { 4772 for (T item : source) { 4773 target.add(item); 4774 } 4775 return target; 4776 } 4777 4778 /** 4779 * Utility for adding the contents of an iterable to a collection. 4780 * @hide unsupported on Android 4781 */ addAllTo(Iterable<T> source, T[] target)4782 public static <T> T[] addAllTo(Iterable<T> source, T[] target) { 4783 int i = 0; 4784 for (T item : source) { 4785 target[i++] = item; 4786 } 4787 return target; 4788 } 4789 4790 /** 4791 * For iterating through the strings in the set. Example: 4792 * <pre> 4793 * for (String key : myUnicodeSet.strings()) { 4794 * doSomethingWith(key); 4795 * } 4796 * </pre> 4797 */ strings()4798 public Collection<String> strings() { 4799 if (hasStrings()) { 4800 return Collections.unmodifiableSortedSet(strings); 4801 } else { 4802 return EMPTY_STRINGS; 4803 } 4804 } 4805 4806 /** 4807 * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE. 4808 * @deprecated This API is ICU internal only. 4809 * @hide original deprecated declaration 4810 * @hide draft / provisional / internal are hidden on Android 4811 */ 4812 @Deprecated getSingleCodePoint(CharSequence s)4813 public static int getSingleCodePoint(CharSequence s) { 4814 return CharSequences.getSingleCodePoint(s); 4815 } 4816 4817 /** 4818 * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set. 4819 * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E 4820 * if the dontCare set includes unassigned characters (for a particular version of Unicode). 4821 * @param dontCare Set with the don't-care characters for spanning 4822 * @return the input set, modified 4823 * @deprecated This API is ICU internal only. 4824 * @hide original deprecated declaration 4825 * @hide draft / provisional / internal are hidden on Android 4826 */ 4827 @Deprecated addBridges(UnicodeSet dontCare)4828 public UnicodeSet addBridges(UnicodeSet dontCare) { 4829 UnicodeSet notInInput = new UnicodeSet(this).complement().removeAllStrings(); 4830 for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) { 4831 if (it.codepoint != 0 && it.codepointEnd != 0x10FFFF && 4832 dontCare.contains(it.codepoint, it.codepointEnd)) { 4833 add(it.codepoint,it.codepointEnd); 4834 } 4835 } 4836 return this; 4837 } 4838 4839 /** 4840 * Find the first index at or after fromIndex where the UnicodeSet matches at that index. 4841 * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match. 4842 * If there is no match, length is returned. 4843 * @deprecated This API is ICU internal only. Use span instead. 4844 * @hide original deprecated declaration 4845 * @hide draft / provisional / internal are hidden on Android 4846 */ 4847 @Deprecated findIn(CharSequence value, int fromIndex, boolean findNot)4848 public int findIn(CharSequence value, int fromIndex, boolean findNot) { 4849 //TODO add strings, optimize, using ICU4C algorithms 4850 int cp; 4851 for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) { 4852 cp = UTF16.charAt(value, fromIndex); 4853 if (contains(cp) != findNot) { 4854 break; 4855 } 4856 } 4857 return fromIndex; 4858 } 4859 4860 /** 4861 * Find the last index before fromIndex where the UnicodeSet matches at that index. 4862 * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match. 4863 * If there is no match, -1 is returned. 4864 * BEFORE index is not in the UnicodeSet. 4865 * @deprecated This API is ICU internal only. Use spanBack instead. 4866 * @hide original deprecated declaration 4867 * @hide draft / provisional / internal are hidden on Android 4868 */ 4869 @Deprecated findLastIn(CharSequence value, int fromIndex, boolean findNot)4870 public int findLastIn(CharSequence value, int fromIndex, boolean findNot) { 4871 //TODO add strings, optimize, using ICU4C algorithms 4872 int cp; 4873 fromIndex -= 1; 4874 for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) { 4875 cp = UTF16.charAt(value, fromIndex); 4876 if (contains(cp) != findNot) { 4877 break; 4878 } 4879 } 4880 return fromIndex < 0 ? -1 : fromIndex; 4881 } 4882 4883 /** 4884 * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match. 4885 * @param source The source of the CharSequence to strip from. 4886 * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object. 4887 * @return The string after it has been stripped. 4888 * @deprecated This API is ICU internal only. Use replaceFrom. 4889 * @hide original deprecated declaration 4890 * @hide draft / provisional / internal are hidden on Android 4891 */ 4892 @Deprecated stripFrom(CharSequence source, boolean matches)4893 public String stripFrom(CharSequence source, boolean matches) { 4894 StringBuilder result = new StringBuilder(); 4895 for (int pos = 0; pos < source.length();) { 4896 int inside = findIn(source, pos, !matches); 4897 result.append(source.subSequence(pos, inside)); 4898 pos = findIn(source, inside, matches); // get next start 4899 } 4900 return result.toString(); 4901 } 4902 4903 /** 4904 * Argument values for whether span() and similar functions continue while the current character is contained vs. 4905 * not contained in the set. 4906 * <p> 4907 * The functionality is straightforward for sets with only single code points, without strings (which is the common 4908 * case): 4909 * <ul> 4910 * <li>CONTAINED and SIMPLE work the same. 4911 * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED. 4912 * <li>span() and spanBack() partition any string the 4913 * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition). 4914 * <li>Using a 4915 * complemented (inverted) set and the opposite span conditions yields the same results. 4916 * </ul> 4917 * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in 4918 * the set (for example, whether they overlap with each other) and the string that is processed. For a set with 4919 * strings: 4920 * <ul> 4921 * <li>The complement of the set contains the opposite set of code points, but the same set of strings. 4922 * Therefore, complementing both the set and the span conditions may yield different results. 4923 * <li>When starting spans 4924 * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 4925 * because a set string may start before the later position. 4926 * <li>span(SIMPLE) may be shorter than 4927 * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which 4928 * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax", 4929 * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED). 4930 * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, 4931 * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield 4932 * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }. 4933 * </ul> 4934 * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then 4935 * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could 4936 * be used. 4937 * <p> 4938 * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point 4939 * boundaries, never in the middle of a surrogate pair. 4940 */ 4941 public enum SpanCondition { 4942 /** 4943 * Continues a span() while there is no set element at the current position. 4944 * Increments by one code point at a time. 4945 * Stops before the first set element (character or string). 4946 * (For code points only, this is like while contains(current)==false). 4947 * <p> 4948 * When span() returns, the substring between where it started and the position it returned consists only of 4949 * characters that are not in the set, and none of its strings overlap with the span. 4950 */ 4951 NOT_CONTAINED, 4952 4953 /** 4954 * Spans the longest substring that is a concatenation of set elements (characters or strings). 4955 * (For characters only, this is like while contains(current)==true). 4956 * <p> 4957 * When span() returns, the substring between where it started and the position it returned consists only of set 4958 * elements (characters or strings) that are in the set. 4959 * <p> 4960 * If a set contains strings, then the span will be the longest substring for which there 4961 * exists at least one non-overlapping concatenation of set elements (characters or strings). 4962 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 4963 * (Java/ICU/Perl regex stops at the first match of an OR.) 4964 */ 4965 CONTAINED, 4966 4967 /** 4968 * Continues a span() while there is a set element at the current position. 4969 * Increments by the longest matching element at each position. 4970 * (For characters only, this is like while contains(current)==true). 4971 * <p> 4972 * When span() returns, the substring between where it started and the position it returned consists only of set 4973 * elements (characters or strings) that are in the set. 4974 * <p> 4975 * If a set only contains single characters, then this is the same as CONTAINED. 4976 * <p> 4977 * If a set contains strings, then the span will be the longest substring with a match at each position with the 4978 * longest single set element (character or string). 4979 * <p> 4980 * Use this span condition together with other longest-match algorithms, such as ICU converters 4981 * (ucnv_getUnicodeSet()). 4982 */ 4983 SIMPLE, 4984 4985 /** 4986 * One more than the last span condition. 4987 */ 4988 CONDITION_COUNT 4989 } 4990 4991 /** 4992 * Get the default symbol table. Null means ordinary processing. For internal use only. 4993 * @return the symbol table 4994 * @deprecated This API is ICU internal only. 4995 * @hide original deprecated declaration 4996 * @hide draft / provisional / internal are hidden on Android 4997 */ 4998 @Deprecated getDefaultXSymbolTable()4999 public static XSymbolTable getDefaultXSymbolTable() { 5000 return XSYMBOL_TABLE; 5001 } 5002 5003 /** 5004 * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing 5005 * of UnicodeSets. 5006 * <p> 5007 * WARNING: If this function is used with a UnicodeProperty, and the 5008 * Unassigned characters (gc=Cn) are different than in ICU, you MUST call 5009 * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable} 5010 * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}. 5011 * 5012 * @param xSymbolTable the new default symbol table. 5013 * @deprecated This API is ICU internal only. 5014 * @hide original deprecated declaration 5015 * @hide draft / provisional / internal are hidden on Android 5016 */ 5017 @Deprecated setDefaultXSymbolTable(XSymbolTable xSymbolTable)5018 public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) { 5019 // If the properties override inclusions, these have to be regenerated. 5020 // TODO: Check if the Unicode Tools or Unicode Utilities really need this. 5021 CharacterPropertiesImpl.clear(); 5022 XSYMBOL_TABLE = xSymbolTable; 5023 } 5024 5025 /** 5026 * Returns a {@link Stream} of {@link EntryRange} values from this {@code UnicodeSet}. 5027 * 5028 * <p><b>Warnings:</b> 5029 * <ul> 5030 * <li>The {@link EntryRange} instance is the same each time; the contents are just reset. 5031 * <li>To iterate over the full contents, you have to also iterate over the strings. 5032 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br> 5033 * Do not alter the {@code UnicodeSet} while iterating. 5034 * </ul> 5035 * 5036 * @return a {@link Stream} of {@link EntryRange} 5037 * 5038 * @hide draft / provisional / internal are hidden on Android 5039 */ rangeStream()5040 public Stream<EntryRange> rangeStream() { 5041 // Must use false to never make this parallel because the iterator always returns the same EntryRange object. 5042 return StreamSupport.stream(ranges().spliterator(), false); 5043 } 5044 5045 /** 5046 * Returns a {@link Stream} of {@code String} values from this {@code UnicodeSet}. 5047 * 5048 * <p><b>Warnings:</b> 5049 * <ul> 5050 * <li>To iterate over the full contents, you have to also iterate over the ranges or code points. 5051 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br> 5052 * Do not alter the {@code UnicodeSet} while iterating. 5053 * </ul> 5054 * 5055 * @return a {@link Stream} of {@code String} 5056 * 5057 * @hide draft / provisional / internal are hidden on Android 5058 */ stringStream()5059 public Stream<String> stringStream() { 5060 return strings().stream(); 5061 } 5062 5063 /** 5064 * Returns an {@link IntStream} of Unicode code point values from this {@code UnicodeSet}. 5065 * 5066 * <p><b>Warnings:</b> 5067 * <ul> 5068 * <li>To iterate over the full contents, you have to also iterate over the strings. 5069 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br> 5070 * Do not alter the {@code UnicodeSet} while iterating. 5071 * </ul> 5072 * 5073 * @return an {@link IntStream} of Unicode code point values 5074 * 5075 * @hide draft / provisional / internal are hidden on Android 5076 */ codePointStream()5077 public IntStream codePointStream() { 5078 return StreamSupport.intStream(new CodePointSpliterator(this), false); 5079 } 5080 5081 /** 5082 * Returns a stream of {@code String} values from this {@code UnicodeSet}. 5083 * 5084 * <p><b>Warnings:</b> 5085 * <ul> 5086 * <li>To iterate over the full contents, you have to also iterate over the strings. 5087 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br> 5088 * Do not alter the {@code UnicodeSet} while iterating. 5089 * </ul> 5090 * 5091 * @return a {@link Stream} of {@code String} 5092 * 5093 * @hide draft / provisional / internal are hidden on Android 5094 */ stream()5095 public Stream<String> stream() { 5096 return StreamSupport.stream(spliterator(), false); 5097 } 5098 5099 /** 5100 * Returns an {@link Iterable} for iteration over all the code points in this set. 5101 * 5102 * <p><b>Warnings:</b> 5103 * <ul> 5104 * <li>This is a convenience method, but comes with a performance penalty 5105 * because it boxes {@code int} into {@code Integer}.<br> 5106 * For an efficient but old alternative use {@link UnicodeSetIterator#next()}. 5107 * <li>To iterate over the full contents, you have to also iterate over the strings. 5108 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br> 5109 * Do not alter the {@code UnicodeSet} while iterating. 5110 * </ul> 5111 * 5112 * @return an {@link Iterable} over all the code points 5113 * 5114 * @hide draft / provisional / internal are hidden on Android 5115 */ codePoints()5116 public Iterable<Integer> codePoints() { 5117 return new CodePointIterable(this); 5118 } 5119 5120 private class CodePointIterable implements Iterable<Integer> { 5121 private final UnicodeSet unicodeSet; 5122 CodePointIterable(UnicodeSet unicodeSet)5123 CodePointIterable(UnicodeSet unicodeSet) { 5124 this.unicodeSet = unicodeSet; 5125 } 5126 5127 @Override iterator()5128 public Iterator<Integer> iterator() { 5129 return new CodePointIterator(unicodeSet); 5130 } 5131 } 5132 5133 private class CodePointIterator implements Iterator<Integer> { 5134 private final CodePointIteratorInt cpi; 5135 CodePointIterator(UnicodeSet unicodeSet)5136 CodePointIterator(UnicodeSet unicodeSet) { 5137 cpi = new CodePointIteratorInt(unicodeSet); 5138 } 5139 5140 @Override hasNext()5141 public boolean hasNext() { 5142 return cpi.hasNext(); 5143 } 5144 5145 @Override next()5146 public Integer next() { 5147 return cpi.next(); 5148 } 5149 5150 @Override remove()5151 public void remove() { 5152 throw new UnsupportedOperationException(); 5153 } 5154 } 5155 5156 private static class CodePointSpliterator implements Spliterator.OfInt { 5157 private final static int CHARACTERISTICS = Spliterator.SIZED | Spliterator.ORDERED | Spliterator.DISTINCT | Spliterator.NONNULL; 5158 5159 private final UnicodeSet unicodeSet; 5160 private final CodePointIteratorInt cpi; 5161 CodePointSpliterator(UnicodeSet unicodeSet)5162 CodePointSpliterator(UnicodeSet unicodeSet) { 5163 this.unicodeSet = unicodeSet; 5164 cpi = new CodePointIteratorInt(unicodeSet); 5165 } 5166 5167 @Override estimateSize()5168 public long estimateSize() { 5169 return unicodeSet.size() - unicodeSet.strings.size(); 5170 } 5171 5172 @Override characteristics()5173 public int characteristics() { 5174 return unicodeSet.isFrozen() ? Spliterator.IMMUTABLE | CHARACTERISTICS : CHARACTERISTICS; 5175 } 5176 5177 @Override trySplit()5178 public Spliterator.OfInt trySplit() { 5179 /* From the doc: 5180 * > This method may return null for any reason, including emptiness, inability to split after 5181 * > traversal has commenced, data structure constraints, and efficiency considerations. 5182 */ 5183 return null; 5184 } 5185 5186 @Override tryAdvance(IntConsumer action)5187 public boolean tryAdvance(IntConsumer action) { 5188 if (action == null) { 5189 throw new NullPointerException(); 5190 } 5191 5192 if (cpi.hasNext()) { 5193 action.accept(cpi.next()); 5194 return true; 5195 } 5196 return false; 5197 } 5198 } 5199 5200 /** 5201 * This class is optimized to iterate on code points and will be used to implement both 5202 * the Iterator<Integer> (Integer, boxed value) and the Spliterator.OfInt (int primitive). 5203 * It looks exactly like an Iterator<Integer>, but works on the primitive int, 5204 * so it can't implement Iterator. 5205 */ 5206 static private class CodePointIteratorInt { 5207 private final int[] list; 5208 private final int lastRange; 5209 private int currentRange = 0; 5210 private int rangeStart; 5211 private int rangeLimit; 5212 CodePointIteratorInt(UnicodeSet unicodeSet)5213 public CodePointIteratorInt(UnicodeSet unicodeSet) { 5214 this.list = unicodeSet.list; 5215 lastRange = unicodeSet.len - 1; 5216 currentRange = 0; 5217 rangeStart = list[currentRange++]; 5218 if (lastRange > 0) { // not an empty list 5219 rangeLimit = list[currentRange++]; 5220 } else { 5221 rangeLimit = rangeStart; // should be HIGH, the guard value 5222 } 5223 } 5224 hasNext()5225 public boolean hasNext() { 5226 return rangeStart < rangeLimit || currentRange < lastRange; 5227 } 5228 next()5229 public int next() { 5230 if (rangeStart >= rangeLimit) { 5231 if (currentRange >= lastRange) { 5232 throw new NoSuchElementException(); 5233 } 5234 rangeStart = list[currentRange++]; 5235 rangeLimit = list[currentRange++]; 5236 } 5237 return rangeStart++; 5238 } 5239 } 5240 } 5241 //eof 5242