1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package android.icu.text; 11 12 import java.io.IOException; 13 import java.text.ParsePosition; 14 import java.util.ArrayList; 15 import java.util.Arrays; 16 import java.util.Collection; 17 import java.util.Collections; 18 import java.util.Iterator; 19 import java.util.NoSuchElementException; 20 import java.util.SortedSet; 21 import java.util.TreeSet; 22 23 import android.icu.impl.BMPSet; 24 import android.icu.impl.CharacterPropertiesImpl; 25 import android.icu.impl.PatternProps; 26 import android.icu.impl.RuleCharacterIterator; 27 import android.icu.impl.SortedSetRelation; 28 import android.icu.impl.StringRange; 29 import android.icu.impl.UCaseProps; 30 import android.icu.impl.UCharacterProperty; 31 import android.icu.impl.UPropertyAliases; 32 import android.icu.impl.UnicodeSetStringSpan; 33 import android.icu.impl.Utility; 34 import android.icu.lang.CharSequences; 35 import android.icu.lang.CharacterProperties; 36 import android.icu.lang.UCharacter; 37 import android.icu.lang.UProperty; 38 import android.icu.lang.UScript; 39 import android.icu.util.Freezable; 40 import android.icu.util.ICUUncheckedIOException; 41 import android.icu.util.OutputInt; 42 import android.icu.util.ULocale; 43 import android.icu.util.VersionInfo; 44 45 /** 46 * A mutable set of Unicode characters and multicharacter strings. 47 * Objects of this class represent <em>character classes</em> used 48 * in regular expressions. A character specifies a subset of Unicode 49 * code points. Legal code points are U+0000 to U+10FFFF, inclusive. 
50 * 51 * Note: method freeze() will not only make the set immutable, but 52 * also makes important methods much higher performance: 53 * contains(c), containsNone(...), span(...), spanBack(...) etc. 54 * After the object is frozen, any subsequent call that wants to change 55 * the object will throw UnsupportedOperationException. 56 * 57 * <p>The UnicodeSet class is not designed to be subclassed. 58 * 59 * <p><code>UnicodeSet</code> supports two APIs. The first is the 60 * <em>operand</em> API that allows the caller to modify the value of 61 * a <code>UnicodeSet</code> object. It conforms to Java 2's 62 * <code>java.util.Set</code> interface, although 63 * <code>UnicodeSet</code> does not actually implement that 64 * interface. All methods of <code>Set</code> are supported, with the 65 * modification that they take a character range or single character 66 * instead of an <code>Object</code>, and they take a 67 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 68 * operand API may be thought of in terms of boolean logic: a boolean 69 * OR is implemented by <code>add</code>, a boolean AND is implemented 70 * by <code>retain</code>, a boolean XOR is implemented by 71 * <code>complement</code> taking an argument, and a boolean NOT is 72 * implemented by <code>complement</code> with no argument. In terms 73 * of traditional set theory function names, <code>add</code> is a 74 * union, <code>retain</code> is an intersection, <code>remove</code> 75 * is an asymmetric difference, and <code>complement</code> with no 76 * argument is a set complement with respect to the superset range 77 * <code>MIN_VALUE-MAX_VALUE</code> 78 * 79 * <p>The second API is the 80 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 81 * <code>java.text.Format</code>-derived classes. 
Unlike the 82 * methods that add characters, add categories, and control the logic 83 * of the set, the method <code>applyPattern()</code> sets all 84 * attributes of a <code>UnicodeSet</code> at once, based on a 85 * string pattern. 86 * 87 * <p><b>Pattern syntax</b></p> 88 * 89 * Patterns are accepted by the constructors and the 90 * <code>applyPattern()</code> methods and returned by the 91 * <code>toPattern()</code> method. These patterns follow a syntax 92 * similar to that employed by version 8 regular expression character 93 * classes. Here are some simple examples: 94 * 95 * <blockquote> 96 * <table> 97 * <tr style="vertical-align: top"> 98 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td> 99 * <td style="vertical-align: top;">No characters</td> 100 * </tr><tr style="vertical-align: top"> 101 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td> 102 * <td style="vertical-align: top;">The character 'a'</td> 103 * </tr><tr style="vertical-align: top"> 104 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td> 105 * <td style="vertical-align: top;">The characters 'a' and 'e'</td> 106 * </tr> 107 * <tr> 108 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td> 109 * <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code 110 * point order</td> 111 * </tr> 112 * <tr> 113 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td> 114 * <td style="vertical-align: top;">The character U+4E01</td> 115 * </tr> 116 * <tr> 117 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td> 118 * <td style="vertical-align: top;">The character 'a' and the multicharacter strings "ab" and 119 * "ac"</td> 120 * </tr> 121 * <tr> 122 * <td 
 * style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td>
 * <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td>
 * </tr>
 * </table>
 * </blockquote>
 *
 * Any character may be preceded by a backslash in order to remove any special
 * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are
 * ignored, unless they are escaped.
 *
 * <p>Property patterns specify a set of characters having a certain
 * property as defined by the Unicode standard. Both the POSIX-like
 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
 * complete list of supported property patterns, see the User's Guide
 * for UnicodeSet at
 * <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset">
 * https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>.
 * Actual determination of property data is defined by the underlying
 * Unicode database as implemented by UCharacter.
 *
 * <p>Patterns specify individual characters, ranges of characters, and
 * Unicode property sets. When elements are concatenated, they
 * specify their union. To complement a set, place a '^' immediately
 * after the opening '['. Property patterns are inverted by modifying
 * their delimiters; "[:^foo:]" and "\P{foo}". In any other location,
 * '^' has no special meaning.
 *
 * <p>Since ICU 70, "[^...]", "[:^foo:]", "\P{foo}", and "[:binaryProperty=No:]"
 * perform a “code point complement” (all code points minus the original set),
 * removing all multicharacter strings,
 * equivalent to .{@link #complement()}.{@link #removeAllStrings()} .
 * The {@link #complement()} API function continues to perform a
 * symmetric difference with all code points and thus retains all multicharacter strings.
 *
 * <p>Ranges are indicated by placing a '-' between two
 * characters, as in "a-z". This specifies the range of all
 * characters from the left to the right, in Unicode order. If the
 * left character is greater than or equal to the
 * right character it is a syntax error. If a '-' occurs as the first
 * character after the opening '[' or '[^', or if it occurs as the
 * last character before the closing ']', then it is taken as a
 * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
 * set of three characters, 'a', 'b', and '-'.
 *
 * <p>Sets may be intersected using the '&' operator or the asymmetric
 * set difference may be taken using the '-' operator, for example,
 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
 * with values less than 4096. Operators ('&' and '-') have equal
 * precedence and bind left-to-right. Thus
 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
 * difference; intersection is commutative.
174 * 175 * <table> 176 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a' 177 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a' 178 * through 'z' and all letters in between, in Unicode order 179 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing 180 * all characters but 'a' through 'z', 181 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 182 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 183 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 184 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 185 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 186 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 187 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 188 * <em>pat2</em> 189 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code> 190 * <td>The set of characters having the specified 191 * Unicode property; in 192 * this case, Unicode uppercase letters 193 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code> 194 * <td>The set of characters <em>not</em> having the given 195 * Unicode property 196 * </table> 197 * 198 * <p><b>Formal syntax</b></p> 199 * 200 * <blockquote> 201 * <table> 202 * <tr style="vertical-align: top"> 203 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern := </code></td> 204 * <td style="vertical-align: top;"><code>('[' '^'? 
item* ']') | 205 * property</code></td> 206 * </tr> 207 * <tr style="vertical-align: top"> 208 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item := </code></td> 209 * <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br> 210 * </code></td> 211 * </tr> 212 * <tr style="vertical-align: top"> 213 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr := </code></td> 214 * <td style="vertical-align: top;"><code>pattern | pattern-expr pattern | 215 * pattern-expr op pattern<br> 216 * </code></td> 217 * </tr> 218 * <tr style="vertical-align: top"> 219 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op := </code></td> 220 * <td style="vertical-align: top;"><code>'&' | '-'<br> 221 * </code></td> 222 * </tr> 223 * <tr style="vertical-align: top"> 224 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special := </code></td> 225 * <td style="vertical-align: top;"><code>'[' | ']' | '-'<br> 226 * </code></td> 227 * </tr> 228 * <tr style="vertical-align: top"> 229 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char := </code></td> 230 * <td style="vertical-align: top;"><em>any character that is not</em><code> special<br> 231 * | ('\\' </code><em>any character</em><code>)<br> 232 * | ('\u' hex hex hex hex)<br> 233 * </code></td> 234 * </tr> 235 * <tr style="vertical-align: top"> 236 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex := </code></td> 237 * <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br> 238 * 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td> 239 * </tr> 240 * <tr> 241 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property := </code></td> 242 * <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td> 243 * </tr> 244 * </table> 245 
 * <br>
 * <table border="1">
 * <tr>
 * <td>Legend: <table>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td>
 * <td style="width: 20; vertical-align: top;"> </td>
 * <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td>
 * </tr>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td>
 * <td style="vertical-align: top;"></td>
 * <td style="vertical-align: top;">zero or one instance of <code>a</code><br>
 * </td>
 * </tr>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td>
 * <td style="vertical-align: top;"></td>
 * <td style="vertical-align: top;">zero or more instances of <code>a</code><br>
 * </td>
 * </tr>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td>
 * <td style="vertical-align: top;"></td>
 * <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br>
 * </td>
 * </tr>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td>
 * <td style="vertical-align: top;"></td>
 * <td style="vertical-align: top;">the literal string between the quotes </td>
 * </tr>
 * </table>
 * </td>
 * </tr>
 * </table>
 * </blockquote>
 * <p>To iterate over contents of UnicodeSet, the following are available:
 * <ul><li>{@link #ranges()} to iterate through the ranges</li>
 * <li>{@link #strings()} to iterate through the strings</li>
 * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
 * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
 * </ul>
 * All of the above can be used in <b>for</b> loops.
 * The {@link android.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
 *
 * @author Alan Liu
 * @see UnicodeSetIterator
 * @see UnicodeSetSpanner
 */
public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> {
    // Shared unmodifiable empty set; it is the initial value of the 'strings' field.
    // Declared first because the static UnicodeSet constants below are constructed
    // during class initialization and their instance initializers read it.
    private static final SortedSet<String> EMPTY_STRINGS =
            Collections.unmodifiableSortedSet(new TreeSet<String>());

    /**
     * Constant for the empty set.
     */
    public static final UnicodeSet EMPTY = new UnicodeSet().freeze();
    /**
     * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.)
     */
    public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze();

    private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the function processing

    private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
    private static final int HIGH = 0x110000; // HIGH > all valid values. 0x10000 for code units.
    // 0x110000 for codepoints

    /**
     * Enough for sets with few ranges.
     * For example, White_Space has 10 ranges, list length 21.
     */
    private static final int INITIAL_CAPACITY = 25;

    /** Max list [0, 1, 2, ..., max code point, HIGH] */
    private static final int MAX_LENGTH = HIGH + 1;

    /**
     * Minimum value that can be stored in a UnicodeSet.
     */
    public static final int MIN_VALUE = LOW;

    /**
     * Maximum value that can be stored in a UnicodeSet.
331 */ 332 public static final int MAX_VALUE = HIGH - 1; 333 334 private int len; // length used; list may be longer to minimize reallocs 335 private int[] list; // MUST be terminated with HIGH 336 private int[] rangeList; // internal buffer 337 private int[] buffer; // internal buffer 338 339 // is not private so that UnicodeSetIterator can get access 340 SortedSet<String> strings = EMPTY_STRINGS; 341 342 /** 343 * The pattern representation of this set. This may not be the 344 * most economical pattern. It is the pattern supplied to 345 * applyPattern(), with variables substituted and whitespace 346 * removed. For sets constructed without applyPattern(), or 347 * modified using the non-pattern API, this string will be null, 348 * indicating that toPattern() must generate a pattern 349 * representation from the inversion list. 350 */ 351 private String pat = null; 352 353 // Special property set IDs 354 private static final String ANY_ID = "ANY"; // [\u0000-\U0010FFFF] 355 private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F] 356 private static final String ASSIGNED = "Assigned"; // [:^Cn:] 357 358 private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. 359 private volatile UnicodeSetStringSpan stringSpan; 360 //---------------------------------------------------------------- 361 // Public API 362 //---------------------------------------------------------------- 363 364 /** 365 * Constructs an empty set. 366 */ UnicodeSet()367 public UnicodeSet() { 368 list = new int[INITIAL_CAPACITY]; 369 list[0] = HIGH; 370 len = 1; 371 } 372 373 /** 374 * Constructs a copy of an existing set. 375 */ UnicodeSet(UnicodeSet other)376 public UnicodeSet(UnicodeSet other) { 377 set(other); 378 } 379 380 /** 381 * Constructs a set containing the given range. If <code>end > 382 * start</code> then an empty set is created. 
383 * 384 * @param start first character, inclusive, of range 385 * @param end last character, inclusive, of range 386 */ UnicodeSet(int start, int end)387 public UnicodeSet(int start, int end) { 388 this(); 389 add(start, end); 390 } 391 392 /** 393 * Quickly constructs a set from a set of ranges <s0, e0, s1, e1, s2, e2, ..., sn, en>. 394 * There must be an even number of integers, and they must be all greater than zero, 395 * all less than or equal to Character.MAX_CODE_POINT. 396 * In each pair (..., si, ei, ...) it must be true that si <= ei 397 * Between adjacent pairs (...ei, sj...), it must be true that ei+1 < sj 398 * @param pairs pairs of character representing ranges 399 */ UnicodeSet(int... pairs)400 public UnicodeSet(int... pairs) { 401 if ((pairs.length & 1) != 0) { 402 throw new IllegalArgumentException("Must have even number of integers"); 403 } 404 list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set. 405 len = list.length; 406 int last = -1; // used to ensure that the results are monotonically increasing. 407 int i = 0; 408 while (i < pairs.length) { 409 int start = pairs[i]; 410 if (last >= start) { 411 throw new IllegalArgumentException("Must be monotonically increasing."); 412 } 413 list[i++] = start; 414 int limit = pairs[i] + 1; 415 if (start >= limit) { 416 throw new IllegalArgumentException("Must be monotonically increasing."); 417 } 418 list[i++] = last = limit; 419 } 420 list[i] = HIGH; // terminate 421 } 422 423 /** 424 * Constructs a set from the given pattern. See the class description 425 * for the syntax of the pattern language. Whitespace is ignored. 426 * @param pattern a string specifying what characters are in the set 427 * @exception java.lang.IllegalArgumentException if the pattern contains 428 * a syntax error. 
429 */ UnicodeSet(String pattern)430 public UnicodeSet(String pattern) { 431 this(); 432 applyPattern(pattern, null, null, IGNORE_SPACE); 433 } 434 435 /** 436 * Constructs a set from the given pattern. See the class description 437 * for the syntax of the pattern language. 438 * @param pattern a string specifying what characters are in the set 439 * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters 440 * @exception java.lang.IllegalArgumentException if the pattern contains 441 * a syntax error. 442 */ UnicodeSet(String pattern, boolean ignoreWhitespace)443 public UnicodeSet(String pattern, boolean ignoreWhitespace) { 444 this(); 445 applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 446 } 447 448 /** 449 * Constructs a set from the given pattern. See the class description 450 * for the syntax of the pattern language. 451 * @param pattern a string specifying what characters are in the set 452 * @param options a bitmask indicating which options to apply. 453 * Valid options are {@link #IGNORE_SPACE} and 454 * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 455 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 456 * @exception java.lang.IllegalArgumentException if the pattern contains 457 * a syntax error. 458 */ UnicodeSet(String pattern, int options)459 public UnicodeSet(String pattern, int options) { 460 this(); 461 applyPattern(pattern, null, null, options); 462 } 463 464 /** 465 * Constructs a set from the given pattern. See the class description 466 * for the syntax of the pattern language. 467 * @param pattern a string specifying what characters are in the set 468 * @param pos on input, the position in pattern at which to start parsing. 469 * On output, the position after the last character parsed. 
470 * @param symbols a symbol table mapping variables to char[] arrays 471 * and chars to UnicodeSets 472 * @exception java.lang.IllegalArgumentException if the pattern 473 * contains a syntax error. 474 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols)475 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) { 476 this(); 477 applyPattern(pattern, pos, symbols, IGNORE_SPACE); 478 } 479 480 /** 481 * Constructs a set from the given pattern. See the class description 482 * for the syntax of the pattern language. 483 * @param pattern a string specifying what characters are in the set 484 * @param pos on input, the position in pattern at which to start parsing. 485 * On output, the position after the last character parsed. 486 * @param symbols a symbol table mapping variables to char[] arrays 487 * and chars to UnicodeSets 488 * @param options a bitmask indicating which options to apply. 489 * Valid options are {@link #IGNORE_SPACE} and 490 * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 491 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 492 * @exception java.lang.IllegalArgumentException if the pattern 493 * contains a syntax error. 494 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options)495 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) { 496 this(); 497 applyPattern(pattern, pos, symbols, options); 498 } 499 500 501 /** 502 * Return a new set that is equivalent to this one. 503 */ 504 @Override clone()505 public Object clone() { 506 if (isFrozen()) { 507 return this; 508 } 509 return new UnicodeSet(this); 510 } 511 512 /** 513 * Make this object represent the range <code>start - end</code>. 514 * If <code>start > end</code> then this object is set to an empty range. 
515 * 516 * @param start first character in the set, inclusive 517 * @param end last character in the set, inclusive 518 */ set(int start, int end)519 public UnicodeSet set(int start, int end) { 520 checkFrozen(); 521 clear(); 522 complement(start, end); 523 return this; 524 } 525 526 /** 527 * Make this object represent the same set as <code>other</code>. 528 * @param other a <code>UnicodeSet</code> whose value will be 529 * copied to this object 530 */ set(UnicodeSet other)531 public UnicodeSet set(UnicodeSet other) { 532 checkFrozen(); 533 list = Arrays.copyOf(other.list, other.len); 534 len = other.len; 535 pat = other.pat; 536 if (other.hasStrings()) { 537 strings = new TreeSet<>(other.strings); 538 } else { 539 strings = EMPTY_STRINGS; 540 } 541 return this; 542 } 543 544 /** 545 * Modifies this set to represent the set specified by the given pattern. 546 * See the class description for the syntax of the pattern language. 547 * Whitespace is ignored. 548 * @param pattern a string specifying what characters are in the set 549 * @exception java.lang.IllegalArgumentException if the pattern 550 * contains a syntax error. 551 */ applyPattern(String pattern)552 public final UnicodeSet applyPattern(String pattern) { 553 checkFrozen(); 554 return applyPattern(pattern, null, null, IGNORE_SPACE); 555 } 556 557 /** 558 * Modifies this set to represent the set specified by the given pattern, 559 * optionally ignoring whitespace. 560 * See the class description for the syntax of the pattern language. 561 * @param pattern a string specifying what characters are in the set 562 * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored 563 * @exception java.lang.IllegalArgumentException if the pattern 564 * contains a syntax error. 
565 */ applyPattern(String pattern, boolean ignoreWhitespace)566 public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) { 567 checkFrozen(); 568 return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 569 } 570 571 /** 572 * Modifies this set to represent the set specified by the given pattern, 573 * optionally ignoring whitespace. 574 * See the class description for the syntax of the pattern language. 575 * @param pattern a string specifying what characters are in the set 576 * @param options a bitmask indicating which options to apply. 577 * Valid options are {@link #IGNORE_SPACE} and 578 * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 579 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 580 * @exception java.lang.IllegalArgumentException if the pattern 581 * contains a syntax error. 582 */ applyPattern(String pattern, int options)583 public UnicodeSet applyPattern(String pattern, int options) { 584 checkFrozen(); 585 return applyPattern(pattern, null, null, options); 586 } 587 588 /** 589 * Return true if the given position, in the given pattern, appears 590 * to be the start of a UnicodeSet pattern. 591 * @hide unsupported on Android 592 */ resemblesPattern(String pattern, int pos)593 public static boolean resemblesPattern(String pattern, int pos) { 594 return ((pos+1) < pattern.length() && 595 pattern.charAt(pos) == '[') || 596 resemblesPropertyPattern(pattern, pos); 597 } 598 599 /** 600 * TODO: create Appendable version of UTF16.append(buf, c), 601 * maybe in new class Appendables? 
602 * @throws IOException 603 */ appendCodePoint(Appendable app, int c)604 private static void appendCodePoint(Appendable app, int c) { 605 assert 0 <= c && c <= 0x10ffff; 606 try { 607 if (c <= 0xffff) { 608 app.append((char) c); 609 } else { 610 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c)); 611 } 612 } catch (IOException e) { 613 throw new ICUUncheckedIOException(e); 614 } 615 } 616 617 /** 618 * TODO: create class Appendables? 619 * @throws IOException 620 */ append(Appendable app, CharSequence s)621 private static void append(Appendable app, CharSequence s) { 622 try { 623 app.append(s); 624 } catch (IOException e) { 625 throw new ICUUncheckedIOException(e); 626 } 627 } 628 629 /** 630 * Append the <code>toPattern()</code> representation of a 631 * string to the given <code>Appendable</code>. 632 */ _appendToPat(T buf, String s, boolean escapeUnprintable)633 private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) { 634 int cp; 635 for (int i = 0; i < s.length(); i += Character.charCount(cp)) { 636 cp = s.codePointAt(i); 637 _appendToPat(buf, cp, escapeUnprintable); 638 } 639 return buf; 640 } 641 642 /** 643 * Append the <code>toPattern()</code> representation of a 644 * character to the given <code>Appendable</code>. 645 */ _appendToPat(T buf, int c, boolean escapeUnprintable)646 private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) { 647 try { 648 if (escapeUnprintable ? 
Utility.isUnprintable(c) : Utility.shouldAlwaysBeEscaped(c)) { 649 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything 650 // unprintable 651 return Utility.escape(buf, c); 652 } 653 // Okay to let ':' pass through 654 switch (c) { 655 case '[': // SET_OPEN: 656 case ']': // SET_CLOSE: 657 case '-': // HYPHEN: 658 case '^': // COMPLEMENT: 659 case '&': // INTERSECTION: 660 case '\\': //BACKSLASH: 661 case '{': 662 case '}': 663 case '$': 664 case ':': 665 buf.append('\\'); 666 break; 667 default: 668 // Escape whitespace 669 if (PatternProps.isWhiteSpace(c)) { 670 buf.append('\\'); 671 } 672 break; 673 } 674 appendCodePoint(buf, c); 675 return buf; 676 } catch (IOException e) { 677 throw new ICUUncheckedIOException(e); 678 } 679 } 680 _appendToPat( T result, int start, int end, boolean escapeUnprintable)681 private static <T extends Appendable> T _appendToPat( 682 T result, int start, int end, boolean escapeUnprintable) { 683 _appendToPat(result, start, escapeUnprintable); 684 if (start != end) { 685 if ((start+1) != end || 686 // Avoid writing what looks like a lead+trail surrogate pair. 687 start == 0xdbff) { 688 try { 689 result.append('-'); 690 } catch (IOException e) { 691 throw new ICUUncheckedIOException(e); 692 } 693 } 694 _appendToPat(result, end, escapeUnprintable); 695 } 696 return result; 697 } 698 699 /** 700 * Returns a string representation of this set. If the result of 701 * calling this function is passed to a UnicodeSet constructor, it 702 * will produce another set that is equal to this one. 703 */ 704 @Override toPattern(boolean escapeUnprintable)705 public String toPattern(boolean escapeUnprintable) { 706 if (pat != null && !escapeUnprintable) { 707 return pat; 708 } 709 StringBuilder result = new StringBuilder(); 710 return _toPattern(result, escapeUnprintable).toString(); 711 } 712 713 /** 714 * Append a string representation of this set to result. 
This will be 715 * a cleaned version of the string passed to applyPattern(), if there 716 * is one. Otherwise it will be generated. 717 */ _toPattern(T result, boolean escapeUnprintable)718 private <T extends Appendable> T _toPattern(T result, 719 boolean escapeUnprintable) { 720 if (pat == null) { 721 return appendNewPattern(result, escapeUnprintable, true); 722 } 723 try { 724 if (!escapeUnprintable) { 725 // TODO: The C++ version does not have this shortcut, and instead 726 // always cleans up the pattern string, 727 // which also escapes Utility.shouldAlwaysBeEscaped(c). 728 // We should sync these implementations. 729 result.append(pat); 730 return result; 731 } 732 boolean oddNumberOfBackslashes = false; 733 for (int i=0; i<pat.length(); ) { 734 int c = pat.codePointAt(i); 735 i += Character.charCount(c); 736 if (Utility.isUnprintable(c)) { 737 // If the unprintable character is preceded by an odd 738 // number of backslashes, then it has been escaped 739 // and we omit the last backslash. 740 Utility.escape(result, c); 741 oddNumberOfBackslashes = false; 742 } else if (!oddNumberOfBackslashes && c == '\\') { 743 // Temporarily withhold an odd-numbered backslash. 744 oddNumberOfBackslashes = true; 745 } else { 746 if (oddNumberOfBackslashes) { 747 result.append('\\'); 748 } 749 appendCodePoint(result, c); 750 oddNumberOfBackslashes = false; 751 } 752 } 753 if (oddNumberOfBackslashes) { 754 result.append('\\'); 755 } 756 return result; 757 } catch (IOException e) { 758 throw new ICUUncheckedIOException(e); 759 } 760 } 761 762 /** 763 * Generate and append a string representation of this set to result. 764 * This does not use this.pat, the cleaned up copy of the string 765 * passed to applyPattern(). 
*
     * @param result the buffer into which to generate the pattern
     * @param escapeUnprintable escape unprintable characters if true
     * @return result, for chaining (strings are included in the pattern)
     */
    public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) {
        return _generatePattern(result, escapeUnprintable, true);
    }

    /**
     * Generate and append a string representation of this set to result.
     * This does not use this.pat, the cleaned up copy of the string
     * passed to applyPattern().
     *
     * @param result the buffer into which to generate the pattern
     * @param escapeUnprintable escape unprintable characters if true
     * @param includeStrings if false, doesn't include the strings.
     * @return result, for chaining
     */
    public StringBuffer _generatePattern(StringBuffer result,
            boolean escapeUnprintable, boolean includeStrings) {
        return appendNewPattern(result, escapeUnprintable, includeStrings);
    }

    // Implementation of public _generatePattern().
    // Allows other callers to use a StringBuilder while the existing API is stuck with StringBuffer.
    //
    // Writes '[', then (optionally) '^' followed by the complement's ranges,
    // or the set's own ranges, then each string as {...} when includeStrings
    // is true, then ']'. Any IOException from the Appendable is rethrown as
    // ICUUncheckedIOException.
    private <T extends Appendable> T appendNewPattern(
            T result, boolean escapeUnprintable, boolean includeStrings) {
        try {
            result.append('[');

            int i = 0;
            int limit = len & ~1;  // = 2 * getRangeCount()

            // If the set contains at least 2 intervals and includes both
            // MIN_VALUE and MAX_VALUE, then the inverse representation will
            // be more economical.
            //     if (getRangeCount() >= 2 &&
            //             getRangeStart(0) == MIN_VALUE &&
            //             getRangeEnd(last) == MAX_VALUE)
            // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
            // If limit == len then len is even and the last range ends with MAX_VALUE.
            //
            // *But* do not write the inverse (complement) if there are strings.
            // Since ICU 70, the '^' performs a code point complement which removes all strings.
            if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
                // Emit the inverse
                result.append('^');
                // Offsetting the inversion list index by one lets us
                // iterate over the ranges of the set complement.
                i = 1;
                --limit;
            }

            // Emit the ranges as pairs.
            while (i < limit) {
                int start = list[i];  // getRangeStart()
                int end = list[i + 1] - 1;  // getRangeEnd() = range limit minus one
                if (!(0xd800 <= end && end <= 0xdbff)) {
                    // Common case: the range does not end on a lead surrogate.
                    _appendToPat(result, start, end, escapeUnprintable);
                    i += 2;
                } else {
                    // The range ends with a lead surrogate.
                    // Avoid writing what looks like a lead+trail surrogate pair.
                    // 1. Postpone ranges that start with a lead surrogate code point.
                    int firstLead = i;
                    while ((i += 2) < limit && list[i] <= 0xdbff) {}
                    int firstAfterLead = i;
                    // 2. Write following ranges that start with a trail surrogate code point.
                    while (i < limit && (start = list[i]) <= 0xdfff) {
                        _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
                        i += 2;
                    }
                    // 3. Now write the postponed ranges.
                    for (int j = firstLead; j < firstAfterLead; j += 2) {
                        _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
                    }
                }
            }

            // Strings follow the code point ranges, each wrapped in braces.
            if (includeStrings && hasStrings()) {
                for (String s : strings) {
                    result.append('{');
                    _appendToPat(result, s, escapeUnprintable);
                    result.append('}');
                }
            }
            result.append(']');
            return result;
        } catch (IOException e) {
            throw new ICUUncheckedIOException(e);
        }
    }

    /**
     * Returns the number of elements in this set (its cardinality)
     * Note that the elements of a set may include both individual
     * codepoints and strings.
     *
     * @return the number of elements in this set (its cardinality).
864 */ size()865 public int size() { 866 int n = 0; 867 int count = getRangeCount(); 868 for (int i = 0; i < count; ++i) { 869 n += getRangeEnd(i) - getRangeStart(i) + 1; 870 } 871 return n + strings.size(); 872 } 873 874 /** 875 * Returns <tt>true</tt> if this set contains no elements. 876 * 877 * @return <tt>true</tt> if this set contains no elements. 878 */ isEmpty()879 public boolean isEmpty() { 880 return len == 1 && !hasStrings(); 881 } 882 883 /** 884 * @return true if this set contains multi-character strings or the empty string. 885 */ hasStrings()886 public boolean hasStrings() { 887 return !strings.isEmpty(); 888 } 889 890 /** 891 * Implementation of UnicodeMatcher API. Returns <tt>true</tt> if 892 * this set contains any character whose low byte is the given 893 * value. This is used by <tt>RuleBasedTransliterator</tt> for 894 * indexing. 895 */ 896 @Override matchesIndexValue(int v)897 public boolean matchesIndexValue(int v) { 898 /* The index value v, in the range [0,255], is contained in this set if 899 * it is contained in any pair of this set. Pairs either have the high 900 * bytes equal, or unequal. If the high bytes are equal, then we have 901 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 902 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 903 * Then v is contained if xx <= v || v <= yy. (This is identical to the 904 * time zone month containment logic.) 
905 */ 906 for (int i=0; i<getRangeCount(); ++i) { 907 int low = getRangeStart(i); 908 int high = getRangeEnd(i); 909 if ((low & ~0xFF) == (high & ~0xFF)) { 910 if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 911 return true; 912 } 913 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 914 return true; 915 } 916 } 917 if (hasStrings()) { 918 for (String s : strings) { 919 if (s.isEmpty()) { 920 continue; // skip the empty string 921 } 922 int c = UTF16.charAt(s, 0); 923 if ((c & 0xFF) == v) { 924 return true; 925 } 926 } 927 } 928 return false; 929 } 930 931 /** 932 * Implementation of UnicodeMatcher.matches(). Always matches the 933 * longest possible multichar string. 934 */ 935 @Override matches(Replaceable text, int[] offset, int limit, boolean incremental)936 public int matches(Replaceable text, 937 int[] offset, 938 int limit, 939 boolean incremental) { 940 941 if (offset[0] == limit) { 942 if (contains(UnicodeMatcher.ETHER)) { 943 return incremental ? U_PARTIAL_MATCH : U_MATCH; 944 } else { 945 return U_MISMATCH; 946 } 947 } else { 948 if (hasStrings()) { // try strings first 949 950 // might separate forward and backward loops later 951 // for now they are combined 952 953 // TODO Improve efficiency of this, at least in the forward 954 // direction, if not in both. In the forward direction we 955 // can assume the strings are sorted. 956 957 boolean forward = offset[0] < limit; 958 959 // firstChar is the leftmost char to match in the 960 // forward direction or the rightmost char to match in 961 // the reverse direction. 962 char firstChar = text.charAt(offset[0]); 963 964 // If there are multiple strings that can match we 965 // return the longest match. 966 int highWaterLength = 0; 967 968 for (String trial : strings) { 969 if (trial.isEmpty()) { 970 continue; // skip the empty string 971 } 972 973 char c = trial.charAt(forward ? 0 : trial.length() - 1); 974 975 // Strings are sorted, so we can optimize in the 976 // forward direction. 
977 if (forward && c > firstChar) break; 978 if (c != firstChar) continue; 979 980 int length = matchRest(text, offset[0], limit, trial); 981 982 if (incremental) { 983 int maxLen = forward ? limit-offset[0] : offset[0]-limit; 984 if (length == maxLen) { 985 // We have successfully matched but only up to limit. 986 return U_PARTIAL_MATCH; 987 } 988 } 989 990 if (length == trial.length()) { 991 // We have successfully matched the whole string. 992 if (length > highWaterLength) { 993 highWaterLength = length; 994 } 995 // In the forward direction we know strings 996 // are sorted so we can bail early. 997 if (forward && length < highWaterLength) { 998 break; 999 } 1000 continue; 1001 } 1002 } 1003 1004 // We've checked all strings without a partial match. 1005 // If we have full matches, return the longest one. 1006 if (highWaterLength != 0) { 1007 offset[0] += forward ? highWaterLength : -highWaterLength; 1008 return U_MATCH; 1009 } 1010 } 1011 return super.matches(text, offset, limit, incremental); 1012 } 1013 } 1014 1015 /** 1016 * Returns the longest match for s in text at the given position. 1017 * If limit > start then match forward from start+1 to limit 1018 * matching all characters except s.charAt(0). If limit < start, 1019 * go backward starting from start-1 matching all characters 1020 * except s.charAt(s.length()-1). This method assumes that the 1021 * first character, text.charAt(start), matches s, so it does not 1022 * check it. 1023 * @param text the text to match 1024 * @param start the first character to match. In the forward 1025 * direction, text.charAt(start) is matched against s.charAt(0). 1026 * In the reverse direction, it is matched against 1027 * s.charAt(s.length()-1). 1028 * @param limit the limit offset for matching, either last+1 in 1029 * the forward direction, or last-1 in the reverse direction, 1030 * where last is the index of the last character to match. 
1031 * @return If part of s matches up to the limit, return |limit - 1032 * start|. If all of s matches before reaching the limit, return 1033 * s.length(). If there is a mismatch between s and text, return 1034 * 0 1035 */ matchRest(Replaceable text, int start, int limit, String s)1036 private static int matchRest (Replaceable text, int start, int limit, String s) { 1037 int maxLen; 1038 int slen = s.length(); 1039 if (start < limit) { 1040 maxLen = limit - start; 1041 if (maxLen > slen) maxLen = slen; 1042 for (int i = 1; i < maxLen; ++i) { 1043 if (text.charAt(start + i) != s.charAt(i)) return 0; 1044 } 1045 } else { 1046 maxLen = start - limit; 1047 if (maxLen > slen) maxLen = slen; 1048 --slen; // <=> slen = s.length() - 1; 1049 for (int i = 1; i < maxLen; ++i) { 1050 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 1051 } 1052 } 1053 return maxLen; 1054 } 1055 1056 /** 1057 * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1. 1058 * @deprecated This API is ICU internal only. 
* @hide original deprecated declaration
     * @hide draft / provisional / internal are hidden on Android
     */
    @Deprecated
    public int matchesAt(CharSequence text, int offset) {
        // Length of the best match so far; -1 means "nothing matched yet".
        int lastLen = -1;
        // Labeled block: "break strings" abandons string matching and falls
        // through to the single code point check below.
        strings:
            if (hasStrings()) {
                char firstChar = text.charAt(offset);
                String trial = null;
                // find the first string starting with firstChar
                // NOTE(review): when firstStringChar == firstChar neither test
                // below fires, so the loop keeps iterating; it ends either by
                // leaving the labeled block (a greater first char was seen) or
                // with the iterator exhausted and trial == the last string.
                // Confirm whether a break on equality was intended.
                // NOTE(review): trial.charAt(0) throws for an empty string;
                // other matchers in this class skip empty strings explicitly.
                Iterator<String> it = strings.iterator();
                while (it.hasNext()) {
                    trial = it.next();
                    char firstStringChar = trial.charAt(0);
                    if (firstStringChar < firstChar) continue;
                    if (firstStringChar > firstChar) break strings;
                }

                // now keep checking string until we get the longest one
                for (;;) {
                    int tempLen = matchesAt(text, offset, trial);
                    // Stop as soon as a candidate matches less than the previous one.
                    if (lastLen > tempLen) break strings;
                    lastLen = tempLen;
                    if (!it.hasNext()) break;
                    trial = it.next();
                }
            }

        // No string match of length >= 2: try the single code point at offset.
        if (lastLen < 2) {
            int cp = UTF16.charAt(text, offset);
            if (contains(cp)) lastLen = UTF16.getCharCount(cp);
        }

        // NOTE(review): if nothing matched, lastLen is still -1 and this
        // returns offset-1, which equals the documented -1 only for offset 0.
        return offset+lastLen;
    }

    /**
     * Does one string contain another, starting at a specific offset?
1098 * @param text text to match 1099 * @param offsetInText offset within that text 1100 * @param substring substring to match at offset in text 1101 * @return -1 if match fails, otherwise other.length() 1102 */ 1103 // Note: This method was moved from CollectionUtilities matchesAt(CharSequence text, int offsetInText, CharSequence substring)1104 private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) { 1105 int len = substring.length(); 1106 int textLength = text.length(); 1107 if (textLength + offsetInText > len) { 1108 return -1; 1109 } 1110 int i = 0; 1111 for (int j = offsetInText; i < len; ++i, ++j) { 1112 char pc = substring.charAt(i); 1113 char tc = text.charAt(j); 1114 if (pc != tc) return -1; 1115 } 1116 return i; 1117 } 1118 1119 /** 1120 * Implementation of UnicodeMatcher API. Union the set of all 1121 * characters that may be matched by this object into the given 1122 * set. 1123 * @param toUnionTo the set into which to union the source characters 1124 */ 1125 @Override addMatchSetTo(UnicodeSet toUnionTo)1126 public void addMatchSetTo(UnicodeSet toUnionTo) { 1127 toUnionTo.addAll(this); 1128 } 1129 1130 /** 1131 * Returns the index of the given character within this set, where 1132 * the set is ordered by ascending code point. If the character 1133 * is not in this set, return -1. The inverse of this method is 1134 * <code>charAt()</code>. 
1135 * @return an index from 0..size()-1, or -1 1136 */ indexOf(int c)1137 public int indexOf(int c) { 1138 if (c < MIN_VALUE || c > MAX_VALUE) { 1139 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1140 } 1141 int i = 0; 1142 int n = 0; 1143 for (;;) { 1144 int start = list[i++]; 1145 if (c < start) { 1146 return -1; 1147 } 1148 int limit = list[i++]; 1149 if (c < limit) { 1150 return n + c - start; 1151 } 1152 n += limit - start; 1153 } 1154 } 1155 1156 /** 1157 * Returns the character at the given index within this set, where 1158 * the set is ordered by ascending code point. If the index is 1159 * out of range, return -1. The inverse of this method is 1160 * <code>indexOf()</code>. 1161 * @param index an index from 0..size()-1 1162 * @return the character at the given index, or -1. 1163 */ charAt(int index)1164 public int charAt(int index) { 1165 if (index >= 0) { 1166 // len2 is the largest even integer <= len, that is, it is len 1167 // for even values and len-1 for odd values. With odd values 1168 // the last entry is UNICODESET_HIGH. 1169 int len2 = len & ~1; 1170 for (int i=0; i < len2;) { 1171 int start = list[i++]; 1172 int count = list[i++] - start; 1173 if (index < count) { 1174 return start + index; 1175 } 1176 index -= count; 1177 } 1178 } 1179 return -1; 1180 } 1181 1182 /** 1183 * Adds the specified range to this set if it is not already 1184 * present. If this set already contains the specified range, 1185 * the call leaves this set unchanged. If <code>start > end</code> 1186 * then an empty range is added, leaving the set unchanged. 1187 * 1188 * @param start first character, inclusive, of range to be added 1189 * to this set. 1190 * @param end last character, inclusive, of range to be added 1191 * to this set. 
*/
    public UnicodeSet add(int start, int end) {
        checkFrozen();
        return add_unchecked(start, end);
    }

    /**
     * Adds all characters in range (uses preferred naming convention).
     * @param start The index of where to start on adding all characters.
     * @param end The index of where to end on adding all characters.
     * @return a reference to this object
     */
    public UnicodeSet addAll(int start, int end) {
        checkFrozen();
        return add_unchecked(start, end);
    }

    // for internal use, after checkFrozen has been called
    //
    // Adds the inclusive range start..end. Throws IllegalArgumentException
    // if either bound is outside MIN_VALUE..MAX_VALUE. start > end adds
    // nothing; start == end delegates to add(int).
    private UnicodeSet add_unchecked(int start, int end) {
        if (start < MIN_VALUE || start > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
        }
        if (end < MIN_VALUE || end > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
        }
        if (start < end) {
            int limit = end + 1;
            // Fast path for adding a new range after the last one.
            // Odd list length: [..., lastStart, lastLimit, HIGH]
            if ((len & 1) != 0) {
                // If the list is empty, set lastLimit low enough to not be adjacent to 0.
                int lastLimit = len == 1 ? -2 : list[len - 2];
                if (lastLimit <= start) {
                    checkFrozen();
                    if (lastLimit == start) {
                        // Extend the last range.
                        list[len - 2] = limit;
                        if (limit == HIGH) {
                            // The extended range now reaches the end; its limit
                            // doubles as the terminator, so drop the old one.
                            --len;
                        }
                    } else {
                        // Overwrite the terminator with the new range start,
                        // then append the new limit and/or a fresh terminator.
                        list[len - 1] = start;
                        if (limit < HIGH) {
                            ensureCapacity(len + 2);
                            list[len++] = limit;
                            list[len++] = HIGH;
                        } else {  // limit == HIGH
                            ensureCapacity(len + 1);
                            list[len++] = HIGH;
                        }
                    }
                    pat = null;
                    return this;
                }
            }
            // This is slow. Could be much faster using findCodePoint(start)
            // and modifying the list, dealing with adjacent & overlapping ranges.
            add(range(start, end), 2, 0);
        } else if (start == end) {
            add(start);
        }
        return this;
    }

    //    /**
    //     * Format out the inversion list as a string, for debugging.  Uncomment when
    //     * needed.
    //     */
    //    public final String dump() {
    //        StringBuffer buf = new StringBuffer("[");
    //        for (int i=0; i<len; ++i) {
    //            if (i != 0) buf.append(", ");
    //            int c = list[i];
    //            //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
    //            //    buf.append((char) c);
    //            //} else {
    //            buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
    //            //}
    //        }
    //        buf.append("]");
    //        return buf.toString();
    //    }

    /**
     * Adds the specified character to this set if it is not already
     * present.  If this set already contains the specified character,
     * the call leaves this set unchanged.
     *
     * @param c the code point to add
     * @return this object, for chaining
     */
    public final UnicodeSet add(int c) {
        checkFrozen();
        return add_unchecked(c);
    }

    // for internal use only, after checkFrozen has been called
    //
    // Inserts the single code point c into the inversion list, merging it
    // with an adjacent range before and/or after where possible. Throws
    // IllegalArgumentException if c is outside MIN_VALUE..MAX_VALUE.
    private final UnicodeSet add_unchecked(int c) {
        if (c < MIN_VALUE || c > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
        }

        // find smallest i such that c < list[i]
        // if odd, then it is IN the set
        // if even, then it is OUT of the set
        int i = findCodePoint(c);

        // already in set?
        if ((i & 1) != 0) return this;

        // HIGH is 0x110000
        // assert(list[len-1] == HIGH);

        // empty = [HIGH]
        // [start_0, limit_0, start_1, limit_1, HIGH]

        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
        //                             ^
        //                             list[i]

        // i == 0 means c is before the first range
        // TODO: Is the "list[i]-1" a typo? Even if you pass MAX_VALUE into
        //      add_unchecked, the maximum value that "c" will be compared to
        //      is "MAX_VALUE-1" meaning that "if (c == MAX_VALUE)" will
        //      never be reached according to this logic.
        // NOTE(review): when list[i] is the terminator HIGH (MAX_VALUE + 1),
        //      list[i]-1 equals MAX_VALUE, so the c == MAX_VALUE branch below
        //      does look reachable; the TODO above may be outdated.
        if (c == list[i]-1) {
            // c is before start of next range
            list[i] = c;
            // if we touched the HIGH mark, then add a new one
            if (c == MAX_VALUE) {
                ensureCapacity(len+1);
                list[len++] = HIGH;
            }
            if (i > 0 && c == list[i-1]) {
                // collapse adjacent ranges

                // [..., start_k-1, c, c, limit_k, ..., HIGH]
                //                     ^
                //                     list[i]
                System.arraycopy(list, i+1, list, i-1, len-i-1);
                len -= 2;
            }
        }

        else if (i > 0 && c == list[i-1]) {
            // c is after end of prior range
            list[i-1]++;
            // no need to check for collapse here
        }

        else {
            // At this point we know the new char is not adjacent to
            // any existing ranges, and it is not 10FFFF.


            // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // Don't use ensureCapacity() to save on copying.
            // NOTE: This has no measurable impact on performance,
            // but it might help in some usage patterns.
            if (len+2 > list.length) {
                // Grow and open the two-slot gap at i in a single pass.
                int[] temp = new int[nextCapacity(len + 2)];
                if (i != 0) System.arraycopy(list, 0, temp, 0, i);
                System.arraycopy(list, i, temp, i+2, len-i);
                list = temp;
            } else {
                // Enough room: shift the tail right by two in place.
                System.arraycopy(list, i, list, i+2, len-i);
            }

            list[i] = c;
            list[i+1] = c+1;
            len += 2;
        }

        // The set changed, so any cached pattern string is stale.
        pat = null;
        return this;
    }

    /**
     * Adds the specified multicharacter to this set if it is not already
     * present.  If this set already contains the multicharacter,
     * the call leaves this set unchanged.
1377 * Thus "ch" => {"ch"} 1378 * 1379 * @param s the source string 1380 * @return this object, for chaining 1381 */ add(CharSequence s)1382 public final UnicodeSet add(CharSequence s) { 1383 checkFrozen(); 1384 int cp = getSingleCP(s); 1385 if (cp < 0) { 1386 String str = s.toString(); 1387 if (!strings.contains(str)) { 1388 addString(str); 1389 pat = null; 1390 } 1391 } else { 1392 add_unchecked(cp, cp); 1393 } 1394 return this; 1395 } 1396 addString(CharSequence s)1397 private void addString(CharSequence s) { 1398 if (strings == EMPTY_STRINGS) { 1399 strings = new TreeSet<>(); 1400 } 1401 strings.add(s.toString()); 1402 } 1403 1404 /** 1405 * Utility for getting code point from single code point CharSequence. 1406 * See the public UTF16.getSingleCodePoint() (which returns -1 for null rather than throwing NPE). 1407 * 1408 * @return a code point IF the string consists of a single one. 1409 * otherwise returns -1. 1410 * @param s to test 1411 */ getSingleCP(CharSequence s)1412 private static int getSingleCP(CharSequence s) { 1413 if (s.length() == 1) return s.charAt(0); 1414 if (s.length() == 2) { 1415 int cp = Character.codePointAt(s, 0); 1416 if (cp > 0xFFFF) { // is surrogate pair 1417 return cp; 1418 } 1419 } 1420 return -1; 1421 } 1422 1423 /** 1424 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1425 * If this set already any particular character, it has no effect on that character. 1426 * @param s the source string 1427 * @return this object, for chaining 1428 */ addAll(CharSequence s)1429 public final UnicodeSet addAll(CharSequence s) { 1430 checkFrozen(); 1431 int cp; 1432 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1433 cp = UTF16.charAt(s, i); 1434 add_unchecked(cp, cp); 1435 } 1436 return this; 1437 } 1438 1439 /** 1440 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1441 * If this set already any particular character, it has no effect on that character. 
1442 * @param s the source string 1443 * @return this object, for chaining 1444 */ retainAll(CharSequence s)1445 public final UnicodeSet retainAll(CharSequence s) { 1446 return retainAll(fromAll(s)); 1447 } 1448 1449 /** 1450 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1451 * If this set already any particular character, it has no effect on that character. 1452 * @param s the source string 1453 * @return this object, for chaining 1454 */ complementAll(CharSequence s)1455 public final UnicodeSet complementAll(CharSequence s) { 1456 return complementAll(fromAll(s)); 1457 } 1458 1459 /** 1460 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1461 * If this set already any particular character, it has no effect on that character. 1462 * @param s the source string 1463 * @return this object, for chaining 1464 */ removeAll(CharSequence s)1465 public final UnicodeSet removeAll(CharSequence s) { 1466 return removeAll(fromAll(s)); 1467 } 1468 1469 /** 1470 * Remove all strings from this UnicodeSet 1471 * @return this object, for chaining 1472 */ removeAllStrings()1473 public final UnicodeSet removeAllStrings() { 1474 checkFrozen(); 1475 if (hasStrings()) { 1476 strings.clear(); 1477 pat = null; 1478 } 1479 return this; 1480 } 1481 1482 /** 1483 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1484 * 1485 * @param s the source string 1486 * @return a newly created set containing the given string 1487 */ from(CharSequence s)1488 public static UnicodeSet from(CharSequence s) { 1489 return new UnicodeSet().add(s); 1490 } 1491 1492 1493 /** 1494 * Makes a set from each of the characters in the string. 
Thus "ch" => {"c", "h"} 1495 * @param s the source string 1496 * @return a newly created set containing the given characters 1497 */ fromAll(CharSequence s)1498 public static UnicodeSet fromAll(CharSequence s) { 1499 return new UnicodeSet().addAll(s); 1500 } 1501 1502 1503 /** 1504 * Retain only the elements in this set that are contained in the 1505 * specified range. If <code>start > end</code> then an empty range is 1506 * retained, leaving the set empty. 1507 * 1508 * @param start first character, inclusive, of range 1509 * @param end last character, inclusive, of range 1510 */ retain(int start, int end)1511 public UnicodeSet retain(int start, int end) { 1512 checkFrozen(); 1513 if (start < MIN_VALUE || start > MAX_VALUE) { 1514 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1515 } 1516 if (end < MIN_VALUE || end > MAX_VALUE) { 1517 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1518 } 1519 if (start <= end) { 1520 retain(range(start, end), 2, 0); 1521 } else { 1522 clear(); 1523 } 1524 return this; 1525 } 1526 1527 /** 1528 * Retain the specified character from this set if it is present. 1529 * Upon return this set will be empty if it did not contain c, or 1530 * will only contain c if it did contain c. 1531 * @param c the character to be retained 1532 * @return this object, for chaining 1533 */ retain(int c)1534 public final UnicodeSet retain(int c) { 1535 return retain(c, c); 1536 } 1537 1538 /** 1539 * Retain the specified string in this set if it is present. 1540 * Upon return this set will be empty if it did not contain s, or 1541 * will only contain s if it did contain s. 
1542 * @param cs the string to be retained 1543 * @return this object, for chaining 1544 */ retain(CharSequence cs)1545 public final UnicodeSet retain(CharSequence cs) { 1546 int cp = getSingleCP(cs); 1547 if (cp < 0) { 1548 checkFrozen(); 1549 String s = cs.toString(); 1550 boolean isIn = strings.contains(s); 1551 // Check for getRangeCount() first to avoid somewhat-expensive size() 1552 // when there are single code points. 1553 if (isIn && getRangeCount() == 0 && size() == 1) { 1554 return this; 1555 } 1556 clear(); 1557 if (isIn) { 1558 addString(s); 1559 } 1560 pat = null; 1561 } else { 1562 retain(cp, cp); 1563 } 1564 return this; 1565 } 1566 1567 /** 1568 * Removes the specified range from this set if it is present. 1569 * The set will not contain the specified range once the call 1570 * returns. If <code>start > end</code> then an empty range is 1571 * removed, leaving the set unchanged. 1572 * 1573 * @param start first character, inclusive, of range to be removed 1574 * from this set. 1575 * @param end last character, inclusive, of range to be removed 1576 * from this set. 1577 */ remove(int start, int end)1578 public UnicodeSet remove(int start, int end) { 1579 checkFrozen(); 1580 if (start < MIN_VALUE || start > MAX_VALUE) { 1581 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1582 } 1583 if (end < MIN_VALUE || end > MAX_VALUE) { 1584 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1585 } 1586 if (start <= end) { 1587 retain(range(start, end), 2, 2); 1588 } 1589 return this; 1590 } 1591 1592 /** 1593 * Removes the specified character from this set if it is present. 1594 * The set will not contain the specified character once the call 1595 * returns. 
1596 * @param c the character to be removed 1597 * @return this object, for chaining 1598 */ remove(int c)1599 public final UnicodeSet remove(int c) { 1600 return remove(c, c); 1601 } 1602 1603 /** 1604 * Removes the specified string from this set if it is present. 1605 * The set will not contain the specified string once the call 1606 * returns. 1607 * @param s the string to be removed 1608 * @return this object, for chaining 1609 */ remove(CharSequence s)1610 public final UnicodeSet remove(CharSequence s) { 1611 int cp = getSingleCP(s); 1612 if (cp < 0) { 1613 checkFrozen(); 1614 String str = s.toString(); 1615 if (strings.contains(str)) { 1616 strings.remove(str); 1617 pat = null; 1618 } 1619 } else { 1620 remove(cp, cp); 1621 } 1622 return this; 1623 } 1624 1625 /** 1626 * Complements the specified range in this set. Any character in 1627 * the range will be removed if it is in this set, or will be 1628 * added if it is not in this set. If <code>start > end</code> 1629 * then an empty range is complemented, leaving the set unchanged. 1630 * 1631 * @param start first character, inclusive, of range 1632 * @param end last character, inclusive, of range 1633 */ complement(int start, int end)1634 public UnicodeSet complement(int start, int end) { 1635 checkFrozen(); 1636 if (start < MIN_VALUE || start > MAX_VALUE) { 1637 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1638 } 1639 if (end < MIN_VALUE || end > MAX_VALUE) { 1640 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1641 } 1642 if (start <= end) { 1643 xor(range(start, end), 2, 0); 1644 } 1645 pat = null; 1646 return this; 1647 } 1648 1649 /** 1650 * Complements the specified character in this set. The character 1651 * will be removed if it is in this set, or will be added if it is 1652 * not in this set. 
1653 */ complement(int c)1654 public final UnicodeSet complement(int c) { 1655 return complement(c, c); 1656 } 1657 1658 /** 1659 * This is equivalent to 1660 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1661 * 1662 * <p><strong>Note:</strong> This performs a symmetric difference with all code points 1663 * <em>and thus retains all multicharacter strings</em>. 1664 * In order to achieve a “code point complement” (all code points minus this set), 1665 * the easiest is to .{@link #complement()}.{@link #removeAllStrings()} . 1666 */ complement()1667 public UnicodeSet complement() { 1668 checkFrozen(); 1669 if (list[0] == LOW) { 1670 System.arraycopy(list, 1, list, 0, len-1); 1671 --len; 1672 } else { 1673 ensureCapacity(len+1); 1674 System.arraycopy(list, 0, list, 1, len); 1675 list[0] = LOW; 1676 ++len; 1677 } 1678 pat = null; 1679 return this; 1680 } 1681 1682 /** 1683 * Complement the specified string in this set. 1684 * The set will not contain the specified string once the call 1685 * returns. 1686 * 1687 * @param s the string to complement 1688 * @return this object, for chaining 1689 */ complement(CharSequence s)1690 public final UnicodeSet complement(CharSequence s) { 1691 checkFrozen(); 1692 int cp = getSingleCP(s); 1693 if (cp < 0) { 1694 String s2 = s.toString(); 1695 if (strings.contains(s2)) { 1696 strings.remove(s2); 1697 } else { 1698 addString(s2); 1699 } 1700 pat = null; 1701 } else { 1702 complement(cp, cp); 1703 } 1704 return this; 1705 } 1706 1707 /** 1708 * Returns true if this set contains the given character. 
* @param c character to be checked for containment
     * @return true if the test condition is met
     * @throws IllegalArgumentException if c is not in MIN_VALUE..MAX_VALUE
     */
    @Override
    public boolean contains(int c) {
        if (c < MIN_VALUE || c > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
        }
        // Delegate to the helper structures when they are present
        // (presumably set up by freeze() — TODO confirm).
        if (bmpSet != null) {
            return bmpSet.contains(c);
        }
        if (stringSpan != null) {
            return stringSpan.contains(c);
        }

        /*
        // Set i to the index of the start item greater than ch
        // We know we will terminate without length test!
        int i = -1;
        while (true) {
            if (c < list[++i]) break;
        }
         */

        int i = findCodePoint(c);

        // An odd index means c lies between a range start and its limit.
        return ((i & 1) != 0); // return true if odd
    }

    /**
     * Returns the smallest value i such that c < list[i].  Caller
     * must ensure that c is a legal value or this method will enter
     * an infinite loop.  This method performs a binary search.
     * @param c a character in the range MIN_VALUE..MAX_VALUE
     * inclusive
     * @return the smallest integer i in the range 0..len-1,
     * inclusive, such that c < list[i]
     */
    private final int findCodePoint(int c) {
        /* Examples:
                                           findCodePoint(c)
           set              list[]         c=0 1 3 4 7 8
           ===              ==============   ===========
           []               [110000]         0 0 0 0 0 0
           [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
           [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
           [:all:]          [0, 110000]      1 1 1 1 1 1
         */

        // Return the smallest i such that c < list[i].  Assume
        // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
        if (c < list[0]) return 0;
        // High runner test.  c is often after the last range, so an
        // initial check for this condition pays off.
        if (len >= 2 && c >= list[len-2]) return len-1;
        int lo = 0;
        int hi = len - 1;
        // invariant: c >= list[lo]
        // invariant: c < list[hi]
        for (;;) {
            // Unsigned shift gives the midpoint of lo and hi.
            int i = (lo + hi) >>> 1;
            // lo and hi are adjacent: hi is the first index with c < list[hi].
            if (i == lo) return hi;
            if (c < list[i]) {
                hi = i;
            } else {
                lo = i;
            }
        }
    }

    //    //----------------------------------------------------------------
    //    // Unrolled binary search
    //    //----------------------------------------------------------------
    //
    //    private int validLen = -1; // validated value of len
    //    private int topOfLow;
    //    private int topOfHigh;
    //    private int power;
    //    private int deltaStart;
    //
    //    private void validate() {
    //        if (len <= 1) {
    //            throw new IllegalArgumentException("list.len==" + len + "; must be >1");
    //        }
    //
    //        // find greatest power of 2 less than or equal to len
    //        for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
    //
    //        // assert(exp2[power] <= len);
    //
    //        // determine the starting points
    //        topOfLow = exp2[power] - 1;
    //        topOfHigh = len - 1;
    //        deltaStart = exp2[power-1];
    //        validLen = len;
    //    }
    //
    //    private static final int exp2[] = {
    //        0x1, 0x2, 0x4, 0x8,
    //        0x10, 0x20, 0x40, 0x80,
    //        0x100, 0x200, 0x400, 0x800,
    //        0x1000, 0x2000, 0x4000, 0x8000,
    //        0x10000, 0x20000, 0x40000, 0x80000,
    //        0x100000, 0x200000, 0x400000, 0x800000,
    //        0x1000000, 0x2000000, 0x4000000, 0x8000000,
    //        0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
    //    };
    //
    //    /**
    //     * Unrolled lowest index GT.
    //     */
    //    private final int leastIndexGT(int searchValue) {
    //
    //        if (len != validLen) {
    //            if (len == 1) return 0;
    //            validate();
    //        }
    //        int temp;
    //
    //        // set up initial range to search.
    //        // Each subrange is a power of two in length
    //        int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
    //
    //        // Completely unrolled binary search, following "Programming Pearls"
    //        // Each case deliberately falls through to the next
    //        // Logically, list[-1] < all_search_values && list[count] > all_search_values
    //        // although the values -1 and count are never actually touched.
    //
    //        // The bounds at each point are low & high,
    //        // where low == high - delta*2
    //        // so high - delta is the midpoint
    //
    //        // The invariant AFTER each line is that list[low] < searchValue <= list[high]
    //
    //        switch (power) {
    //        //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
    //        case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
    //        case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
    //
    //        case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
    //        case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
    //        case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
    //        case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
    //
    //        case 24: if (searchValue < list[temp = high-  0x800000]) high = temp;
    //        case 23: if (searchValue < list[temp = high-  0x400000]) high = temp;
    //        case 22: if (searchValue < list[temp = high-  0x200000]) high = temp;
    //        case 21: if (searchValue < list[temp = high-  0x100000]) high = temp;
    //
    //        case 20: if (searchValue < list[temp = high-   0x80000]) high = temp;
    //        case 19: if (searchValue < list[temp = high-   0x40000]) high = temp;
    //        case 18: if (searchValue < list[temp = high-   0x20000]) high = temp;
    //        case 17: if (searchValue < list[temp = high-   0x10000]) high = temp;
    //
    //        case 16: if (searchValue < list[temp = high-    0x8000]) high = temp;
    //        case 15:
if (searchValue < list[temp = high- 0x4000]) high = temp; 1864 // case 14: if (searchValue < list[temp = high- 0x2000]) high = temp; 1865 // case 13: if (searchValue < list[temp = high- 0x1000]) high = temp; 1866 // 1867 // case 12: if (searchValue < list[temp = high- 0x800]) high = temp; 1868 // case 11: if (searchValue < list[temp = high- 0x400]) high = temp; 1869 // case 10: if (searchValue < list[temp = high- 0x200]) high = temp; 1870 // case 9: if (searchValue < list[temp = high- 0x100]) high = temp; 1871 // 1872 // case 8: if (searchValue < list[temp = high- 0x80]) high = temp; 1873 // case 7: if (searchValue < list[temp = high- 0x40]) high = temp; 1874 // case 6: if (searchValue < list[temp = high- 0x20]) high = temp; 1875 // case 5: if (searchValue < list[temp = high- 0x10]) high = temp; 1876 // 1877 // case 4: if (searchValue < list[temp = high- 0x8]) high = temp; 1878 // case 3: if (searchValue < list[temp = high- 0x4]) high = temp; 1879 // case 2: if (searchValue < list[temp = high- 0x2]) high = temp; 1880 // case 1: if (searchValue < list[temp = high- 0x1]) high = temp; 1881 // } 1882 // 1883 // return high; 1884 // } 1885 // 1886 // // For debugging only 1887 // public int len() { 1888 // return len; 1889 // } 1890 // 1891 // //---------------------------------------------------------------- 1892 // //---------------------------------------------------------------- 1893 1894 /** 1895 * Returns true if this set contains every character 1896 * of the given range. 
1897 * @param start first character, inclusive, of the range 1898 * @param end last character, inclusive, of the range 1899 * @return true if the test condition is met 1900 */ contains(int start, int end)1901 public boolean contains(int start, int end) { 1902 if (start < MIN_VALUE || start > MAX_VALUE) { 1903 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1904 } 1905 if (end < MIN_VALUE || end > MAX_VALUE) { 1906 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1907 } 1908 //int i = -1; 1909 //while (true) { 1910 // if (start < list[++i]) break; 1911 //} 1912 int i = findCodePoint(start); 1913 return ((i & 1) != 0 && end < list[i]); 1914 } 1915 1916 /** 1917 * Returns <tt>true</tt> if this set contains the given 1918 * multicharacter string. 1919 * @param s string to be checked for containment 1920 * @return <tt>true</tt> if this set contains the specified string 1921 */ contains(CharSequence s)1922 public final boolean contains(CharSequence s) { 1923 1924 int cp = getSingleCP(s); 1925 if (cp < 0) { 1926 return strings.contains(s.toString()); 1927 } else { 1928 return contains(cp); 1929 } 1930 } 1931 1932 /** 1933 * Returns true if this set contains all the characters and strings 1934 * of the given set. 1935 * @param b set to be checked for containment 1936 * @return true if the test condition is met 1937 */ containsAll(UnicodeSet b)1938 public boolean containsAll(UnicodeSet b) { 1939 // The specified set is a subset if all of its pairs are contained in 1940 // this set. This implementation accesses the lists directly for speed. 1941 // TODO: this could be faster if size() were cached. But that would affect building speed 1942 // so it needs investigation. 
1943 int[] listB = b.list; 1944 boolean needA = true; 1945 boolean needB = true; 1946 int aPtr = 0; 1947 int bPtr = 0; 1948 int aLen = len - 1; 1949 int bLen = b.len - 1; 1950 int startA = 0, startB = 0, limitA = 0, limitB = 0; 1951 while (true) { 1952 // double iterations are such a pain... 1953 if (needA) { 1954 if (aPtr >= aLen) { 1955 // ran out of A. If B is also exhausted, then break; 1956 if (needB && bPtr >= bLen) { 1957 break; 1958 } 1959 return false; 1960 } 1961 startA = list[aPtr++]; 1962 limitA = list[aPtr++]; 1963 } 1964 if (needB) { 1965 if (bPtr >= bLen) { 1966 // ran out of B. Since we got this far, we have an A and we are ok so far 1967 break; 1968 } 1969 startB = listB[bPtr++]; 1970 limitB = listB[bPtr++]; 1971 } 1972 // if B doesn't overlap and is greater than A, get new A 1973 if (startB >= limitA) { 1974 needA = true; 1975 needB = false; 1976 continue; 1977 } 1978 // if B is wholy contained in A, then get a new B 1979 if (startB >= startA && limitB <= limitA) { 1980 needA = false; 1981 needB = true; 1982 continue; 1983 } 1984 // all other combinations mean we fail 1985 return false; 1986 } 1987 1988 if (!strings.containsAll(b.strings)) return false; 1989 return true; 1990 } 1991 1992 // /** 1993 // * Returns true if this set contains all the characters and strings 1994 // * of the given set. 1995 // * @param c set to be checked for containment 1996 // * @return true if the test condition is met 1997 // * @stable ICU 2.0 1998 // */ 1999 // public boolean containsAllOld(UnicodeSet c) { 2000 // // The specified set is a subset if all of its pairs are contained in 2001 // // this set. It's possible to code this more efficiently in terms of 2002 // // direct manipulation of the inversion lists if the need arises. 
2003 // int n = c.getRangeCount(); 2004 // for (int i=0; i<n; ++i) { 2005 // if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 2006 // return false; 2007 // } 2008 // } 2009 // if (!strings.containsAll(c.strings)) return false; 2010 // return true; 2011 // } 2012 2013 /** 2014 * Returns true if there is a partition of the string such that this set contains each of the partitioned strings. 2015 * For example, for the Unicode set [a{bc}{cd}]<br> 2016 * containsAll is true for each of: "a", "bc", ""cdbca"<br> 2017 * containsAll is false for each of: "acb", "bcda", "bcx"<br> 2018 * @param s string containing characters to be checked for containment 2019 * @return true if the test condition is met 2020 */ containsAll(String s)2021 public boolean containsAll(String s) { 2022 int cp; 2023 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 2024 cp = UTF16.charAt(s, i); 2025 if (!contains(cp)) { 2026 if (!hasStrings()) { 2027 return false; 2028 } 2029 return containsAll(s, 0); 2030 } 2031 } 2032 return true; 2033 } 2034 2035 /** 2036 * Recursive routine called if we fail to find a match in containsAll, and there are strings 2037 * @param s source string 2038 * @param i point to match to the end on 2039 * @return true if ok 2040 */ containsAll(String s, int i)2041 private boolean containsAll(String s, int i) { 2042 if (i >= s.length()) { 2043 return true; 2044 } 2045 int cp= UTF16.charAt(s, i); 2046 if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) { 2047 return true; 2048 } 2049 for (String setStr : strings) { 2050 if (!setStr.isEmpty() && // skip the empty string 2051 s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) { 2052 return true; 2053 } 2054 } 2055 return false; 2056 2057 } 2058 2059 /** 2060 * Get the Regex equivalent for this UnicodeSet 2061 * @return regex pattern equivalent to this UnicodeSet 2062 * @deprecated This API is ICU internal only. 
2063 * @hide original deprecated declaration 2064 * @hide draft / provisional / internal are hidden on Android 2065 */ 2066 @Deprecated getRegexEquivalent()2067 public String getRegexEquivalent() { 2068 if (!hasStrings()) { 2069 return toString(); 2070 } 2071 StringBuilder result = new StringBuilder("(?:"); 2072 appendNewPattern(result, true, false); 2073 for (String s : strings) { 2074 result.append('|'); 2075 _appendToPat(result, s, true); 2076 } 2077 return result.append(")").toString(); 2078 } 2079 2080 /** 2081 * Returns true if this set contains none of the characters 2082 * of the given range. 2083 * @param start first character, inclusive, of the range 2084 * @param end last character, inclusive, of the range 2085 * @return true if the test condition is met 2086 */ containsNone(int start, int end)2087 public boolean containsNone(int start, int end) { 2088 if (start < MIN_VALUE || start > MAX_VALUE) { 2089 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 2090 } 2091 if (end < MIN_VALUE || end > MAX_VALUE) { 2092 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 2093 } 2094 int i = -1; 2095 while (true) { 2096 if (start < list[++i]) break; 2097 } 2098 return ((i & 1) == 0 && end < list[i]); 2099 } 2100 2101 /** 2102 * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2103 * For example, for the Unicode set [a{bc}{cd}]<br> 2104 * containsNone is true for: "xy", "cb"<br> 2105 * containsNone is false for: "a", "bc", "bcd"<br> 2106 * @param b set to be checked for containment 2107 * @return true if the test condition is met 2108 */ containsNone(UnicodeSet b)2109 public boolean containsNone(UnicodeSet b) { 2110 // The specified set is a subset if some of its pairs overlap with some of this set's pairs. 2111 // This implementation accesses the lists directly for speed. 
2112 int[] listB = b.list; 2113 boolean needA = true; 2114 boolean needB = true; 2115 int aPtr = 0; 2116 int bPtr = 0; 2117 int aLen = len - 1; 2118 int bLen = b.len - 1; 2119 int startA = 0, startB = 0, limitA = 0, limitB = 0; 2120 while (true) { 2121 // double iterations are such a pain... 2122 if (needA) { 2123 if (aPtr >= aLen) { 2124 // ran out of A: break so we test strings 2125 break; 2126 } 2127 startA = list[aPtr++]; 2128 limitA = list[aPtr++]; 2129 } 2130 if (needB) { 2131 if (bPtr >= bLen) { 2132 // ran out of B: break so we test strings 2133 break; 2134 } 2135 startB = listB[bPtr++]; 2136 limitB = listB[bPtr++]; 2137 } 2138 // if B is higher than any part of A, get new A 2139 if (startB >= limitA) { 2140 needA = true; 2141 needB = false; 2142 continue; 2143 } 2144 // if A is higher than any part of B, get new B 2145 if (startA >= limitB) { 2146 needA = false; 2147 needB = true; 2148 continue; 2149 } 2150 // all other combinations mean we fail 2151 return false; 2152 } 2153 2154 if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false; 2155 return true; 2156 } 2157 2158 // /** 2159 // * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2160 // * For example, for the Unicode set [a{bc}{cd}]<br> 2161 // * containsNone is true for: "xy", "cb"<br> 2162 // * containsNone is false for: "a", "bc", "bcd"<br> 2163 // * @param c set to be checked for containment 2164 // * @return true if the test condition is met 2165 // * @stable ICU 2.0 2166 // */ 2167 // public boolean containsNoneOld(UnicodeSet c) { 2168 // // The specified set is a subset if all of its pairs are contained in 2169 // // this set. It's possible to code this more efficiently in terms of 2170 // // direct manipulation of the inversion lists if the need arises. 
2171 // int n = c.getRangeCount(); 2172 // for (int i=0; i<n; ++i) { 2173 // if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 2174 // return false; 2175 // } 2176 // } 2177 // if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false; 2178 // return true; 2179 // } 2180 2181 /** 2182 * Returns true if this set contains none of the characters 2183 * of the given string. 2184 * @param s string containing characters to be checked for containment 2185 * @return true if the test condition is met 2186 */ containsNone(CharSequence s)2187 public boolean containsNone(CharSequence s) { 2188 return span(s, SpanCondition.NOT_CONTAINED) == s.length(); 2189 } 2190 2191 /** 2192 * Returns true if this set contains one or more of the characters 2193 * in the given range. 2194 * @param start first character, inclusive, of the range 2195 * @param end last character, inclusive, of the range 2196 * @return true if the condition is met 2197 */ containsSome(int start, int end)2198 public final boolean containsSome(int start, int end) { 2199 return !containsNone(start, end); 2200 } 2201 2202 /** 2203 * Returns true if this set contains one or more of the characters 2204 * and strings of the given set. 2205 * @param s set to be checked for containment 2206 * @return true if the condition is met 2207 */ containsSome(UnicodeSet s)2208 public final boolean containsSome(UnicodeSet s) { 2209 return !containsNone(s); 2210 } 2211 2212 /** 2213 * Returns true if this set contains one or more of the characters 2214 * of the given string. 2215 * @param s string containing characters to be checked for containment 2216 * @return true if the condition is met 2217 */ containsSome(CharSequence s)2218 public final boolean containsSome(CharSequence s) { 2219 return !containsNone(s); 2220 } 2221 2222 2223 /** 2224 * Adds all of the elements in the specified set to this set if 2225 * they're not already present. 
This operation effectively 2226 * modifies this set so that its value is the <i>union</i> of the two 2227 * sets. The behavior of this operation is unspecified if the specified 2228 * collection is modified while the operation is in progress. 2229 * 2230 * @param c set whose elements are to be added to this set. 2231 */ addAll(UnicodeSet c)2232 public UnicodeSet addAll(UnicodeSet c) { 2233 checkFrozen(); 2234 add(c.list, c.len, 0); 2235 if (c.hasStrings()) { 2236 if (strings == EMPTY_STRINGS) { 2237 strings = new TreeSet<>(c.strings); 2238 } else { 2239 strings.addAll(c.strings); 2240 } 2241 } 2242 return this; 2243 } 2244 2245 /** 2246 * Retains only the elements in this set that are contained in the 2247 * specified set. In other words, removes from this set all of 2248 * its elements that are not contained in the specified set. This 2249 * operation effectively modifies this set so that its value is 2250 * the <i>intersection</i> of the two sets. 2251 * 2252 * @param c set that defines which elements this set will retain. 2253 */ retainAll(UnicodeSet c)2254 public UnicodeSet retainAll(UnicodeSet c) { 2255 checkFrozen(); 2256 retain(c.list, c.len, 0); 2257 if (hasStrings()) { 2258 if (!c.hasStrings()) { 2259 strings.clear(); 2260 } else { 2261 strings.retainAll(c.strings); 2262 } 2263 } 2264 return this; 2265 } 2266 2267 /** 2268 * Removes from this set all of its elements that are contained in the 2269 * specified set. This operation effectively modifies this 2270 * set so that its value is the <i>asymmetric set difference</i> of 2271 * the two sets. 2272 * 2273 * @param c set that defines which elements will be removed from 2274 * this set. 
2275 */ removeAll(UnicodeSet c)2276 public UnicodeSet removeAll(UnicodeSet c) { 2277 checkFrozen(); 2278 retain(c.list, c.len, 2); 2279 if (hasStrings() && c.hasStrings()) { 2280 strings.removeAll(c.strings); 2281 } 2282 return this; 2283 } 2284 2285 /** 2286 * Complements in this set all elements contained in the specified 2287 * set. Any character in the other set will be removed if it is 2288 * in this set, or will be added if it is not in this set. 2289 * 2290 * @param c set that defines which elements will be complemented from 2291 * this set. 2292 */ complementAll(UnicodeSet c)2293 public UnicodeSet complementAll(UnicodeSet c) { 2294 checkFrozen(); 2295 xor(c.list, c.len, 0); 2296 if (c.hasStrings()) { 2297 if (strings == EMPTY_STRINGS) { 2298 strings = new TreeSet<>(c.strings); 2299 } else { 2300 SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings); 2301 } 2302 } 2303 return this; 2304 } 2305 2306 /** 2307 * Removes all of the elements from this set. This set will be 2308 * empty after this call returns. 2309 */ clear()2310 public UnicodeSet clear() { 2311 checkFrozen(); 2312 list[0] = HIGH; 2313 len = 1; 2314 pat = null; 2315 if (hasStrings()) { 2316 strings.clear(); 2317 } 2318 return this; 2319 } 2320 2321 /** 2322 * Iteration method that returns the number of ranges contained in 2323 * this set. 2324 * @see #getRangeStart 2325 * @see #getRangeEnd 2326 */ getRangeCount()2327 public int getRangeCount() { 2328 return len/2; 2329 } 2330 2331 /** 2332 * Iteration method that returns the first character in the 2333 * specified range of this set. 
2334 * @exception ArrayIndexOutOfBoundsException if index is outside 2335 * the range <code>0..getRangeCount()-1</code> 2336 * @see #getRangeCount 2337 * @see #getRangeEnd 2338 */ getRangeStart(int index)2339 public int getRangeStart(int index) { 2340 return list[index*2]; 2341 } 2342 2343 /** 2344 * Iteration method that returns the last character in the 2345 * specified range of this set. 2346 * @exception ArrayIndexOutOfBoundsException if index is outside 2347 * the range <code>0..getRangeCount()-1</code> 2348 * @see #getRangeStart 2349 * @see #getRangeEnd 2350 */ getRangeEnd(int index)2351 public int getRangeEnd(int index) { 2352 return (list[index*2 + 1] - 1); 2353 } 2354 2355 /** 2356 * Reallocate this objects internal structures to take up the least 2357 * possible space, without changing this object's value. 2358 */ compact()2359 public UnicodeSet compact() { 2360 checkFrozen(); 2361 if ((len + 7) < list.length) { 2362 // If we have more than a little unused capacity, shrink it to len. 2363 list = Arrays.copyOf(list, len); 2364 } 2365 rangeList = null; 2366 buffer = null; 2367 if (strings != EMPTY_STRINGS && strings.isEmpty()) { 2368 strings = EMPTY_STRINGS; 2369 } 2370 return this; 2371 } 2372 2373 /** 2374 * Compares the specified object with this set for equality. Returns 2375 * <tt>true</tt> if the specified object is also a set, the two sets 2376 * have the same size, and every member of the specified set is 2377 * contained in this set (or equivalently, every member of this set is 2378 * contained in the specified set). 2379 * 2380 * @param o Object to be compared for equality with this set. 2381 * @return <tt>true</tt> if the specified Object is equal to this set. 
2382 */ 2383 @Override equals(Object o)2384 public boolean equals(Object o) { 2385 if (o == null) { 2386 return false; 2387 } 2388 if (this == o) { 2389 return true; 2390 } 2391 try { 2392 UnicodeSet that = (UnicodeSet) o; 2393 if (len != that.len) return false; 2394 for (int i = 0; i < len; ++i) { 2395 if (list[i] != that.list[i]) return false; 2396 } 2397 if (!strings.equals(that.strings)) return false; 2398 } catch (Exception e) { 2399 return false; 2400 } 2401 return true; 2402 } 2403 2404 /** 2405 * Returns the hash code value for this set. 2406 * 2407 * @return the hash code value for this set. 2408 * @see java.lang.Object#hashCode() 2409 */ 2410 @Override hashCode()2411 public int hashCode() { 2412 int result = len; 2413 for (int i = 0; i < len; ++i) { 2414 result *= 1000003; 2415 result += list[i]; 2416 } 2417 return result; 2418 } 2419 2420 /** 2421 * Return a programmer-readable string representation of this object. 2422 */ 2423 @Override toString()2424 public String toString() { 2425 return toPattern(true); 2426 } 2427 2428 //---------------------------------------------------------------- 2429 // Implementation: Pattern parsing 2430 //---------------------------------------------------------------- 2431 2432 /** 2433 * Parses the given pattern, starting at the given position. The character 2434 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. 2435 * Parsing continues until the corresponding closing ']'. If a syntax error 2436 * is encountered between the opening and closing brace, the parse fails. 2437 * Upon return from a successful parse, the ParsePosition is updated to 2438 * point to the character following the closing ']', and an inversion 2439 * list for the parsed pattern is returned. This method 2440 * calls itself recursively to parse embedded subpatterns. 2441 * 2442 * @param pattern the string containing the pattern to be parsed. 
The 2443 * portion of the string from pos.getIndex(), which must be a '[', to the 2444 * corresponding closing ']', is parsed. 2445 * @param pos upon entry, the position at which to being parsing. The 2446 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return 2447 * from a successful parse, pos.getIndex() is either the character after the 2448 * closing ']' of the parsed pattern, or pattern.length() if the closing ']' 2449 * is the last character of the pattern string. 2450 * @return an inversion list for the parsed substring 2451 * of <code>pattern</code> 2452 * @exception java.lang.IllegalArgumentException if the parse fails. 2453 * @deprecated This API is ICU internal only. 2454 * @hide original deprecated declaration 2455 * @hide draft / provisional / internal are hidden on Android 2456 */ 2457 @Deprecated applyPattern(String pattern, ParsePosition pos, SymbolTable symbols, int options)2458 public UnicodeSet applyPattern(String pattern, 2459 ParsePosition pos, 2460 SymbolTable symbols, 2461 int options) { 2462 2463 // Need to build the pattern in a temporary string because 2464 // _applyPattern calls add() etc., which set pat to empty. 
2465 boolean parsePositionWasNull = pos == null; 2466 if (parsePositionWasNull) { 2467 pos = new ParsePosition(0); 2468 } 2469 2470 StringBuilder rebuiltPat = new StringBuilder(); 2471 RuleCharacterIterator chars = 2472 new RuleCharacterIterator(pattern, symbols, pos); 2473 applyPattern(chars, symbols, rebuiltPat, options, 0); 2474 if (chars.inVariable()) { 2475 syntaxError(chars, "Extra chars in variable value"); 2476 } 2477 pat = rebuiltPat.toString(); 2478 if (parsePositionWasNull) { 2479 int i = pos.getIndex(); 2480 2481 // Skip over trailing whitespace 2482 if ((options & IGNORE_SPACE) != 0) { 2483 i = PatternProps.skipWhiteSpace(pattern, i); 2484 } 2485 2486 if (i != pattern.length()) { 2487 throw new IllegalArgumentException("Parse of \"" + pattern + 2488 "\" failed at " + i); 2489 } 2490 } 2491 return this; 2492 } 2493 2494 // Add constants to make the applyPattern() code easier to follow. 2495 2496 private static final int LAST0_START = 0, 2497 LAST1_RANGE = 1, 2498 LAST2_SET = 2; 2499 2500 private static final int MODE0_NONE = 0, 2501 MODE1_INBRACKET = 1, 2502 MODE2_OUTBRACKET = 2; 2503 2504 private static final int SETMODE0_NONE = 0, 2505 SETMODE1_UNICODESET = 1, 2506 SETMODE2_PROPERTYPAT = 2, 2507 SETMODE3_PREPARSED = 3; 2508 2509 private static final int MAX_DEPTH = 100; 2510 2511 /** 2512 * Parse the pattern from the given RuleCharacterIterator. The 2513 * iterator is advanced over the parsed pattern. 2514 * @param chars iterator over the pattern characters. Upon return 2515 * it will be advanced to the first character after the parsed 2516 * pattern, or the end of the iteration if all characters are 2517 * parsed. 2518 * @param symbols symbol table to use to parse and dereference 2519 * variables, or null if none. 2520 * @param rebuiltPat the pattern that was parsed, rebuilt or 2521 * copied from the input pattern, as appropriate. 2522 * @param options a bit mask. 
2523 * Valid options are {@link #IGNORE_SPACE} and 2524 * at most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 2525 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 2526 */ applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options, int depth)2527 private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, 2528 Appendable rebuiltPat, int options, int depth) { 2529 if (depth > MAX_DEPTH) { 2530 syntaxError(chars, "Pattern nested too deeply"); 2531 } 2532 2533 // Syntax characters: [ ] ^ - & { } 2534 2535 // Recognized special forms for chars, sets: c-c s-s s&s 2536 2537 int opts = RuleCharacterIterator.PARSE_VARIABLES | 2538 RuleCharacterIterator.PARSE_ESCAPES; 2539 if ((options & IGNORE_SPACE) != 0) { 2540 opts |= RuleCharacterIterator.SKIP_WHITESPACE; 2541 } 2542 2543 StringBuilder patBuf = new StringBuilder(), buf = null; 2544 boolean usePat = false; 2545 UnicodeSet scratch = null; 2546 RuleCharacterIterator.Position backup = null; 2547 2548 // mode: 0=before [, 1=between [...], 2=after ] 2549 // lastItem: 0=none, 1=char, 2=set 2550 int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE; 2551 char op = 0; 2552 2553 boolean invert = false; 2554 2555 clear(); 2556 String lastString = null; 2557 2558 while (mode != MODE2_OUTBRACKET && !chars.atEnd()) { 2559 //Eclipse stated the following is "dead code" 2560 /* 2561 if (false) { 2562 // Debugging assertion 2563 if (!((lastItem == 0 && op == 0) || 2564 (lastItem == 1 && (op == 0 || op == '-')) || 2565 (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { 2566 throw new IllegalArgumentException(); 2567 } 2568 }*/ 2569 2570 int c = 0; 2571 boolean literal = false; 2572 UnicodeSet nested = null; 2573 2574 // -------- Check for property pattern 2575 2576 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 2577 int setMode = SETMODE0_NONE; 2578 if (resemblesPropertyPattern(chars, opts)) { 2579 setMode = 
SETMODE2_PROPERTYPAT; 2580 } 2581 2582 // -------- Parse '[' of opening delimiter OR nested set. 2583 // If there is a nested set, use `setMode' to define how 2584 // the set should be parsed. If the '[' is part of the 2585 // opening delimiter for this pattern, parse special 2586 // strings "[", "[^", "[-", and "[^-". Check for stand-in 2587 // characters representing a nested set in the symbol 2588 // table. 2589 2590 else { 2591 // Prepare to backup if necessary 2592 backup = chars.getPos(backup); 2593 c = chars.next(opts); 2594 literal = chars.isEscaped(); 2595 2596 if (c == '[' && !literal) { 2597 if (mode == MODE1_INBRACKET) { 2598 chars.setPos(backup); // backup 2599 setMode = SETMODE1_UNICODESET; 2600 } else { 2601 // Handle opening '[' delimiter 2602 mode = MODE1_INBRACKET; 2603 patBuf.append('['); 2604 backup = chars.getPos(backup); // prepare to backup 2605 c = chars.next(opts); 2606 literal = chars.isEscaped(); 2607 if (c == '^' && !literal) { 2608 invert = true; 2609 patBuf.append('^'); 2610 backup = chars.getPos(backup); // prepare to backup 2611 c = chars.next(opts); 2612 literal = chars.isEscaped(); 2613 } 2614 // Fall through to handle special leading '-'; 2615 // otherwise restart loop for nested [], \p{}, etc. 2616 if (c == '-') { 2617 literal = true; 2618 // Fall through to handle literal '-' below 2619 } else { 2620 chars.setPos(backup); // backup 2621 continue; 2622 } 2623 } 2624 } else if (symbols != null) { 2625 UnicodeMatcher m = symbols.lookupMatcher(c); // may be null 2626 if (m != null) { 2627 try { 2628 nested = (UnicodeSet) m; 2629 setMode = SETMODE3_PREPARSED; 2630 } catch (ClassCastException e) { 2631 syntaxError(chars, "Syntax error"); 2632 } 2633 } 2634 } 2635 } 2636 2637 // -------- Handle a nested set. This either is inline in 2638 // the pattern or represented by a stand-in that has 2639 // previously been parsed and was looked up in the symbol 2640 // table. 
2641 2642 if (setMode != SETMODE0_NONE) { 2643 if (lastItem == LAST1_RANGE) { 2644 if (op != 0) { 2645 syntaxError(chars, "Char expected after operator"); 2646 } 2647 add_unchecked(lastChar, lastChar); 2648 _appendToPat(patBuf, lastChar, false); 2649 lastItem = LAST0_START; 2650 op = 0; 2651 } 2652 2653 if (op == '-' || op == '&') { 2654 patBuf.append(op); 2655 } 2656 2657 if (nested == null) { 2658 if (scratch == null) scratch = new UnicodeSet(); 2659 nested = scratch; 2660 } 2661 switch (setMode) { 2662 case SETMODE1_UNICODESET: 2663 nested.applyPattern(chars, symbols, patBuf, options, depth + 1); 2664 break; 2665 case SETMODE2_PROPERTYPAT: 2666 chars.skipIgnored(opts); 2667 nested.applyPropertyPattern(chars, patBuf, symbols); 2668 break; 2669 case SETMODE3_PREPARSED: // `nested' already parsed 2670 nested._toPattern(patBuf, false); 2671 break; 2672 } 2673 2674 usePat = true; 2675 2676 if (mode == MODE0_NONE) { 2677 // Entire pattern is a category; leave parse loop 2678 set(nested); 2679 mode = MODE2_OUTBRACKET; 2680 break; 2681 } 2682 2683 switch (op) { 2684 case '-': 2685 removeAll(nested); 2686 break; 2687 case '&': 2688 retainAll(nested); 2689 break; 2690 case 0: 2691 addAll(nested); 2692 break; 2693 } 2694 2695 op = 0; 2696 lastItem = LAST2_SET; 2697 2698 continue; 2699 } 2700 2701 if (mode == MODE0_NONE) { 2702 syntaxError(chars, "Missing '['"); 2703 } 2704 2705 // -------- Parse special (syntax) characters. If the 2706 // current character is not special, or if it is escaped, 2707 // then fall through and handle it below. 
2708 2709 if (!literal) { 2710 switch (c) { 2711 case ']': 2712 if (lastItem == LAST1_RANGE) { 2713 add_unchecked(lastChar, lastChar); 2714 _appendToPat(patBuf, lastChar, false); 2715 } 2716 // Treat final trailing '-' as a literal 2717 if (op == '-') { 2718 add_unchecked(op, op); 2719 patBuf.append(op); 2720 } else if (op == '&') { 2721 syntaxError(chars, "Trailing '&'"); 2722 } 2723 patBuf.append(']'); 2724 mode = MODE2_OUTBRACKET; 2725 continue; 2726 case '-': 2727 if (op == 0) { 2728 if (lastItem != LAST0_START) { 2729 op = (char) c; 2730 continue; 2731 } else if (lastString != null) { 2732 op = (char) c; 2733 continue; 2734 } else { 2735 // Treat final trailing '-' as a literal 2736 add_unchecked(c, c); 2737 c = chars.next(opts); 2738 literal = chars.isEscaped(); 2739 if (c == ']' && !literal) { 2740 patBuf.append("-]"); 2741 mode = MODE2_OUTBRACKET; 2742 continue; 2743 } 2744 } 2745 } 2746 syntaxError(chars, "'-' not after char, string, or set"); 2747 break; 2748 case '&': 2749 if (lastItem == LAST2_SET && op == 0) { 2750 op = (char) c; 2751 continue; 2752 } 2753 syntaxError(chars, "'&' not after set"); 2754 break; 2755 case '^': 2756 syntaxError(chars, "'^' not after '['"); 2757 break; 2758 case '{': 2759 if (op != 0 && op != '-') { 2760 syntaxError(chars, "Missing operand after operator"); 2761 } 2762 if (lastItem == LAST1_RANGE) { 2763 add_unchecked(lastChar, lastChar); 2764 _appendToPat(patBuf, lastChar, false); 2765 } 2766 lastItem = LAST0_START; 2767 if (buf == null) { 2768 buf = new StringBuilder(); 2769 } else { 2770 buf.setLength(0); 2771 } 2772 boolean ok = false; 2773 while (!chars.atEnd()) { 2774 c = chars.next(opts); 2775 literal = chars.isEscaped(); 2776 if (c == '}' && !literal) { 2777 ok = true; 2778 break; 2779 } 2780 appendCodePoint(buf, c); 2781 } 2782 if (!ok) { 2783 syntaxError(chars, "Invalid multicharacter string"); 2784 } 2785 // We have new string. 
Add it to set and continue; 2786 // we don't need to drop through to the further 2787 // processing 2788 String curString = buf.toString(); 2789 if (op == '-') { 2790 int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString); 2791 int curSingle = CharSequences.getSingleCodePoint(curString); 2792 if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) { 2793 add(lastSingle,curSingle); 2794 } else { 2795 if (strings == EMPTY_STRINGS) { 2796 strings = new TreeSet<>(); 2797 } 2798 try { 2799 StringRange.expand(lastString, curString, true, strings); 2800 } catch (Exception e) { 2801 syntaxError(chars, e.getMessage()); 2802 } 2803 } 2804 lastString = null; 2805 op = 0; 2806 } else { 2807 add(curString); 2808 lastString = curString; 2809 } 2810 patBuf.append('{'); 2811 _appendToPat(patBuf, curString, false); 2812 patBuf.append('}'); 2813 continue; 2814 case SymbolTable.SYMBOL_REF: 2815 // symbols nosymbols 2816 // [a-$] error error (ambiguous) 2817 // [a$] anchor anchor 2818 // [a-$x] var "x"* literal '$' 2819 // [a-$.] error literal '$' 2820 // *We won't get here in the case of var "x" 2821 backup = chars.getPos(backup); 2822 c = chars.next(opts); 2823 literal = chars.isEscaped(); 2824 boolean anchor = (c == ']' && !literal); 2825 if (symbols == null && !anchor) { 2826 c = SymbolTable.SYMBOL_REF; 2827 chars.setPos(backup); 2828 break; // literal '$' 2829 } 2830 if (anchor && op == 0) { 2831 if (lastItem == LAST1_RANGE) { 2832 add_unchecked(lastChar, lastChar); 2833 _appendToPat(patBuf, lastChar, false); 2834 } 2835 add_unchecked(UnicodeMatcher.ETHER); 2836 usePat = true; 2837 patBuf.append(SymbolTable.SYMBOL_REF).append(']'); 2838 mode = MODE2_OUTBRACKET; 2839 continue; 2840 } 2841 syntaxError(chars, "Unquoted '$'"); 2842 break; 2843 default: 2844 break; 2845 } 2846 } 2847 2848 // -------- Parse literal characters. This includes both 2849 // escaped chars ("\u4E01") and non-syntax characters 2850 // ("a"). 
2851 2852 switch (lastItem) { 2853 case LAST0_START: 2854 if (op == '-' && lastString != null) { 2855 syntaxError(chars, "Invalid range"); 2856 } 2857 lastItem = LAST1_RANGE; 2858 lastChar = c; 2859 lastString = null; 2860 break; 2861 case LAST1_RANGE: 2862 if (op == '-') { 2863 if (lastString != null) { 2864 syntaxError(chars, "Invalid range"); 2865 } 2866 if (lastChar >= c) { 2867 // Don't allow redundant (a-a) or empty (b-a) ranges; 2868 // these are most likely typos. 2869 syntaxError(chars, "Invalid range"); 2870 } 2871 add_unchecked(lastChar, c); 2872 _appendToPat(patBuf, lastChar, false); 2873 patBuf.append(op); 2874 _appendToPat(patBuf, c, false); 2875 lastItem = LAST0_START; 2876 op = 0; 2877 } else { 2878 add_unchecked(lastChar, lastChar); 2879 _appendToPat(patBuf, lastChar, false); 2880 lastChar = c; 2881 } 2882 break; 2883 case LAST2_SET: 2884 if (op != 0) { 2885 syntaxError(chars, "Set expected after operator"); 2886 } 2887 lastChar = c; 2888 lastItem = LAST1_RANGE; 2889 break; 2890 } 2891 } 2892 2893 if (mode != MODE2_OUTBRACKET) { 2894 syntaxError(chars, "Missing ']'"); 2895 } 2896 2897 chars.skipIgnored(opts); 2898 2899 /** 2900 * Handle global flags (invert, case insensitivity). If this 2901 * pattern should be compiled case-insensitive, then we need 2902 * to close over case BEFORE COMPLEMENTING. This makes 2903 * patterns like /[^abc]/i work. 2904 */ 2905 if ((options & CASE_MASK) != 0) { 2906 closeOver(options); 2907 } 2908 if (invert) { 2909 complement().removeAllStrings(); // code point complement 2910 } 2911 2912 // Use the rebuilt pattern (pat) only if necessary. Prefer the 2913 // generated pattern. 
2914 if (usePat) { 2915 append(rebuiltPat, patBuf.toString()); 2916 } else { 2917 appendNewPattern(rebuiltPat, false, true); 2918 } 2919 } 2920 syntaxError(RuleCharacterIterator chars, String msg)2921 private static void syntaxError(RuleCharacterIterator chars, String msg) { 2922 throw new IllegalArgumentException("Error: " + msg + " at \"" + 2923 Utility.escape(chars.toString()) + 2924 '"'); 2925 } 2926 2927 /** 2928 * Add the contents of the UnicodeSet (as strings) into a collection. 2929 * @param target collection to add into 2930 */ addAllTo(T target)2931 public <T extends Collection<String>> T addAllTo(T target) { 2932 return addAllTo(this, target); 2933 } 2934 2935 2936 /** 2937 * Add the contents of the UnicodeSet (as strings) into a collection. 2938 * @param target collection to add into 2939 * @hide unsupported on Android 2940 */ addAllTo(String[] target)2941 public String[] addAllTo(String[] target) { 2942 return addAllTo(this, target); 2943 } 2944 2945 /** 2946 * Add the contents of the UnicodeSet (as strings) into an array. 2947 * @hide unsupported on Android 2948 */ toArray(UnicodeSet set)2949 public static String[] toArray(UnicodeSet set) { 2950 return addAllTo(set, new String[set.size()]); 2951 } 2952 2953 /** 2954 * Add the contents of the collection (as strings) into this UnicodeSet. 2955 * The collection must not contain null. 2956 * @param source the collection to add 2957 * @return a reference to this object 2958 */ add(Iterable<?> source)2959 public UnicodeSet add(Iterable<?> source) { 2960 return addAll(source); 2961 } 2962 2963 /** 2964 * Add a collection (as strings) into this UnicodeSet. 2965 * Uses standard naming convention. 
2966 * @param source collection to add into 2967 * @return a reference to this object 2968 */ addAll(Iterable<?> source)2969 public UnicodeSet addAll(Iterable<?> source) { 2970 checkFrozen(); 2971 for (Object o : source) { 2972 add(o.toString()); 2973 } 2974 return this; 2975 } 2976 2977 //---------------------------------------------------------------- 2978 // Implementation: Utility methods 2979 //---------------------------------------------------------------- 2980 nextCapacity(int minCapacity)2981 private int nextCapacity(int minCapacity) { 2982 // Grow exponentially to reduce the frequency of allocations. 2983 if (minCapacity < INITIAL_CAPACITY) { 2984 return minCapacity + INITIAL_CAPACITY; 2985 } else if (minCapacity <= 2500) { 2986 return 5 * minCapacity; 2987 } else { 2988 int newCapacity = 2 * minCapacity; 2989 if (newCapacity > MAX_LENGTH) { 2990 newCapacity = MAX_LENGTH; 2991 } 2992 return newCapacity; 2993 } 2994 } 2995 ensureCapacity(int newLen)2996 private void ensureCapacity(int newLen) { 2997 if (newLen > MAX_LENGTH) { 2998 newLen = MAX_LENGTH; 2999 } 3000 if (newLen <= list.length) return; 3001 int newCapacity = nextCapacity(newLen); 3002 int[] temp = new int[newCapacity]; 3003 // Copy only the actual contents. 3004 System.arraycopy(list, 0, temp, 0, len); 3005 list = temp; 3006 } 3007 ensureBufferCapacity(int newLen)3008 private void ensureBufferCapacity(int newLen) { 3009 if (newLen > MAX_LENGTH) { 3010 newLen = MAX_LENGTH; 3011 } 3012 if (buffer != null && newLen <= buffer.length) return; 3013 int newCapacity = nextCapacity(newLen); 3014 buffer = new int[newCapacity]; 3015 // The buffer has no contents to be copied. 3016 // It is always filled from scratch after this call. 3017 } 3018 3019 /** 3020 * Assumes start <= end. 
/**
 * Returns the shared scratch array {@code rangeList} filled in as the
 * inversion list {@code { start, end+1, HIGH }}.
 * Assumes start <= end.
 * The same array instance is reused and overwritten on every call, so the
 * result is only valid until the next call.
 */
private int[] range(int start, int end) {
    if (rangeList == null) {
        rangeList = new int[] { start, end+1, HIGH };
    } else {
        // Slot 2 already holds HIGH from the first allocation.
        rangeList[0] = start;
        rangeList[1] = end+1;
    }
    return rangeList;
}

//----------------------------------------------------------------
// Implementation: Fundamental operations
//----------------------------------------------------------------

// polarity = 0, 3 is normal: x xor y
// polarity = 1, 2: x xor ~y == x === y

/**
 * Replaces this set's list with the symmetric difference of {@code list}
 * and {@code other}: both arrays are merged in ascending order into
 * {@code buffer}, and a value present in both is dropped.
 * The loop only stops when both cursors read HIGH, so it relies on both
 * arrays being terminated by HIGH.
 *
 * @param other    the second boundary array
 * @param otherLen used only to size the output buffer (len + otherLen)
 * @param polarity see the comment above; 1 or 2 treats {@code other} as complemented
 * @return this set, with {@code list}/{@code buffer} swapped, {@code len}
 *         updated and {@code pat} reset to null
 */
private UnicodeSet xor(int[] other, int otherLen, int polarity) {
    ensureBufferCapacity(len + otherLen);
    int i = 0, j = 0, k = 0;
    int a = list[i++];
    int b;
    // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used
    //       so the following if statement will not be called.
    ///CLOVER:OFF
    if (polarity == 1 || polarity == 2) {
        // Complementing other is done by toggling a leading LOW boundary.
        b = LOW;
        if (other[j] == LOW) { // skip base if already LOW
            ++j;
            b = other[j];
        }
        ///CLOVER:ON
    } else {
        b = other[j++];
    }
    // simplest of all the routines
    // sort the values, discarding identicals!
    while (true) {
        if (a < b) {
            buffer[k++] = a;
            a = list[i++];
        } else if (b < a) {
            buffer[k++] = b;
            b = other[j++];
        } else if (a != HIGH) { // at this point, a == b
            // discard both values!
            a = list[i++];
            b = other[j++];
        } else { // DONE!
            buffer[k++] = HIGH;
            len = k;
            break;
        }
    }
    // swap list and buffer
    int[] temp = list;
    list = buffer;
    buffer = temp;
    pat = null;
    return this;
}

// polarity = 0 is normal: x union y
// polarity = 2: x union ~y
// polarity = 1: ~x union y
// polarity = 3: ~x union ~y

/**
 * Replaces this set's list with the union of {@code list} and
 * {@code other} (each optionally complemented, per {@code polarity}).
 * The result is built in {@code buffer}, terminated with HIGH, and then
 * swapped with {@code list}.
 * {@code polarity} doubles as the merge state: bit 1 is toggled every
 * time a value is consumed from {@code list}, bit 2 every time one is
 * consumed from {@code other}.
 * The loop exits only on reading HIGH, so it relies on both arrays being
 * terminated by HIGH.
 *
 * @param other    the second boundary array
 * @param otherLen used only to size the output buffer (len + otherLen)
 * @param polarity initial state, see the table above
 * @return this set, with {@code len} updated and {@code pat} reset to null
 */
private UnicodeSet add(int[] other, int otherLen, int polarity) {
    ensureBufferCapacity(len + otherLen);
    int i = 0, j = 0, k = 0;
    int a = list[i++];
    int b = other[j++];
    // change from xor is that we have to check overlapping pairs
    // polarity bit 1 means a is second, bit 2 means b is.
    main:
    while (true) {
        switch (polarity) {
        case 0: // both first; take lower if unequal
            if (a < b) { // take a
                // Back up over overlapping ranges in buffer[]
                if (k > 0 && a <= buffer[k-1]) {
                    // Pick latter end value in buffer[] vs. list[]
                    a = max(list[i], buffer[--k]);
                } else {
                    // No overlap
                    buffer[k++] = a;
                    a = list[i];
                }
                i++; // Common if/else code factored out
                polarity ^= 1;
            } else if (b < a) { // take b
                if (k > 0 && b <= buffer[k-1]) {
                    b = max(other[j], buffer[--k]);
                } else {
                    buffer[k++] = b;
                    b = other[j];
                }
                j++;
                polarity ^= 2;
            } else { // a == b, take a, drop b
                if (a == HIGH) break main;
                // This is symmetrical; it doesn't matter if
                // we backtrack with a or b. - liu
                if (k > 0 && a <= buffer[k-1]) {
                    a = max(list[i], buffer[--k]);
                } else {
                    // No overlap
                    buffer[k++] = a;
                    a = list[i];
                }
                i++;
                polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 3: // both second; take higher if unequal, and drop other
            if (b <= a) { // take a
                if (a == HIGH) break main;
                buffer[k++] = a;
            } else { // take b
                if (b == HIGH) break main;
                buffer[k++] = b;
            }
            a = list[i++]; polarity ^= 1;   // factored common code
            b = other[j++]; polarity ^= 2;
            break;
        case 1: // a second, b first; if b < a, overlap
            if (a < b) { // no overlap, take a
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
            } else if (b < a) { // OVERLAP, drop b
                b = other[j++]; polarity ^= 2;
            } else { // a == b, drop both!
                if (a == HIGH) break main;
                a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 2: // a first, b second; if a < b, overlap
            if (b < a) { // no overlap, take b
                buffer[k++] = b; b = other[j++]; polarity ^= 2;
            } else  if (a < b) { // OVERLAP, drop a
                a = list[i++]; polarity ^= 1;
            } else { // a == b, drop both!
                if (a == HIGH) break main;
                a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        }
    }
    buffer[k++] = HIGH;    // terminate
    len = k;
    // swap list and buffer
    int[] temp = list;
    list = buffer;
    buffer = temp;
    pat = null;
    return this;
}

// polarity = 0 is normal: x intersect y
// polarity = 2: x intersect ~y == set-minus
// polarity = 1: ~x intersect y
// polarity = 3: ~x intersect ~y

/**
 * Replaces this set's list with the intersection of {@code list} and
 * {@code other} (each optionally complemented, per {@code polarity}).
 * The result is built in {@code buffer}, terminated with HIGH, and then
 * swapped with {@code list}.
 * As in {@code add}, {@code polarity} is also the merge state: bit 1 is
 * toggled on every read from {@code list}, bit 2 on every read from
 * {@code other}. The loop exits only on reading HIGH from both inputs at
 * once, so it relies on both arrays being terminated by HIGH.
 *
 * @param other    the second boundary array
 * @param otherLen used only to size the output buffer (len + otherLen)
 * @param polarity initial state, see the table above
 * @return this set, with {@code len} updated and {@code pat} reset to null
 */
private UnicodeSet retain(int[] other, int otherLen, int polarity) {
    ensureBufferCapacity(len + otherLen);
    int i = 0, j = 0, k = 0;
    int a = list[i++];
    int b = other[j++];
    // change from xor is that we have to check overlapping pairs
    // polarity bit 1 means a is second, bit 2 means b is.
    main:
    while (true) {
        switch (polarity) {
        case 0: // both first; drop the smaller
            if (a < b) { // drop a
                a = list[i++]; polarity ^= 1;
            } else if (b < a) { // drop b
                b = other[j++]; polarity ^= 2;
            } else { // a == b, take one, drop other
                if (a == HIGH) break main;
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 3: // both second; take lower if unequal
            if (a < b) { // take a
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
            } else if (b < a) { // take b
                buffer[k++] = b; b = other[j++]; polarity ^= 2;
            } else { // a == b, take one, drop other
                if (a == HIGH) break main;
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 1: // a second, b first;
            if (a < b) { // NO OVERLAP, drop a
                a = list[i++]; polarity ^= 1;
            } else if (b < a) { // OVERLAP, take b
                buffer[k++] = b; b = other[j++]; polarity ^= 2;
            } else { // a == b, drop both!
                if (a == HIGH) break main;
                a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 2: // a first, b second; if a < b, overlap
            if (b < a) { // no overlap, drop b
                b = other[j++]; polarity ^= 2;
            } else  if (a < b) { // OVERLAP, take a
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
            } else { // a == b, drop both!
                if (a == HIGH) break main;
                a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        }
    }
    buffer[k++] = HIGH;    // terminate
    len = k;
    // swap list and buffer
    int[] temp = list;
    list = buffer;
    buffer = temp;
    pat = null;
    return this;
}

/** Returns the larger of the two ints (same result as Math.max). */
private static final int max(int a, int b) {
    return (a > b) ? a : b;
}

//----------------------------------------------------------------
// Generic filter-based scanning code
//----------------------------------------------------------------

/** Predicate over code points, used by the filter-based property scan. */
private static interface Filter {
    /** Returns true if the given code point passes this filter. */
    boolean contains(int codePoint);
}

/** Accepts code points whose Unicode numeric value equals {@code value}. */
private static final class NumericValueFilter implements Filter {
    double value;
    NumericValueFilter(double value) { this.value = value; }
    @Override
    public boolean contains(int ch) {
        // Exact double comparison against UCharacter.getUnicodeNumericValue().
        return UCharacter.getUnicodeNumericValue(ch) == value;
    }
}

/**
 * Accepts code points whose general category bit,
 * {@code 1 << UCharacter.getType(ch)}, is set in {@code mask}.
 */
private static final class GeneralCategoryMaskFilter implements Filter {
    int mask;
    GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
    @Override
    public boolean contains(int ch) {
        return ((1 << UCharacter.getType(ch)) & mask) != 0;
    }
}
prop; 3288 this.value = value; 3289 } 3290 @Override contains(int ch)3291 public boolean contains(int ch) { 3292 return UCharacter.getIntPropertyValue(ch, prop) == value; 3293 } 3294 } 3295 3296 private static final class ScriptExtensionsFilter implements Filter { 3297 int script; ScriptExtensionsFilter(int script)3298 ScriptExtensionsFilter(int script) { this.script = script; } 3299 @Override contains(int c)3300 public boolean contains(int c) { 3301 return UScript.hasScript(c, script); 3302 } 3303 } 3304 3305 private static final class IdentifierTypeFilter implements Filter { 3306 int idType; IdentifierTypeFilter(int idType)3307 IdentifierTypeFilter(int idType) { this.idType = idType; } 3308 @Override contains(int c)3309 public boolean contains(int c) { 3310 return UCharacterProperty.INSTANCE.hasIDType(c, idType); 3311 } 3312 } 3313 3314 // VersionInfo for unassigned characters 3315 private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); 3316 3317 private static final class VersionFilter implements Filter { 3318 VersionInfo version; VersionFilter(VersionInfo version)3319 VersionFilter(VersionInfo version) { this.version = version; } 3320 @Override contains(int ch)3321 public boolean contains(int ch) { 3322 VersionInfo v = UCharacter.getAge(ch); 3323 // Reference comparison ok; VersionInfo caches and reuses 3324 // unique objects. 3325 return !Utility.sameObjects(v, NO_VERSION) && 3326 v.compareTo(version) <= 0; 3327 } 3328 } 3329 3330 /** 3331 * Generic filter-based scanning code for UCD property UnicodeSets. 3332 */ applyFilter(Filter filter, UnicodeSet inclusions)3333 private void applyFilter(Filter filter, UnicodeSet inclusions) { 3334 // Logically, walk through all Unicode characters, noting the start 3335 // and end of each range for which filter.contain(c) is 3336 // true. Add each range to a set. 
3337 // 3338 // To improve performance, use an inclusions set which 3339 // encodes information about character ranges that are known 3340 // to have identical properties. 3341 // inclusions contains the first characters of 3342 // same-value ranges for the given property. 3343 3344 clear(); 3345 3346 int startHasProperty = -1; 3347 int limitRange = inclusions.getRangeCount(); 3348 3349 for (int j=0; j<limitRange; ++j) { 3350 // get current range 3351 int start = inclusions.getRangeStart(j); 3352 int end = inclusions.getRangeEnd(j); 3353 3354 // for all the code points in the range, process 3355 for (int ch = start; ch <= end; ++ch) { 3356 // only add to the unicodeset on inflection points -- 3357 // where the hasProperty value changes to false 3358 if (filter.contains(ch)) { 3359 if (startHasProperty < 0) { 3360 startHasProperty = ch; 3361 } 3362 } else if (startHasProperty >= 0) { 3363 add_unchecked(startHasProperty, ch-1); 3364 startHasProperty = -1; 3365 } 3366 } 3367 } 3368 if (startHasProperty >= 0) { 3369 add_unchecked(startHasProperty, 0x10FFFF); 3370 } 3371 } 3372 3373 /** 3374 * Remove leading and trailing Pattern_White_Space and compress 3375 * internal Pattern_White_Space to a single space character. 3376 */ mungeCharName(String source)3377 private static String mungeCharName(String source) { 3378 source = PatternProps.trimWhiteSpace(source); 3379 StringBuilder buf = null; 3380 for (int i=0; i<source.length(); ++i) { 3381 char ch = source.charAt(i); 3382 if (PatternProps.isWhiteSpace(ch)) { 3383 if (buf == null) { 3384 buf = new StringBuilder().append(source, 0, i); 3385 } else if (buf.charAt(buf.length() - 1) == ' ') { 3386 continue; 3387 } 3388 ch = ' '; // convert to ' ' 3389 } 3390 if (buf != null) { 3391 buf.append(ch); 3392 } 3393 } 3394 return buf == null ? 
source : buf.toString(); 3395 } 3396 3397 //---------------------------------------------------------------- 3398 // Property set API 3399 //---------------------------------------------------------------- 3400 3401 /** 3402 * Modifies this set to contain those code points which have the 3403 * given value for the given binary or enumerated property, as 3404 * returned by UCharacter.getIntPropertyValue. Prior contents of 3405 * this set are lost. 3406 * 3407 * @param prop a property in the range 3408 * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or 3409 * UProperty.INT_START..UProperty.INT_LIMIT-1 or. 3410 * UProperty.MASK_START..UProperty.MASK_LIMIT-1. 3411 * 3412 * @param value a value in the range 3413 * UCharacter.getIntPropertyMinValue(prop).. 3414 * UCharacter.getIntPropertyMaxValue(prop), with one exception. 3415 * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be 3416 * a UCharacter.getType() result, but rather a mask value produced 3417 * by logically ORing (1 << UCharacter.getType()) values together. 3418 * This allows grouped categories such as [:L:] to be represented. 3419 * 3420 * @return a reference to this set 3421 */ applyIntPropertyValue(int prop, int value)3422 public UnicodeSet applyIntPropertyValue(int prop, int value) { 3423 // All of the following include checkFrozen() before modifying this set. 
3424 if (prop == UProperty.GENERAL_CATEGORY_MASK) { 3425 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3426 applyFilter(new GeneralCategoryMaskFilter(value), inclusions); 3427 } else if (prop == UProperty.SCRIPT_EXTENSIONS) { 3428 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3429 applyFilter(new ScriptExtensionsFilter(value), inclusions); 3430 } else if (prop == UProperty.IDENTIFIER_TYPE) { 3431 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3432 applyFilter(new IdentifierTypeFilter(value), inclusions); 3433 } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) { 3434 if (value == 0 || value == 1) { 3435 set(CharacterProperties.getBinaryPropertySet(prop)); 3436 if (value == 0) { 3437 complement().removeAllStrings(); // code point complement 3438 } 3439 } else { 3440 clear(); 3441 } 3442 } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) { 3443 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3444 applyFilter(new IntPropertyFilter(prop, value), inclusions); 3445 } else { 3446 throw new IllegalArgumentException("unsupported property " + prop); 3447 } 3448 return this; 3449 } 3450 3451 3452 3453 /** 3454 * Modifies this set to contain those code points which have the 3455 * given value for the given property. Prior contents of this 3456 * set are lost. 3457 * 3458 * @param propertyAlias a property alias, either short or long. 3459 * The name is matched loosely. See PropertyAliases.txt for names 3460 * and a description of loose matching. If the value string is 3461 * empty, then this string is interpreted as either a 3462 * General_Category value alias, a Script value alias, a binary 3463 * property alias, or a special ID. Special IDs are matched 3464 * loosely and correspond to the following sets: 3465 * 3466 * "ANY" = [\\u0000-\\U0010FFFF], 3467 * "ASCII" = [\\u0000-\\u007F]. 
3468 * 3469 * @param valueAlias a value alias, either short or long. The 3470 * name is matched loosely. See PropertyValueAliases.txt for 3471 * names and a description of loose matching. In addition to 3472 * aliases listed, numeric values and canonical combining classes 3473 * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc", 3474 * "220"). The value string may also be empty. 3475 * 3476 * @return a reference to this set 3477 */ applyPropertyAlias(String propertyAlias, String valueAlias)3478 public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) { 3479 return applyPropertyAlias(propertyAlias, valueAlias, null); 3480 } 3481 3482 /** 3483 * Modifies this set to contain those code points which have the 3484 * given value for the given property. Prior contents of this 3485 * set are lost. 3486 * @param propertyAlias A string of the property alias. 3487 * @param valueAlias A string of the value alias. 3488 * @param symbols if not null, then symbols are first called to see if a property 3489 * is available. If true, then everything else is skipped. 
/**
 * Modifies this set to contain those code points which have the
 * given value for the given property.  Prior contents of this
 * set are lost.
 *
 * <p>Resolution order: an {@code XSymbolTable} passed as {@code symbols}
 * is asked first, then the static {@code XSYMBOL_TABLE} if one is set;
 * if either reports that it handled the alias, this set is returned as
 * is. Otherwise the aliases are resolved to a property/value pair and
 * applied via {@code applyIntPropertyValue}, except for the numeric
 * value, name and age properties, which are handled directly here.
 *
 * @param propertyAlias A string of the property alias.
 * @param valueAlias A string of the value alias. When empty,
 * {@code propertyAlias} is tried in turn as a General_Category value,
 * a Script value, a binary property name, and the special IDs
 * compared against ANY_ID, ASCII_ID and ASSIGNED.
 * @param symbols if not null and an instance of XSymbolTable, it is
 * called first to see if it can apply the property. If it returns true,
 * then everything else is skipped.
 * @return this set
 * @throws IllegalArgumentException if the property or value cannot be
 * resolved, the property is unsupported, or a required value is missing
 */
public UnicodeSet applyPropertyAlias(String propertyAlias,
        String valueAlias, SymbolTable symbols) {
    checkFrozen();
    int p;
    int v;
    boolean invert = false;

    // Give a caller-supplied extended symbol table the first chance.
    if (symbols != null
            && (symbols instanceof XSymbolTable)
            && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
        return this;
    }

    // Then the static XSYMBOL_TABLE, if one is set.
    if (XSYMBOL_TABLE != null) {
        if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) {
            return this;
        }
    }

    if (valueAlias.length() > 0) {
        p = UCharacter.getPropertyEnum(propertyAlias);

        // Treat gc as gcm
        if (p == UProperty.GENERAL_CATEGORY) {
            p = UProperty.GENERAL_CATEGORY_MASK;
        }

        if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) ||
                (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) ||
                (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
            try {
                v = UCharacter.getPropertyValueEnum(p, valueAlias);
            } catch (IllegalArgumentException e) {
                // Handle numeric CCC
                if (p == UProperty.CANONICAL_COMBINING_CLASS ||
                        p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
                        p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
                    // A non-numeric value makes parseInt throw
                    // NumberFormatException (an IllegalArgumentException)
                    // instead of rethrowing e.
                    v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias));
                    // Anything between 0 and 255 is valid even if unused.
                    if (v < 0 || v > 255) throw e;
                } else {
                    throw e;
                }
            }
        }

        else {
            // Properties that are not binary, enumerated or mask valued.
            switch (p) {
            case UProperty.NUMERIC_VALUE:
            {
                double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
                applyFilter(new NumericValueFilter(value),
                        CharacterPropertiesImpl.getInclusionsForProperty(p));
                return this;
            }
            case UProperty.NAME:
            {
                // Must munge name, since
                // UCharacter.getCharFromExtendedName() does not do
                // 'loose' matching.
                String buf = mungeCharName(valueAlias);
                int ch = UCharacter.getCharFromExtendedName(buf);
                if (ch == -1) {
                    throw new IllegalArgumentException("Invalid character name");
                }
                // The result is the single named code point.
                clear();
                add_unchecked(ch);
                return this;
            }
            case UProperty.UNICODE_1_NAME:
                // ICU 49 deprecates the Unicode_1_Name property APIs.
                throw new IllegalArgumentException("Unicode_1_Name (na1) not supported");
            case UProperty.AGE:
            {
                // Must munge name, since
                // VersionInfo.getInstance() does not do
                // 'loose' matching.
                VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
                applyFilter(new VersionFilter(version),
                        CharacterPropertiesImpl.getInclusionsForProperty(p));
                return this;
            }
            case UProperty.SCRIPT_EXTENSIONS:
                // Script_Extensions values are looked up as Script values.
                v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias);
                // fall through to calling applyIntPropertyValue()
                break;
            case UProperty.IDENTIFIER_TYPE:
                v = UCharacter.getPropertyValueEnum(p, valueAlias);
                // fall through to calling applyIntPropertyValue()
                break;
            default:
                // p is a non-binary, non-enumerated property that we
                // don't support (yet).
                throw new IllegalArgumentException("Unsupported property");
            }
        }
    }

    else {
        // valueAlias is empty.  Interpret as General Category, Script,
        // Binary property, or ANY or ASCII.  Upon success, p and v will
        // be set.
        UPropertyAliases pnames = UPropertyAliases.INSTANCE;
        p = UProperty.GENERAL_CATEGORY_MASK;
        v = pnames.getPropertyValueEnum(p, propertyAlias);
        if (v == UProperty.UNDEFINED) {
            p = UProperty.SCRIPT;
            v = pnames.getPropertyValueEnum(p, propertyAlias);
            if (v == UProperty.UNDEFINED) {
                p = pnames.getPropertyEnum(propertyAlias);
                // Normalize "no such property" to -1 for the tests below.
                if (p == UProperty.UNDEFINED) {
                    p = -1;
                }
                if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) {
                    // A bare binary property name means "property is true".
                    v = 1;
                } else if (p == -1) {
                    if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) {
                        set(MIN_VALUE, MAX_VALUE);
                        return this;
                    } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) {
                        set(0, 0x7F);
                        return this;
                    } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) {
                        // [:Assigned:]=[:^Cn:]
                        p = UProperty.GENERAL_CATEGORY_MASK;
                        v = (1<<UCharacter.UNASSIGNED);
                        invert = true;
                    } else {
                        // Property name was never matched.
                        throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias);
                    }
                } else {
                    // Valid property name, but it isn't binary, so the value
                    // must be supplied.
                    throw new IllegalArgumentException("Missing property value");
                }
            }
        }
    }

    applyIntPropertyValue(p, v);
    // invert is only set by the ASSIGNED special ID above.
    if(invert) {
        complement().removeAllStrings();  // code point complement
    }

    return this;
}
3647 */ resemblesPropertyPattern(String pattern, int pos)3648 private static boolean resemblesPropertyPattern(String pattern, int pos) { 3649 // Patterns are at least 5 characters long 3650 if ((pos+5) > pattern.length()) { 3651 return false; 3652 } 3653 3654 // Look for an opening [:, [:^, \p, or \P 3655 return pattern.regionMatches(pos, "[:", 0, 2) || 3656 pattern.regionMatches(true, pos, "\\p", 0, 2) || 3657 pattern.regionMatches(pos, "\\N", 0, 2); 3658 } 3659 3660 /** 3661 * Return true if the given iterator appears to point at a 3662 * property pattern. Regardless of the result, return with the 3663 * iterator unchanged. 3664 * @param chars iterator over the pattern characters. Upon return 3665 * it will be unchanged. 3666 * @param iterOpts RuleCharacterIterator options 3667 */ resemblesPropertyPattern(RuleCharacterIterator chars, int iterOpts)3668 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, 3669 int iterOpts) { 3670 boolean result = false; 3671 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; 3672 RuleCharacterIterator.Position pos = chars.getPos(null); 3673 int c = chars.next(iterOpts); 3674 if (c == '[' || c == '\\') { 3675 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); 3676 result = (c == '[') ? (d == ':') : 3677 (d == 'N' || d == 'p' || d == 'P'); 3678 } 3679 chars.setPos(pos); 3680 return result; 3681 } 3682 3683 /** 3684 * Parse the given property pattern at the given parse position. 3685 * @param symbols TODO 3686 */ applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols)3687 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { 3688 int pos = ppos.getIndex(); 3689 3690 // On entry, ppos should point to one of the following locations: 3691 3692 // Minimum length is 5 characters, e.g. 
\p{L} 3693 if ((pos+5) > pattern.length()) { 3694 return null; 3695 } 3696 3697 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 3698 boolean isName = false; // true for \N{pat}, o/w false 3699 boolean invert = false; 3700 3701 // Look for an opening [:, [:^, \p, or \P 3702 if (pattern.regionMatches(pos, "[:", 0, 2)) { 3703 posix = true; 3704 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3705 if (pos < pattern.length() && pattern.charAt(pos) == '^') { 3706 ++pos; 3707 invert = true; 3708 } 3709 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || 3710 pattern.regionMatches(pos, "\\N", 0, 2)) { 3711 char c = pattern.charAt(pos+1); 3712 invert = (c == 'P'); 3713 isName = (c == 'N'); 3714 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3715 if (pos == pattern.length() || pattern.charAt(pos++) != '{') { 3716 // Syntax error; "\p" or "\P" not followed by "{" 3717 return null; 3718 } 3719 } else { 3720 // Open delimiter not seen 3721 return null; 3722 } 3723 3724 // Look for the matching close delimiter, either :] or } 3725 int close = pattern.indexOf(posix ? ":]" : "}", pos); 3726 if (close < 0) { 3727 // Syntax error; close delimiter missing 3728 return null; 3729 } 3730 3731 // Look for an '=' sign. If this is present, we will parse a 3732 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 3733 // pattern. 
3734 int equals = pattern.indexOf('=', pos); 3735 String propName, valueName; 3736 if (equals >= 0 && equals < close && !isName) { 3737 // Equals seen; parse medium/long pattern 3738 propName = pattern.substring(pos, equals); 3739 valueName = pattern.substring(equals+1, close); 3740 } 3741 3742 else { 3743 // Handle case where no '=' is seen, and \N{} 3744 propName = pattern.substring(pos, close); 3745 valueName = ""; 3746 3747 // Handle \N{name} 3748 if (isName) { 3749 // This is a little inefficient since it means we have to 3750 // parse "na" back to UProperty.NAME even though we already 3751 // know it's UProperty.NAME. If we refactor the API to 3752 // support args of (int, String) then we can remove 3753 // "na" and make this a little more efficient. 3754 valueName = propName; 3755 propName = "na"; 3756 } 3757 } 3758 3759 applyPropertyAlias(propName, valueName, symbols); 3760 3761 if (invert) { 3762 complement().removeAllStrings(); // code point complement 3763 } 3764 3765 // Move to the limit position after the close delimiter 3766 ppos.setIndex(close + (posix ? 2 : 1)); 3767 3768 return this; 3769 } 3770 3771 /** 3772 * Parse a property pattern. 3773 * @param chars iterator over the pattern characters. Upon return 3774 * it will be advanced to the first character after the parsed 3775 * pattern, or the end of the iteration if all characters are 3776 * parsed. 3777 * @param rebuiltPat the pattern that was parsed, rebuilt or 3778 * copied from the input pattern, as appropriate. 
3779 * @param symbols TODO 3780 */ applyPropertyPattern(RuleCharacterIterator chars, Appendable rebuiltPat, SymbolTable symbols)3781 private void applyPropertyPattern(RuleCharacterIterator chars, 3782 Appendable rebuiltPat, SymbolTable symbols) { 3783 String patStr = chars.getCurrentBuffer(); 3784 int start = chars.getCurrentBufferPos(); 3785 ParsePosition pos = new ParsePosition(start); 3786 applyPropertyPattern(patStr, pos, symbols); 3787 int length = pos.getIndex() - start; 3788 if (length == 0) { 3789 syntaxError(chars, "Invalid property pattern"); 3790 } 3791 chars.jumpahead(length); 3792 append(rebuiltPat, patStr.substring(start, pos.getIndex())); 3793 } 3794 3795 //---------------------------------------------------------------- 3796 // Case folding API 3797 //---------------------------------------------------------------- 3798 3799 /** 3800 * Bitmask for constructor and applyPattern() indicating that 3801 * white space should be ignored. If set, ignore Unicode Pattern_White_Space characters, 3802 * unless they are quoted or escaped. This may be ORed together 3803 * with other selectors. 3804 */ 3805 public static final int IGNORE_SPACE = 1; 3806 3807 /** 3808 * Alias for {@link #CASE_INSENSITIVE}. 3809 * 3810 * @deprecated ICU 73 Use {@link #CASE_INSENSITIVE} instead. 3811 */ 3812 @Deprecated 3813 public static final int CASE = 2; 3814 3815 /** 3816 * Enable case insensitive matching. E.g., "[ab]" with this flag 3817 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3818 * match all except 'a', 'A', 'b', and 'B'. This performs a full 3819 * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'. 3820 * 3821 * <p>This value is an options bit set value for some 3822 * constructors, applyPattern(), and closeOver(). 3823 * It can be ORed together with other, unrelated options. 3824 * 3825 * <p>The resulting set is a superset of the input for the code points but 3826 * not for the strings. 
3827 * It performs a case mapping closure of the code points and adds 3828 * full case folding strings for the code points, and reduces strings of 3829 * the original set to their full case folding equivalents. 3830 * 3831 * <p>This is designed for case-insensitive matches, for example 3832 * in regular expressions. The full code point case closure allows checking of 3833 * an input character directly against the closure set. 3834 * Strings are matched by comparing the case-folded form from the closure 3835 * set with an incremental case folding of the string in question. 3836 * 3837 * <p>The closure set will also contain single code points if the original 3838 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 3839 * This is not necessary (that is, redundant) for the above matching method 3840 * but results in the same closure sets regardless of whether the original 3841 * set contained the code point or a string. 3842 */ 3843 public static final int CASE_INSENSITIVE = 2; 3844 3845 /** 3846 * Adds all case mappings for each element in the set. 3847 * This adds the full lower-, title-, and uppercase mappings as well as the full case folding 3848 * of each existing element in the set. 3849 * 3850 * <p>This value is an options bit set value for some 3851 * constructors, applyPattern(), and closeOver(). 3852 * It can be ORed together with other, unrelated options. 3853 * 3854 * <p>Unlike the “case insensitive” options, this does not perform a closure. 3855 * For example, it does not add 'ſ' (U+017F long s) for 's', 3856 * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions. 3857 */ 3858 public static final int ADD_CASE_MAPPINGS = 4; 3859 3860 /** 3861 * Enable case insensitive matching. 
3862 * Same as {@link #CASE_INSENSITIVE} but using only Simple_Case_Folding (scf) mappings, 3863 * which map each code point to one code point, 3864 * not full Case_Folding (cf) mappings, which map some code points to multiple code points. 3865 * 3866 * <p>This is designed for case-insensitive matches, for example in certain 3867 * regular expression implementations where only Simple_Case_Folding mappings are used, 3868 * such as in ECMAScript (JavaScript) regular expressions. 3869 * 3870 * <p>This value is an options bit set value for some 3871 * constructors, applyPattern(), and closeOver(). 3872 * It can be ORed together with other, unrelated options. 3873 * 3874 * @hide unsupported on Android 3875 */ 3876 public static final int SIMPLE_CASE_INSENSITIVE = 6; 3877 3878 private static final int CASE_MASK = CASE_INSENSITIVE | ADD_CASE_MAPPINGS; 3879 3880 // add the result of a full case mapping to the set 3881 // use str as a temporary string to avoid constructing one addCaseMapping(UnicodeSet set, int result, StringBuilder full)3882 private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) { 3883 if(result >= 0) { 3884 if(result > UCaseProps.MAX_STRING_LENGTH) { 3885 // add a single-code point case mapping 3886 set.add(result); 3887 } else { 3888 // add a string case mapping from full with length result 3889 set.add(full.toString()); 3890 full.setLength(0); 3891 } 3892 } 3893 // result < 0: the code point mapped to itself, no need to add it 3894 // see UCaseProps 3895 } 3896 3897 /** For case closure on a large set, look only at code points with relevant properties. */ maybeOnlyCaseSensitive(UnicodeSet src)3898 UnicodeSet maybeOnlyCaseSensitive(UnicodeSet src) { 3899 if (src.size() < 30) { 3900 return src; 3901 } 3902 // Return the intersection of the src code points with Case_Sensitive ones. 3903 UnicodeSet sensitive = CharacterProperties.getBinaryPropertySet(UProperty.CASE_SENSITIVE); 3904 // Start by cloning the "smaller" set. 
Try not to copy the strings, if there are any in src. 3905 if (src.hasStrings() || src.getRangeCount() > sensitive.getRangeCount()) { 3906 return sensitive.cloneAsThawed().retainAll(src); 3907 } else { 3908 return ((UnicodeSet) src.clone()).retainAll(sensitive); 3909 } 3910 } 3911 3912 // Per-character scf = Simple_Case_Folding of a string. 3913 // (Normally when we case-fold a string we use full case foldings.) scfString(CharSequence s, StringBuilder scf)3914 private static final boolean scfString(CharSequence s, StringBuilder scf) { 3915 int length = s.length(); 3916 // Loop while not needing modification. 3917 for (int i = 0; i < length;) { 3918 int c = Character.codePointAt(s, i); 3919 int scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT); 3920 if (scfChar != c) { 3921 // Copy the characters before c. 3922 scf.setLength(0); 3923 scf.append(s, 0, i); 3924 // Loop over the rest of the string and keep case-folding. 3925 for (;;) { 3926 scf.appendCodePoint(scfChar); 3927 i += Character.charCount(c); 3928 if (i == length) { 3929 return true; 3930 } 3931 c = Character.codePointAt(s, i); 3932 scfChar = UCharacter.foldCase(c, UCharacter.FOLD_CASE_DEFAULT); 3933 } 3934 } 3935 i += Character.charCount(c); 3936 } 3937 return false; 3938 } 3939 3940 /** 3941 * Close this set over the given attribute. For the attribute 3942 * {@link #CASE_INSENSITIVE}, the result is to modify this set so that: 3943 * 3944 * <ol> 3945 * <li>For each character or string 'a' in this set, all strings 3946 * 'b' such that foldCase(a) == foldCase(b) are added to this set. 3947 * (For most 'a' that are single characters, 'b' will have 3948 * b.length() == 1.) 3949 * 3950 * <li>For each string 'e' in the resulting set, if e != 3951 * foldCase(e), 'e' will be removed. 
3952 * </ol> 3953 * 3954 * <p>Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] 3955 * 3956 * <p>(Here foldCase(x) refers to the operation 3957 * UCharacter.foldCase(x, true), and a == b actually denotes 3958 * a.equals(b), not pointer comparison.) 3959 * 3960 * @param attribute bitmask for attributes to close over. 3961 * Valid options: 3962 * At most one of {@link #CASE_INSENSITIVE}, {@link #ADD_CASE_MAPPINGS}, 3963 * {@link #SIMPLE_CASE_INSENSITIVE}. These case options are mutually exclusive. 3964 * Unrelated options bits are ignored. 3965 * @return a reference to this set. 3966 */ closeOver(int attribute)3967 public UnicodeSet closeOver(int attribute) { 3968 checkFrozen(); 3969 switch (attribute & CASE_MASK) { 3970 case 0: 3971 break; 3972 case CASE_INSENSITIVE: 3973 closeOverCaseInsensitive(/* simple= */ false); 3974 break; 3975 case ADD_CASE_MAPPINGS: 3976 closeOverAddCaseMappings(); 3977 break; 3978 case SIMPLE_CASE_INSENSITIVE: 3979 closeOverCaseInsensitive(/* simple= */ true); 3980 break; 3981 default: 3982 // bad option (unreachable) 3983 break; 3984 } 3985 return this; 3986 } 3987 closeOverCaseInsensitive(boolean simple)3988 private void closeOverCaseInsensitive(boolean simple) { 3989 UCaseProps csp = UCaseProps.INSTANCE; 3990 // Start with input set to guarantee inclusion. 3991 UnicodeSet foldSet = new UnicodeSet(this); 3992 3993 // Full case mappings closure: 3994 // Remove strings because the strings will actually be reduced (folded); 3995 // therefore, start with no strings and add only those needed. 3996 // Do this before processing code points, because they may add strings. 3997 if (!simple && foldSet.hasStrings()) { 3998 foldSet.strings.clear(); 3999 } 4000 4001 UnicodeSet codePoints = maybeOnlyCaseSensitive(this); 4002 4003 // Iterate over the ranges of single code points. Nested loop for each code point. 
4004 int n = codePoints.getRangeCount(); 4005 for (int i=0; i<n; ++i) { 4006 int start = codePoints.getRangeStart(i); 4007 int end = codePoints.getRangeEnd(i); 4008 4009 if (simple) { 4010 for (int cp=start; cp<=end; ++cp) { 4011 csp.addSimpleCaseClosure(cp, foldSet); 4012 } 4013 } else { 4014 for (int cp=start; cp<=end; ++cp) { 4015 csp.addCaseClosure(cp, foldSet); 4016 } 4017 } 4018 } 4019 if (hasStrings()) { 4020 StringBuilder sb = simple ? new StringBuilder() : null; 4021 for (String s : strings) { 4022 if (simple) { 4023 if (scfString(s, sb)) { 4024 foldSet.remove(s).add(sb); 4025 } 4026 } else { 4027 String str = UCharacter.foldCase(s, 0); 4028 if(!csp.addStringCaseClosure(str, foldSet)) { 4029 foldSet.add(str); // does not map to code points: add the folded string itself 4030 } 4031 } 4032 } 4033 } 4034 set(foldSet); 4035 } 4036 closeOverAddCaseMappings()4037 private void closeOverAddCaseMappings() { 4038 UCaseProps csp = UCaseProps.INSTANCE; 4039 // Start with input set to guarantee inclusion. 4040 UnicodeSet foldSet = new UnicodeSet(this); 4041 4042 UnicodeSet codePoints = maybeOnlyCaseSensitive(this); 4043 4044 // Iterate over the ranges of single code points. Nested loop for each code point. 
4045 int n = codePoints.getRangeCount(); 4046 int result; 4047 StringBuilder full = new StringBuilder(); 4048 4049 for (int i=0; i<n; ++i) { 4050 int start = codePoints.getRangeStart(i); 4051 int end = codePoints.getRangeEnd(i); 4052 4053 // add case mappings 4054 // (does not add long s for regular s, or Kelvin for k, for example) 4055 for (int cp=start; cp<=end; ++cp) { 4056 result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); 4057 addCaseMapping(foldSet, result, full); 4058 4059 result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); 4060 addCaseMapping(foldSet, result, full); 4061 4062 result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); 4063 addCaseMapping(foldSet, result, full); 4064 4065 result = csp.toFullFolding(cp, full, 0); 4066 addCaseMapping(foldSet, result, full); 4067 } 4068 } 4069 if (hasStrings()) { 4070 ULocale root = ULocale.ROOT; 4071 BreakIterator bi = BreakIterator.getWordInstance(root); 4072 for (String str : strings) { 4073 // TODO: call lower-level functions 4074 foldSet.add(UCharacter.toLowerCase(root, str)); 4075 foldSet.add(UCharacter.toTitleCase(root, str, bi)); 4076 foldSet.add(UCharacter.toUpperCase(root, str)); 4077 foldSet.add(UCharacter.foldCase(str, 0)); 4078 } 4079 } 4080 set(foldSet); 4081 } 4082 4083 /** 4084 * Internal class for customizing UnicodeSet parsing of properties. 4085 * TODO: extend to allow customizing of codepoint ranges 4086 * @author medavis 4087 * @hide Only a subset of ICU is exposed in Android 4088 * @hide draft / provisional / internal are hidden on Android 4089 */ 4090 abstract public static class XSymbolTable implements SymbolTable { 4091 /** 4092 * Default constructor 4093 * @hide draft / provisional / internal are hidden on Android 4094 */ XSymbolTable()4095 public XSymbolTable(){} 4096 /** 4097 * Supplies default implementation for SymbolTable (no action). 
4098 * @hide draft / provisional / internal are hidden on Android 4099 */ 4100 @Override lookupMatcher(int i)4101 public UnicodeMatcher lookupMatcher(int i) { 4102 return null; 4103 } 4104 4105 /** 4106 * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style 4107 * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be. 4108 * <p> 4109 * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a 4110 * propertyName+propertyValue combination. 4111 * 4112 * @param propertyName 4113 * the name of the property 4114 * @param propertyValue 4115 * the name of the property value 4116 * @param result UnicodeSet value to change 4117 * a set to which the characters having the propertyName+propertyValue are to be added. 4118 * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters 4119 * with that property have been added to the UnicodeSet, and returns false if the 4120 * propertyName+propertyValue combination is not recognized (in which case result is unaltered). 4121 * @hide draft / provisional / internal are hidden on Android 4122 */ applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result)4123 public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { 4124 return false; 4125 } 4126 /** 4127 * Supplies default implementation for SymbolTable (no action). 4128 * @hide draft / provisional / internal are hidden on Android 4129 */ 4130 @Override lookup(String s)4131 public char[] lookup(String s) { 4132 return null; 4133 } 4134 /** 4135 * Supplies default implementation for SymbolTable (no action). 
4136 * @hide draft / provisional / internal are hidden on Android 4137 */ 4138 @Override parseReference(String text, ParsePosition pos, int limit)4139 public String parseReference(String text, ParsePosition pos, int limit) { 4140 return null; 4141 } 4142 } 4143 4144 /** 4145 * Is this frozen, according to the Freezable interface? 4146 * 4147 * @return value 4148 */ 4149 @Override isFrozen()4150 public boolean isFrozen() { 4151 return (bmpSet != null || stringSpan != null); 4152 } 4153 4154 /** 4155 * Freeze this class, according to the Freezable interface. 4156 * 4157 * @return this 4158 */ 4159 @Override freeze()4160 public UnicodeSet freeze() { 4161 if (!isFrozen()) { 4162 compact(); 4163 4164 // Optimize contains() and span() and similar functions. 4165 if (hasStrings()) { 4166 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), UnicodeSetStringSpan.ALL); 4167 } 4168 if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { 4169 // Optimize for code point spans. 4170 // There are no strings, or 4171 // all strings are irrelevant for span() etc. because 4172 // all of each string's code points are contained in this set. 4173 // However, fully contained strings are relevant for spanAndCount(), 4174 // so we create both objects. 4175 bmpSet = new BMPSet(list, len); 4176 } 4177 } 4178 return this; 4179 } 4180 4181 /** 4182 * Span a string using this UnicodeSet. 4183 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4184 * @param s The string to be spanned 4185 * @param spanCondition The span condition 4186 * @return the length of the span 4187 */ span(CharSequence s, SpanCondition spanCondition)4188 public int span(CharSequence s, SpanCondition spanCondition) { 4189 return span(s, 0, spanCondition); 4190 } 4191 4192 /** 4193 * Span a string using this UnicodeSet. 4194 * If the start index is less than 0, span will start from 0. 
4195 * If the start index is greater than the string length, span returns the string length. 4196 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4197 * @param s The string to be spanned 4198 * @param start The start index that the span begins 4199 * @param spanCondition The span condition 4200 * @return the string index which ends the span (i.e. exclusive) 4201 */ span(CharSequence s, int start, SpanCondition spanCondition)4202 public int span(CharSequence s, int start, SpanCondition spanCondition) { 4203 int end = s.length(); 4204 if (start < 0) { 4205 start = 0; 4206 } else if (start >= end) { 4207 return end; 4208 } 4209 if (bmpSet != null) { 4210 // Frozen set without strings, or no string is relevant for span(). 4211 return bmpSet.span(s, start, spanCondition, null); 4212 } 4213 if (stringSpan != null) { 4214 return stringSpan.span(s, start, spanCondition); 4215 } else if (hasStrings()) { 4216 int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4217 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4218 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4219 if (strSpan.needsStringSpanUTF16()) { 4220 return strSpan.span(s, start, spanCondition); 4221 } 4222 } 4223 4224 return spanCodePointsAndCount(s, start, spanCondition, null); 4225 } 4226 4227 /** 4228 * Same as span() but also counts the smallest number of set elements on any path across the span. 4229 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4230 * @param outCount An output-only object (must not be null) for returning the count. 4231 * @return the limit (exclusive end) of the span 4232 * @deprecated This API is ICU internal only. 
4233 * @hide original deprecated declaration 4234 * @hide draft / provisional / internal are hidden on Android 4235 */ 4236 @Deprecated spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4237 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { 4238 if (outCount == null) { 4239 throw new IllegalArgumentException("outCount must not be null"); 4240 } 4241 int end = s.length(); 4242 if (start < 0) { 4243 start = 0; 4244 } else if (start >= end) { 4245 return end; 4246 } 4247 if (stringSpan != null) { 4248 // We might also have bmpSet != null, 4249 // but fully-contained strings are relevant for counting elements. 4250 return stringSpan.spanAndCount(s, start, spanCondition, outCount); 4251 } else if (bmpSet != null) { 4252 return bmpSet.span(s, start, spanCondition, outCount); 4253 } else if (hasStrings()) { 4254 int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4255 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4256 which |= UnicodeSetStringSpan.WITH_COUNT; 4257 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4258 return strSpan.spanAndCount(s, start, spanCondition, outCount); 4259 } 4260 4261 return spanCodePointsAndCount(s, start, spanCondition, outCount); 4262 } 4263 spanCodePointsAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4264 private int spanCodePointsAndCount(CharSequence s, int start, 4265 SpanCondition spanCondition, OutputInt outCount) { 4266 // Pin to 0/1 values. 
4267 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4268 4269 int c; 4270 int next = start; 4271 int length = s.length(); 4272 int count = 0; 4273 do { 4274 c = Character.codePointAt(s, next); 4275 if (spanContained != contains(c)) { 4276 break; 4277 } 4278 ++count; 4279 next += Character.charCount(c); 4280 } while (next < length); 4281 if (outCount != null) { outCount.value = count; } 4282 return next; 4283 } 4284 4285 /** 4286 * Span a string backwards (from the end) using this UnicodeSet. 4287 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4288 * @param s The string to be spanned 4289 * @param spanCondition The span condition 4290 * @return The string index which starts the span (i.e. inclusive). 4291 */ spanBack(CharSequence s, SpanCondition spanCondition)4292 public int spanBack(CharSequence s, SpanCondition spanCondition) { 4293 return spanBack(s, s.length(), spanCondition); 4294 } 4295 4296 /** 4297 * Span a string backwards (from the fromIndex) using this UnicodeSet. 4298 * If the fromIndex is less than 0, spanBack will return 0. 4299 * If fromIndex is greater than the string length, spanBack will start from the string length. 4300 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4301 * @param s The string to be spanned 4302 * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards 4303 * @param spanCondition The span condition 4304 * @return The string index which starts the span (i.e. inclusive). 
4305 */ spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition)4306 public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { 4307 if (fromIndex <= 0) { 4308 return 0; 4309 } 4310 if (fromIndex > s.length()) { 4311 fromIndex = s.length(); 4312 } 4313 if (bmpSet != null) { 4314 // Frozen set without strings, or no string is relevant for spanBack(). 4315 return bmpSet.spanBack(s, fromIndex, spanCondition); 4316 } 4317 if (stringSpan != null) { 4318 return stringSpan.spanBack(s, fromIndex, spanCondition); 4319 } else if (hasStrings()) { 4320 int which = (spanCondition == SpanCondition.NOT_CONTAINED) 4321 ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED 4322 : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; 4323 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4324 if (strSpan.needsStringSpanUTF16()) { 4325 return strSpan.spanBack(s, fromIndex, spanCondition); 4326 } 4327 } 4328 4329 // Pin to 0/1 values. 4330 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4331 4332 int c; 4333 int prev = fromIndex; 4334 do { 4335 c = Character.codePointBefore(s, prev); 4336 if (spanContained != contains(c)) { 4337 break; 4338 } 4339 prev -= Character.charCount(c); 4340 } while (prev > 0); 4341 return prev; 4342 } 4343 4344 /** 4345 * Clone a thawed version of this class, according to the Freezable interface. 
4346 * @return the clone, not frozen 4347 */ 4348 @Override cloneAsThawed()4349 public UnicodeSet cloneAsThawed() { 4350 UnicodeSet result = new UnicodeSet(this); 4351 assert !result.isFrozen(); 4352 return result; 4353 } 4354 4355 // internal function checkFrozen()4356 private void checkFrozen() { 4357 if (isFrozen()) { 4358 throw new UnsupportedOperationException("Attempt to modify frozen object"); 4359 } 4360 } 4361 4362 // ************************ 4363 // Additional methods for integration with Generics and Collections 4364 // ************************ 4365 4366 /** 4367 * A struct-like class used for iteration through ranges, for faster iteration than by String. 4368 * Read about the restrictions on usage in {@link UnicodeSet#ranges()}. 4369 */ 4370 public static class EntryRange { 4371 /** 4372 * The starting code point of the range. 4373 */ 4374 public int codepoint; 4375 /** 4376 * The ending code point of the range 4377 */ 4378 public int codepointEnd; 4379 EntryRange()4380 EntryRange() { 4381 } 4382 4383 /** 4384 * {@inheritDoc} 4385 */ 4386 @Override toString()4387 public String toString() { 4388 StringBuilder b = new StringBuilder(); 4389 return ( 4390 codepoint == codepointEnd ? _appendToPat(b, codepoint, false) 4391 : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false)) 4392 .toString(); 4393 } 4394 } 4395 4396 /** 4397 * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points. 4398 * The UnicodeSet must not be altered during the iteration. 4399 * The EntryRange instance is the same each time; the contents are just reset. 4400 * 4401 * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings. 4402 * 4403 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4404 * Do not alter the UnicodeSet while iterating. 
4405 * 4406 * <pre> 4407 * // Sample code 4408 * for (EntryRange range : us1.ranges()) { 4409 * // do something with code points between range.codepoint and range.codepointEnd; 4410 * } 4411 * for (String s : us1.strings()) { 4412 * // do something with each string; 4413 * } 4414 * </pre> 4415 */ ranges()4416 public Iterable<EntryRange> ranges() { 4417 return new EntryRangeIterable(); 4418 } 4419 4420 private class EntryRangeIterable implements Iterable<EntryRange> { 4421 @Override iterator()4422 public Iterator<EntryRange> iterator() { 4423 return new EntryRangeIterator(); 4424 } 4425 } 4426 4427 private class EntryRangeIterator implements Iterator<EntryRange> { 4428 int pos; 4429 EntryRange result = new EntryRange(); 4430 4431 @Override hasNext()4432 public boolean hasNext() { 4433 return pos < len-1; 4434 } 4435 @Override next()4436 public EntryRange next() { 4437 if (pos < len-1) { 4438 result.codepoint = list[pos++]; 4439 result.codepointEnd = list[pos++]-1; 4440 } else { 4441 throw new NoSuchElementException(); 4442 } 4443 return result; 4444 } 4445 @Override remove()4446 public void remove() { 4447 throw new UnsupportedOperationException(); 4448 } 4449 } 4450 4451 4452 /** 4453 * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}. 4454 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4455 * Do not alter the UnicodeSet while iterating. 4456 * @see java.util.Set#iterator() 4457 */ 4458 @Override iterator()4459 public Iterator<String> iterator() { 4460 return new UnicodeSetIterator2(this); 4461 } 4462 4463 // Cover for string iteration. 
4464 private static class UnicodeSetIterator2 implements Iterator<String> { 4465 // Invariants: 4466 // sourceList != null then sourceList[item] is a valid character 4467 // sourceList == null then delegates to stringIterator 4468 private int[] sourceList; 4469 private int len; 4470 private int item; 4471 private int current; 4472 private int limit; 4473 private SortedSet<String> sourceStrings; 4474 private Iterator<String> stringIterator; 4475 private char[] buffer; 4476 UnicodeSetIterator2(UnicodeSet source)4477 UnicodeSetIterator2(UnicodeSet source) { 4478 // set according to invariants 4479 len = source.len - 1; 4480 if (len > 0) { 4481 sourceStrings = source.strings; 4482 sourceList = source.list; 4483 current = sourceList[item++]; 4484 limit = sourceList[item++]; 4485 } else { 4486 stringIterator = source.strings.iterator(); 4487 sourceList = null; 4488 } 4489 } 4490 4491 /* (non-Javadoc) 4492 * @see java.util.Iterator#hasNext() 4493 */ 4494 @Override hasNext()4495 public boolean hasNext() { 4496 return sourceList != null || stringIterator.hasNext(); 4497 } 4498 4499 /* (non-Javadoc) 4500 * @see java.util.Iterator#next() 4501 */ 4502 @Override next()4503 public String next() { 4504 if (sourceList == null) { 4505 return stringIterator.next(); 4506 } 4507 int codepoint = current++; 4508 // we have the codepoint we need, but we may need to adjust the state 4509 if (current >= limit) { 4510 if (item >= len) { 4511 stringIterator = sourceStrings.iterator(); 4512 sourceList = null; 4513 } else { 4514 current = sourceList[item++]; 4515 limit = sourceList[item++]; 4516 } 4517 } 4518 // Now return. Single code point is easy 4519 if (codepoint <= 0xFFFF) { 4520 return String.valueOf((char)codepoint); 4521 } 4522 // But Java lacks a valueOfCodePoint, so we handle ourselves for speed 4523 // allocate a buffer the first time, to make conversion faster. 
4524 if (buffer == null) { 4525 buffer = new char[2]; 4526 } 4527 // compute ourselves, to save tests and calls 4528 int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT; 4529 buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE); 4530 buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE); 4531 return String.valueOf(buffer); 4532 } 4533 4534 /* (non-Javadoc) 4535 * @see java.util.Iterator#remove() 4536 */ 4537 @Override remove()4538 public void remove() { 4539 throw new UnsupportedOperationException(); 4540 } 4541 } 4542 4543 /** 4544 * @see #containsAll(android.icu.text.UnicodeSet) 4545 */ containsAll(Iterable<T> collection)4546 public <T extends CharSequence> boolean containsAll(Iterable<T> collection) { 4547 for (T o : collection) { 4548 if (!contains(o)) { 4549 return false; 4550 } 4551 } 4552 return true; 4553 } 4554 4555 /** 4556 * @see #containsNone(android.icu.text.UnicodeSet) 4557 */ containsNone(Iterable<T> collection)4558 public <T extends CharSequence> boolean containsNone(Iterable<T> collection) { 4559 for (T o : collection) { 4560 if (contains(o)) { 4561 return false; 4562 } 4563 } 4564 return true; 4565 } 4566 4567 /** 4568 * @see #containsAll(android.icu.text.UnicodeSet) 4569 */ containsSome(Iterable<T> collection)4570 public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) { 4571 return !containsNone(collection); 4572 } 4573 4574 /** 4575 * @see #addAll(android.icu.text.UnicodeSet) 4576 */ 4577 @SuppressWarnings("unchecked") // See ticket #11395, this is safe. addAll(T... collection)4578 public <T extends CharSequence> UnicodeSet addAll(T... 
collection) { 4579 checkFrozen(); 4580 for (T str : collection) { 4581 add(str); 4582 } 4583 return this; 4584 } 4585 4586 4587 /** 4588 * @see #removeAll(android.icu.text.UnicodeSet) 4589 */ removeAll(Iterable<T> collection)4590 public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) { 4591 checkFrozen(); 4592 for (T o : collection) { 4593 remove(o); 4594 } 4595 return this; 4596 } 4597 4598 /** 4599 * @see #retainAll(android.icu.text.UnicodeSet) 4600 */ retainAll(Iterable<T> collection)4601 public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) { 4602 checkFrozen(); 4603 // TODO optimize 4604 UnicodeSet toRetain = new UnicodeSet(); 4605 toRetain.addAll(collection); 4606 retainAll(toRetain); 4607 return this; 4608 } 4609 4610 /** 4611 * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}. 4612 */ 4613 public enum ComparisonStyle { 4614 /** 4615 */ 4616 SHORTER_FIRST, 4617 /** 4618 */ 4619 LEXICOGRAPHIC, 4620 /** 4621 */ 4622 LONGER_FIRST 4623 } 4624 4625 /** 4626 * Compares UnicodeSets, where shorter come first, and otherwise lexicographically 4627 * (according to the comparison of the first characters that differ). 4628 * @see java.lang.Comparable#compareTo(java.lang.Object) 4629 */ 4630 @Override compareTo(UnicodeSet o)4631 public int compareTo(UnicodeSet o) { 4632 return compareTo(o, ComparisonStyle.SHORTER_FIRST); 4633 } 4634 /** 4635 * Compares UnicodeSets, in three different ways. 4636 * @see java.lang.Comparable#compareTo(java.lang.Object) 4637 */ compareTo(UnicodeSet o, ComparisonStyle style)4638 public int compareTo(UnicodeSet o, ComparisonStyle style) { 4639 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4640 int diff = size() - o.size(); 4641 if (diff != 0) { 4642 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? 
-1 : 1; 4643 } 4644 } 4645 int result; 4646 for (int i = 0; ; ++i) { 4647 if (0 != (result = list[i] - o.list[i])) { 4648 // if either list ran out, compare to the last string 4649 if (list[i] == HIGH) { 4650 if (!hasStrings()) return 1; 4651 String item = strings.first(); 4652 return compare(item, o.list[i]); 4653 } 4654 if (o.list[i] == HIGH) { 4655 if (!o.hasStrings()) return -1; 4656 String item = o.strings.first(); 4657 int compareResult = compare(item, list[i]); 4658 return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order. 4659 } 4660 // otherwise return the result if even index, or the reversal if not 4661 return (i & 1) == 0 ? result : -result; 4662 } 4663 if (list[i] == HIGH) { 4664 break; 4665 } 4666 } 4667 return compare(strings, o.strings); 4668 } 4669 4670 /** 4671 */ compareTo(Iterable<String> other)4672 public int compareTo(Iterable<String> other) { 4673 return compare(this, other); 4674 } 4675 4676 /** 4677 * Utility to compare a string to a code point. 4678 * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString()) 4679 * and comparing, but much faster (no object creation). 4680 * Actually, there is one difference; a null compares as less. 4681 * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order. 4682 * @hide unsupported on Android 4683 */ 4684 compare(CharSequence string, int codePoint)4685 public static int compare(CharSequence string, int codePoint) { 4686 return CharSequences.compare(string, codePoint); 4687 } 4688 4689 /** 4690 * Utility to compare a string to a code point. 4691 * Same results as turning the code point into a string and comparing, but much faster (no object creation). 4692 * Actually, there is one difference; a null compares as less. 4693 * Note that this (=String) order is UTF-16 order -- <i>not</i> code point order. 
4694 * @hide unsupported on Android 4695 */ compare(int codePoint, CharSequence string)4696 public static int compare(int codePoint, CharSequence string) { 4697 return -CharSequences.compare(string, codePoint); 4698 } 4699 4700 4701 /** 4702 * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered, 4703 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4704 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4705 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4706 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4707 * @hide unsupported on Android 4708 */ compare(Iterable<T> collection1, Iterable<T> collection2)4709 public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) { 4710 return compare(collection1.iterator(), collection2.iterator()); 4711 } 4712 4713 /** 4714 * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered, 4715 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4716 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4717 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4718 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4719 * @deprecated This API is ICU internal only. 
4720 * @hide original deprecated declaration 4721 * @hide draft / provisional / internal are hidden on Android 4722 */ 4723 @Deprecated compare(Iterator<T> first, Iterator<T> other)4724 public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) { 4725 while (true) { 4726 if (!first.hasNext()) { 4727 return other.hasNext() ? -1 : 0; 4728 } else if (!other.hasNext()) { 4729 return 1; 4730 } 4731 T item1 = first.next(); 4732 T item2 = other.next(); 4733 int result = item1.compareTo(item2); 4734 if (result != 0) { 4735 return result; 4736 } 4737 } 4738 } 4739 4740 4741 /** 4742 * Utility to compare two collections, optionally by size, and then lexicographically. 4743 * @hide unsupported on Android 4744 */ compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style)4745 public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) { 4746 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4747 int diff = collection1.size() - collection2.size(); 4748 if (diff != 0) { 4749 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1; 4750 } 4751 } 4752 return compare(collection1, collection2); 4753 } 4754 4755 /** 4756 * Utility for adding the contents of an iterable to a collection. 4757 * @hide unsupported on Android 4758 */ addAllTo(Iterable<T> source, U target)4759 public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) { 4760 for (T item : source) { 4761 target.add(item); 4762 } 4763 return target; 4764 } 4765 4766 /** 4767 * Utility for adding the contents of an iterable to a collection. 4768 * @hide unsupported on Android 4769 */ addAllTo(Iterable<T> source, T[] target)4770 public static <T> T[] addAllTo(Iterable<T> source, T[] target) { 4771 int i = 0; 4772 for (T item : source) { 4773 target[i++] = item; 4774 } 4775 return target; 4776 } 4777 4778 /** 4779 * For iterating through the strings in the set. 
Example: 4780 * <pre> 4781 * for (String key : myUnicodeSet.strings()) { 4782 * doSomethingWith(key); 4783 * } 4784 * </pre> 4785 */ strings()4786 public Collection<String> strings() { 4787 if (hasStrings()) { 4788 return Collections.unmodifiableSortedSet(strings); 4789 } else { 4790 return EMPTY_STRINGS; 4791 } 4792 } 4793 4794 /** 4795 * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE. 4796 * @deprecated This API is ICU internal only. 4797 * @hide original deprecated declaration 4798 * @hide draft / provisional / internal are hidden on Android 4799 */ 4800 @Deprecated getSingleCodePoint(CharSequence s)4801 public static int getSingleCodePoint(CharSequence s) { 4802 return CharSequences.getSingleCodePoint(s); 4803 } 4804 4805 /** 4806 * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set. 4807 * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E 4808 * if the dontCare set includes unassigned characters (for a particular version of Unicode). 4809 * @param dontCare Set with the don't-care characters for spanning 4810 * @return the input set, modified 4811 * @deprecated This API is ICU internal only. 
4812 * @hide original deprecated declaration 4813 * @hide draft / provisional / internal are hidden on Android 4814 */ 4815 @Deprecated addBridges(UnicodeSet dontCare)4816 public UnicodeSet addBridges(UnicodeSet dontCare) { 4817 UnicodeSet notInInput = new UnicodeSet(this).complement().removeAllStrings(); 4818 for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) { 4819 if (it.codepoint != 0 && it.codepointEnd != 0x10FFFF && 4820 dontCare.contains(it.codepoint, it.codepointEnd)) { 4821 add(it.codepoint,it.codepointEnd); 4822 } 4823 } 4824 return this; 4825 } 4826 4827 /** 4828 * Find the first index at or after fromIndex where the UnicodeSet matches at that index. 4829 * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match. 4830 * If there is no match, length is returned. 4831 * @deprecated This API is ICU internal only. Use span instead. 4832 * @hide original deprecated declaration 4833 * @hide draft / provisional / internal are hidden on Android 4834 */ 4835 @Deprecated findIn(CharSequence value, int fromIndex, boolean findNot)4836 public int findIn(CharSequence value, int fromIndex, boolean findNot) { 4837 //TODO add strings, optimize, using ICU4C algorithms 4838 int cp; 4839 for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) { 4840 cp = UTF16.charAt(value, fromIndex); 4841 if (contains(cp) != findNot) { 4842 break; 4843 } 4844 } 4845 return fromIndex; 4846 } 4847 4848 /** 4849 * Find the last index before fromIndex where the UnicodeSet matches at that index. 4850 * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match. 4851 * If there is no match, -1 is returned. 4852 * BEFORE index is not in the UnicodeSet. 4853 * @deprecated This API is ICU internal only. Use spanBack instead. 
4854 * @hide original deprecated declaration 4855 * @hide draft / provisional / internal are hidden on Android 4856 */ 4857 @Deprecated findLastIn(CharSequence value, int fromIndex, boolean findNot)4858 public int findLastIn(CharSequence value, int fromIndex, boolean findNot) { 4859 //TODO add strings, optimize, using ICU4C algorithms 4860 int cp; 4861 fromIndex -= 1; 4862 for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) { 4863 cp = UTF16.charAt(value, fromIndex); 4864 if (contains(cp) != findNot) { 4865 break; 4866 } 4867 } 4868 return fromIndex < 0 ? -1 : fromIndex; 4869 } 4870 4871 /** 4872 * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match. 4873 * @param source The source of the CharSequence to strip from. 4874 * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object. 4875 * @return The string after it has been stripped. 4876 * @deprecated This API is ICU internal only. Use replaceFrom. 4877 * @hide original deprecated declaration 4878 * @hide draft / provisional / internal are hidden on Android 4879 */ 4880 @Deprecated stripFrom(CharSequence source, boolean matches)4881 public String stripFrom(CharSequence source, boolean matches) { 4882 StringBuilder result = new StringBuilder(); 4883 for (int pos = 0; pos < source.length();) { 4884 int inside = findIn(source, pos, !matches); 4885 result.append(source.subSequence(pos, inside)); 4886 pos = findIn(source, inside, matches); // get next start 4887 } 4888 return result.toString(); 4889 } 4890 4891 /** 4892 * Argument values for whether span() and similar functions continue while the current character is contained vs. 4893 * not contained in the set. 4894 * <p> 4895 * The functionality is straightforward for sets with only single code points, without strings (which is the common 4896 * case): 4897 * <ul> 4898 * <li>CONTAINED and SIMPLE work the same. 
4899 * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED. 4900 * <li>span() and spanBack() partition any string the 4901 * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition). 4902 * <li>Using a 4903 * complemented (inverted) set and the opposite span conditions yields the same results. 4904 * </ul> 4905 * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in 4906 * the set (for example, whether they overlap with each other) and the string that is processed. For a set with 4907 * strings: 4908 * <ul> 4909 * <li>The complement of the set contains the opposite set of code points, but the same set of strings. 4910 * Therefore, complementing both the set and the span conditions may yield different results. 4911 * <li>When starting spans 4912 * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 4913 * because a set string may start before the later position. 4914 * <li>span(SIMPLE) may be shorter than 4915 * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which 4916 * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax", 4917 * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED). 4918 * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, 4919 * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield 4920 * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }. 
4921 * </ul> 4922 * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then 4923 * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could 4924 * be used. 4925 * <p> 4926 * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point 4927 * boundaries, never in the middle of a surrogate pair. 4928 */ 4929 public enum SpanCondition { 4930 /** 4931 * Continues a span() while there is no set element at the current position. 4932 * Increments by one code point at a time. 4933 * Stops before the first set element (character or string). 4934 * (For code points only, this is like while contains(current)==false). 4935 * <p> 4936 * When span() returns, the substring between where it started and the position it returned consists only of 4937 * characters that are not in the set, and none of its strings overlap with the span. 4938 */ 4939 NOT_CONTAINED, 4940 4941 /** 4942 * Spans the longest substring that is a concatenation of set elements (characters or strings). 4943 * (For characters only, this is like while contains(current)==true). 4944 * <p> 4945 * When span() returns, the substring between where it started and the position it returned consists only of set 4946 * elements (characters or strings) that are in the set. 4947 * <p> 4948 * If a set contains strings, then the span will be the longest substring for which there 4949 * exists at least one non-overlapping concatenation of set elements (characters or strings). 4950 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 4951 * (Java/ICU/Perl regex stops at the first match of an OR.) 4952 */ 4953 CONTAINED, 4954 4955 /** 4956 * Continues a span() while there is a set element at the current position. 4957 * Increments by the longest matching element at each position. 
4958 * (For characters only, this is like while contains(current)==true). 4959 * <p> 4960 * When span() returns, the substring between where it started and the position it returned consists only of set 4961 * elements (characters or strings) that are in the set. 4962 * <p> 4963 * If a set only contains single characters, then this is the same as CONTAINED. 4964 * <p> 4965 * If a set contains strings, then the span will be the longest substring with a match at each position with the 4966 * longest single set element (character or string). 4967 * <p> 4968 * Use this span condition together with other longest-match algorithms, such as ICU converters 4969 * (ucnv_getUnicodeSet()). 4970 */ 4971 SIMPLE, 4972 4973 /** 4974 * One more than the last span condition. 4975 */ 4976 CONDITION_COUNT 4977 } 4978 4979 /** 4980 * Get the default symbol table. Null means ordinary processing. For internal use only. 4981 * @return the symbol table 4982 * @deprecated This API is ICU internal only. 4983 * @hide original deprecated declaration 4984 * @hide draft / provisional / internal are hidden on Android 4985 */ 4986 @Deprecated getDefaultXSymbolTable()4987 public static XSymbolTable getDefaultXSymbolTable() { 4988 return XSYMBOL_TABLE; 4989 } 4990 4991 /** 4992 * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing 4993 * of UnicodeSets. 4994 * <p> 4995 * WARNING: If this function is used with a UnicodeProperty, and the 4996 * Unassigned characters (gc=Cn) are different than in ICU, you MUST call 4997 * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable} 4998 * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}. 4999 * 5000 * @param xSymbolTable the new default symbol table. 5001 * @deprecated This API is ICU internal only. 
* @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
    // The properties may override inclusions, so the cached data is cleared
    // before the new table is installed.
    // TODO: Check if the Unicode Tools or Unicode Utilities really need this.
    CharacterPropertiesImpl.clear();
    XSYMBOL_TABLE = xSymbolTable;
}
}
//eof