1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.io.IOException; 12 import java.text.ParsePosition; 13 import java.util.ArrayList; 14 import java.util.Arrays; 15 import java.util.Collection; 16 import java.util.Collections; 17 import java.util.Iterator; 18 import java.util.NoSuchElementException; 19 import java.util.SortedSet; 20 import java.util.TreeSet; 21 22 import com.ibm.icu.impl.BMPSet; 23 import com.ibm.icu.impl.CharacterPropertiesImpl; 24 import com.ibm.icu.impl.PatternProps; 25 import com.ibm.icu.impl.RuleCharacterIterator; 26 import com.ibm.icu.impl.SortedSetRelation; 27 import com.ibm.icu.impl.StringRange; 28 import com.ibm.icu.impl.UCaseProps; 29 import com.ibm.icu.impl.UPropertyAliases; 30 import com.ibm.icu.impl.UnicodeSetStringSpan; 31 import com.ibm.icu.impl.Utility; 32 import com.ibm.icu.lang.CharSequences; 33 import com.ibm.icu.lang.CharacterProperties; 34 import com.ibm.icu.lang.UCharacter; 35 import com.ibm.icu.lang.UProperty; 36 import com.ibm.icu.lang.UScript; 37 import com.ibm.icu.util.Freezable; 38 import com.ibm.icu.util.ICUUncheckedIOException; 39 import com.ibm.icu.util.OutputInt; 40 import com.ibm.icu.util.ULocale; 41 import com.ibm.icu.util.VersionInfo; 42 43 /** 44 * A mutable set of Unicode characters and multicharacter strings. 45 * Objects of this class represent <em>character classes</em> used 46 * in regular expressions. A character specifies a subset of Unicode 47 * code points. Legal code points are U+0000 to U+10FFFF, inclusive. 
48 * 49 * Note: method freeze() will not only make the set immutable, but 50 * also makes important methods much higher performance: 51 * contains(c), containsNone(...), span(...), spanBack(...) etc. 52 * After the object is frozen, any subsequent call that wants to change 53 * the object will throw UnsupportedOperationException. 54 * 55 * <p>The UnicodeSet class is not designed to be subclassed. 56 * 57 * <p><code>UnicodeSet</code> supports two APIs. The first is the 58 * <em>operand</em> API that allows the caller to modify the value of 59 * a <code>UnicodeSet</code> object. It conforms to Java 2's 60 * <code>java.util.Set</code> interface, although 61 * <code>UnicodeSet</code> does not actually implement that 62 * interface. All methods of <code>Set</code> are supported, with the 63 * modification that they take a character range or single character 64 * instead of an <code>Object</code>, and they take a 65 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 66 * operand API may be thought of in terms of boolean logic: a boolean 67 * OR is implemented by <code>add</code>, a boolean AND is implemented 68 * by <code>retain</code>, a boolean XOR is implemented by 69 * <code>complement</code> taking an argument, and a boolean NOT is 70 * implemented by <code>complement</code> with no argument. In terms 71 * of traditional set theory function names, <code>add</code> is a 72 * union, <code>retain</code> is an intersection, <code>remove</code> 73 * is an asymmetric difference, and <code>complement</code> with no 74 * argument is a set complement with respect to the superset range 75 * <code>MIN_VALUE-MAX_VALUE</code> 76 * 77 * <p>The second API is the 78 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 79 * <code>java.text.Format</code>-derived classes. 
Unlike the 80 * methods that add characters, add categories, and control the logic 81 * of the set, the method <code>applyPattern()</code> sets all 82 * attributes of a <code>UnicodeSet</code> at once, based on a 83 * string pattern. 84 * 85 * <p><b>Pattern syntax</b></p> 86 * 87 * Patterns are accepted by the constructors and the 88 * <code>applyPattern()</code> methods and returned by the 89 * <code>toPattern()</code> method. These patterns follow a syntax 90 * similar to that employed by version 8 regular expression character 91 * classes. Here are some simple examples: 92 * 93 * <blockquote> 94 * <table> 95 * <tr style="vertical-align: top"> 96 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td> 97 * <td style="vertical-align: top;">No characters</td> 98 * </tr><tr style="vertical-align: top"> 99 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td> 100 * <td style="vertical-align: top;">The character 'a'</td> 101 * </tr><tr style="vertical-align: top"> 102 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td> 103 * <td style="vertical-align: top;">The characters 'a' and 'e'</td> 104 * </tr> 105 * <tr> 106 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td> 107 * <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code 108 * point order</td> 109 * </tr> 110 * <tr> 111 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td> 112 * <td style="vertical-align: top;">The character U+4E01</td> 113 * </tr> 114 * <tr> 115 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td> 116 * <td style="vertical-align: top;">The character 'a' and the multicharacter strings "ab" and 117 * "ac"</td> 118 * </tr> 119 * <tr> 120 * <td 
style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td>
 *       <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td>
 *     </tr>
 *   </table>
 * </blockquote>
 *
 * Any character may be preceded by a backslash in order to remove any special
 * meaning.  White space characters, as defined by the Unicode Pattern_White_Space property, are
 * ignored, unless they are escaped.
 *
 * <p>Property patterns specify a set of characters having a certain
 * property as defined by the Unicode standard.  Both the POSIX-like
 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
 * complete list of supported property patterns, see the User's Guide
 * for UnicodeSet at
 * <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset">
 * https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>.
 * Actual determination of property data is defined by the underlying
 * Unicode database as implemented by UCharacter.
 *
 * <p>Patterns specify individual characters, ranges of characters, and
 * Unicode property sets.  When elements are concatenated, they
 * specify their union.  To complement a set, place a '^' immediately
 * after the opening '['.  Property patterns are inverted by modifying
 * their delimiters; "[:^foo:]" and "\P{foo}".  In any other location,
 * '^' has no special meaning.
 *
 * <p>Since ICU 70, "[^...]", "[:^foo:]", "\P{foo}", and "[:binaryProperty=No:]"
 * perform a “code point complement” (all code points minus the original set),
 * removing all multicharacter strings,
 * equivalent to .{@link #complement()}.{@link #removeAllStrings()} .
 * The {@link #complement()} API function continues to perform a
 * symmetric difference with all code points and thus retains all multicharacter strings.
*
 * <p>Ranges are indicated by placing a '-' between two
 * characters, as in "a-z".  This specifies the range of all
 * characters from the left to the right, in Unicode order.  If the
 * left character is greater than or equal to the
 * right character it is a syntax error.  If a '-' occurs as the first
 * character after the opening '[' or '[^', or if it occurs as the
 * last character before the closing ']', then it is taken as a
 * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
 * set of three characters, 'a', 'b', and '-'.
 *
 * <p>Sets may be intersected using the '&amp;' operator or the asymmetric
 * set difference may be taken using the '-' operator, for example,
 * "[[:L:]&amp;[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
 * with values less than 4096.  Operators ('&amp;' and '-') have equal
 * precedence and bind left-to-right.  Thus
 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
 * difference; intersection is commutative.
172 * 173 * <table> 174 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a' 175 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a' 176 * through 'z' and all letters in between, in Unicode order 177 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing 178 * all characters but 'a' through 'z', 179 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 180 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 181 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 182 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 183 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 184 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 185 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 186 * <em>pat2</em> 187 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code> 188 * <td>The set of characters having the specified 189 * Unicode property; in 190 * this case, Unicode uppercase letters 191 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code> 192 * <td>The set of characters <em>not</em> having the given 193 * Unicode property 194 * </table> 195 * 196 * <p><b>Formal syntax</b></p> 197 * 198 * <blockquote> 199 * <table> 200 * <tr style="vertical-align: top"> 201 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern := </code></td> 202 * <td style="vertical-align: top;"><code>('[' '^'? 
item* ']') |
 *       property</code></td>
 *     </tr>
 *     <tr style="vertical-align: top">
 *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item :=&nbsp; </code></td>
 *       <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br>
 *       </code></td>
 *     </tr>
 *     <tr style="vertical-align: top">
 *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr :=&nbsp; </code></td>
 *       <td style="vertical-align: top;"><code>pattern | pattern-expr pattern |
 *       pattern-expr op pattern<br>
 *       </code></td>
 *     </tr>
 *     <tr style="vertical-align: top">
 *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op :=&nbsp; </code></td>
 *       <td style="vertical-align: top;"><code>'&amp;' | '-'<br>
 *       </code></td>
 *     </tr>
 *     <tr style="vertical-align: top">
 *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special :=&nbsp; </code></td>
 *       <td style="vertical-align: top;"><code>'[' | ']' | '-'<br>
 *       </code></td>
 *     </tr>
 *     <tr style="vertical-align: top">
 *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char :=&nbsp; </code></td>
 *       <td style="vertical-align: top;"><em>any character that is not</em><code> special<br>
 *       | ('\\' </code><em>any character</em><code>)<br>
 *       | ('&#92;u' hex hex hex hex)<br>
 *       </code></td>
 *     </tr>
 *     <tr style="vertical-align: top">
 *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex :=&nbsp; </code></td>
 *       <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
 *       &nbsp;&nbsp;&nbsp; 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
 *     </tr>
 *     <tr>
 *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property :=&nbsp; </code></td>
 *       <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td>
 *     </tr>
 *   </table>
*   <br>
 *   <table border="1">
 *     <tr>
 *       <td>Legend: <table>
 *         <tr>
 *           <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td>
 *           <td style="width: 20; vertical-align: top;">&nbsp;</td>
 *           <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td>
 *         </tr>
 *         <tr>
 *           <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td>
 *           <td style="vertical-align: top;"></td>
 *           <td style="vertical-align: top;">zero or one instance of <code>a</code><br>
 *           </td>
 *         </tr>
 *         <tr>
 *           <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td>
 *           <td style="vertical-align: top;"></td>
 *           <td style="vertical-align: top;">zero or more instances of <code>a</code><br>
 *           </td>
 *         </tr>
 *         <tr>
 *           <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td>
 *           <td style="vertical-align: top;"></td>
 *           <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br>
 *           </td>
 *         </tr>
 *         <tr>
 *           <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td>
 *           <td style="vertical-align: top;"></td>
 *           <td style="vertical-align: top;">the literal string between the quotes </td>
 *         </tr>
 *       </table>
 *       </td>
 *     </tr>
 *   </table>
 * </blockquote>
 * <p>To iterate over contents of UnicodeSet, the following are available:
 * <ul><li>{@link #ranges()} to iterate through the ranges</li>
 * <li>{@link #strings()} to iterate through the strings</li>
 * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
 * That method is, however, not particularly efficient, since it "boxes" each code point into a String.</li>
 * </ul>
 * All of the above can be used in <b>for</b> loops.
 * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
288 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 289 * 290 * @author Alan Liu 291 * @stable ICU 2.0 292 * @see UnicodeSetIterator 293 * @see UnicodeSetSpanner 294 */ 295 public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> { 296 private static final SortedSet<String> EMPTY_STRINGS = 297 Collections.unmodifiableSortedSet(new TreeSet<String>()); 298 299 /** 300 * Constant for the empty set. 301 * @stable ICU 4.8 302 */ 303 public static final UnicodeSet EMPTY = new UnicodeSet().freeze(); 304 /** 305 * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.) 306 * @stable ICU 4.8 307 */ 308 public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze(); 309 310 private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the the function processing 311 312 private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints 313 private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. 314 // 110000 for codepoints 315 316 /** 317 * Enough for sets with few ranges. 318 * For example, White_Space has 10 ranges, list length 21. 319 */ 320 private static final int INITIAL_CAPACITY = 25; 321 322 /** Max list [0, 1, 2, ..., max code point, HIGH] */ 323 private static final int MAX_LENGTH = HIGH + 1; 324 325 /** 326 * Minimum value that can be stored in a UnicodeSet. 327 * @stable ICU 2.0 328 */ 329 public static final int MIN_VALUE = LOW; 330 331 /** 332 * Maximum value that can be stored in a UnicodeSet. 
333 * @stable ICU 2.0 334 */ 335 public static final int MAX_VALUE = HIGH - 1; 336 337 private int len; // length used; list may be longer to minimize reallocs 338 private int[] list; // MUST be terminated with HIGH 339 private int[] rangeList; // internal buffer 340 private int[] buffer; // internal buffer 341 342 // is not private so that UnicodeSetIterator can get access 343 SortedSet<String> strings = EMPTY_STRINGS; 344 345 /** 346 * The pattern representation of this set. This may not be the 347 * most economical pattern. It is the pattern supplied to 348 * applyPattern(), with variables substituted and whitespace 349 * removed. For sets constructed without applyPattern(), or 350 * modified using the non-pattern API, this string will be null, 351 * indicating that toPattern() must generate a pattern 352 * representation from the inversion list. 353 */ 354 private String pat = null; 355 356 // Special property set IDs 357 private static final String ANY_ID = "ANY"; // [\u0000-\U0010FFFF] 358 private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F] 359 private static final String ASSIGNED = "Assigned"; // [:^Cn:] 360 361 private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. 362 private volatile UnicodeSetStringSpan stringSpan; 363 //---------------------------------------------------------------- 364 // Public API 365 //---------------------------------------------------------------- 366 367 /** 368 * Constructs an empty set. 369 * @stable ICU 2.0 370 */ UnicodeSet()371 public UnicodeSet() { 372 list = new int[INITIAL_CAPACITY]; 373 list[0] = HIGH; 374 len = 1; 375 } 376 377 /** 378 * Constructs a copy of an existing set. 379 * @stable ICU 2.0 380 */ UnicodeSet(UnicodeSet other)381 public UnicodeSet(UnicodeSet other) { 382 set(other); 383 } 384 385 /** 386 * Constructs a set containing the given range. If <code>end > 387 * start</code> then an empty set is created. 
388 * 389 * @param start first character, inclusive, of range 390 * @param end last character, inclusive, of range 391 * @stable ICU 2.0 392 */ UnicodeSet(int start, int end)393 public UnicodeSet(int start, int end) { 394 this(); 395 add(start, end); 396 } 397 398 /** 399 * Quickly constructs a set from a set of ranges <s0, e0, s1, e1, s2, e2, ..., sn, en>. 400 * There must be an even number of integers, and they must be all greater than zero, 401 * all less than or equal to Character.MAX_CODE_POINT. 402 * In each pair (..., si, ei, ...) it must be true that si <= ei 403 * Between adjacent pairs (...ei, sj...), it must be true that ei+1 < sj 404 * @param pairs pairs of character representing ranges 405 * @stable ICU 4.4 406 */ UnicodeSet(int... pairs)407 public UnicodeSet(int... pairs) { 408 if ((pairs.length & 1) != 0) { 409 throw new IllegalArgumentException("Must have even number of integers"); 410 } 411 list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set. 412 len = list.length; 413 int last = -1; // used to ensure that the results are monotonically increasing. 414 int i = 0; 415 while (i < pairs.length) { 416 int start = pairs[i]; 417 if (last >= start) { 418 throw new IllegalArgumentException("Must be monotonically increasing."); 419 } 420 list[i++] = start; 421 int limit = pairs[i] + 1; 422 if (start >= limit) { 423 throw new IllegalArgumentException("Must be monotonically increasing."); 424 } 425 list[i++] = last = limit; 426 } 427 list[i] = HIGH; // terminate 428 } 429 430 /** 431 * Constructs a set from the given pattern. See the class description 432 * for the syntax of the pattern language. Whitespace is ignored. 433 * @param pattern a string specifying what characters are in the set 434 * @exception java.lang.IllegalArgumentException if the pattern contains 435 * a syntax error. 
436 * @stable ICU 2.0 437 */ UnicodeSet(String pattern)438 public UnicodeSet(String pattern) { 439 this(); 440 applyPattern(pattern, null, null, IGNORE_SPACE); 441 } 442 443 /** 444 * Constructs a set from the given pattern. See the class description 445 * for the syntax of the pattern language. 446 * @param pattern a string specifying what characters are in the set 447 * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters 448 * @exception java.lang.IllegalArgumentException if the pattern contains 449 * a syntax error. 450 * @stable ICU 2.0 451 */ UnicodeSet(String pattern, boolean ignoreWhitespace)452 public UnicodeSet(String pattern, boolean ignoreWhitespace) { 453 this(); 454 applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 455 } 456 457 /** 458 * Constructs a set from the given pattern. See the class description 459 * for the syntax of the pattern language. 460 * @param pattern a string specifying what characters are in the set 461 * @param options a bitmask indicating which options to apply. 462 * Valid options are IGNORE_SPACE and CASE. 463 * @exception java.lang.IllegalArgumentException if the pattern contains 464 * a syntax error. 465 * @stable ICU 3.8 466 */ UnicodeSet(String pattern, int options)467 public UnicodeSet(String pattern, int options) { 468 this(); 469 applyPattern(pattern, null, null, options); 470 } 471 472 /** 473 * Constructs a set from the given pattern. See the class description 474 * for the syntax of the pattern language. 475 * @param pattern a string specifying what characters are in the set 476 * @param pos on input, the position in pattern at which to start parsing. 477 * On output, the position after the last character parsed. 478 * @param symbols a symbol table mapping variables to char[] arrays 479 * and chars to UnicodeSets 480 * @exception java.lang.IllegalArgumentException if the pattern 481 * contains a syntax error. 
482 * @stable ICU 2.0 483 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols)484 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) { 485 this(); 486 applyPattern(pattern, pos, symbols, IGNORE_SPACE); 487 } 488 489 /** 490 * Constructs a set from the given pattern. See the class description 491 * for the syntax of the pattern language. 492 * @param pattern a string specifying what characters are in the set 493 * @param pos on input, the position in pattern at which to start parsing. 494 * On output, the position after the last character parsed. 495 * @param symbols a symbol table mapping variables to char[] arrays 496 * and chars to UnicodeSets 497 * @param options a bitmask indicating which options to apply. 498 * Valid options are IGNORE_SPACE and CASE. 499 * @exception java.lang.IllegalArgumentException if the pattern 500 * contains a syntax error. 501 * @stable ICU 3.2 502 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options)503 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) { 504 this(); 505 applyPattern(pattern, pos, symbols, options); 506 } 507 508 509 /** 510 * Return a new set that is equivalent to this one. 511 * @stable ICU 2.0 512 */ 513 @Override clone()514 public Object clone() { 515 if (isFrozen()) { 516 return this; 517 } 518 return new UnicodeSet(this); 519 } 520 521 /** 522 * Make this object represent the range <code>start - end</code>. 523 * If <code>start > end</code> then this object is set to an empty range. 524 * 525 * @param start first character in the set, inclusive 526 * @param end last character in the set, inclusive 527 * @stable ICU 2.0 528 */ set(int start, int end)529 public UnicodeSet set(int start, int end) { 530 checkFrozen(); 531 clear(); 532 complement(start, end); 533 return this; 534 } 535 536 /** 537 * Make this object represent the same set as <code>other</code>. 
538 * @param other a <code>UnicodeSet</code> whose value will be 539 * copied to this object 540 * @stable ICU 2.0 541 */ set(UnicodeSet other)542 public UnicodeSet set(UnicodeSet other) { 543 checkFrozen(); 544 list = Arrays.copyOf(other.list, other.len); 545 len = other.len; 546 pat = other.pat; 547 if (other.hasStrings()) { 548 strings = new TreeSet<>(other.strings); 549 } else { 550 strings = EMPTY_STRINGS; 551 } 552 return this; 553 } 554 555 /** 556 * Modifies this set to represent the set specified by the given pattern. 557 * See the class description for the syntax of the pattern language. 558 * Whitespace is ignored. 559 * @param pattern a string specifying what characters are in the set 560 * @exception java.lang.IllegalArgumentException if the pattern 561 * contains a syntax error. 562 * @stable ICU 2.0 563 */ applyPattern(String pattern)564 public final UnicodeSet applyPattern(String pattern) { 565 checkFrozen(); 566 return applyPattern(pattern, null, null, IGNORE_SPACE); 567 } 568 569 /** 570 * Modifies this set to represent the set specified by the given pattern, 571 * optionally ignoring whitespace. 572 * See the class description for the syntax of the pattern language. 573 * @param pattern a string specifying what characters are in the set 574 * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored 575 * @exception java.lang.IllegalArgumentException if the pattern 576 * contains a syntax error. 577 * @stable ICU 2.0 578 */ applyPattern(String pattern, boolean ignoreWhitespace)579 public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) { 580 checkFrozen(); 581 return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 582 } 583 584 /** 585 * Modifies this set to represent the set specified by the given pattern, 586 * optionally ignoring whitespace. 587 * See the class description for the syntax of the pattern language. 
588 * @param pattern a string specifying what characters are in the set 589 * @param options a bitmask indicating which options to apply. 590 * Valid options are IGNORE_SPACE and CASE. 591 * @exception java.lang.IllegalArgumentException if the pattern 592 * contains a syntax error. 593 * @stable ICU 3.8 594 */ applyPattern(String pattern, int options)595 public UnicodeSet applyPattern(String pattern, int options) { 596 checkFrozen(); 597 return applyPattern(pattern, null, null, options); 598 } 599 600 /** 601 * Return true if the given position, in the given pattern, appears 602 * to be the start of a UnicodeSet pattern. 603 * @stable ICU 2.0 604 */ resemblesPattern(String pattern, int pos)605 public static boolean resemblesPattern(String pattern, int pos) { 606 return ((pos+1) < pattern.length() && 607 pattern.charAt(pos) == '[') || 608 resemblesPropertyPattern(pattern, pos); 609 } 610 611 /** 612 * TODO: create Appendable version of UTF16.append(buf, c), 613 * maybe in new class Appendables? 614 * @throws IOException 615 */ appendCodePoint(Appendable app, int c)616 private static void appendCodePoint(Appendable app, int c) { 617 assert 0 <= c && c <= 0x10ffff; 618 try { 619 if (c <= 0xffff) { 620 app.append((char) c); 621 } else { 622 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c)); 623 } 624 } catch (IOException e) { 625 throw new ICUUncheckedIOException(e); 626 } 627 } 628 629 /** 630 * TODO: create class Appendables? 631 * @throws IOException 632 */ append(Appendable app, CharSequence s)633 private static void append(Appendable app, CharSequence s) { 634 try { 635 app.append(s); 636 } catch (IOException e) { 637 throw new ICUUncheckedIOException(e); 638 } 639 } 640 641 /** 642 * Append the <code>toPattern()</code> representation of a 643 * string to the given <code>Appendable</code>. 
644 */ _appendToPat(T buf, String s, boolean escapeUnprintable)645 private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) { 646 int cp; 647 for (int i = 0; i < s.length(); i += Character.charCount(cp)) { 648 cp = s.codePointAt(i); 649 _appendToPat(buf, cp, escapeUnprintable); 650 } 651 return buf; 652 } 653 654 /** 655 * Append the <code>toPattern()</code> representation of a 656 * character to the given <code>Appendable</code>. 657 */ _appendToPat(T buf, int c, boolean escapeUnprintable)658 private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) { 659 try { 660 if (escapeUnprintable ? Utility.isUnprintable(c) : Utility.shouldAlwaysBeEscaped(c)) { 661 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything 662 // unprintable 663 return Utility.escape(buf, c); 664 } 665 // Okay to let ':' pass through 666 switch (c) { 667 case '[': // SET_OPEN: 668 case ']': // SET_CLOSE: 669 case '-': // HYPHEN: 670 case '^': // COMPLEMENT: 671 case '&': // INTERSECTION: 672 case '\\': //BACKSLASH: 673 case '{': 674 case '}': 675 case '$': 676 case ':': 677 buf.append('\\'); 678 break; 679 default: 680 // Escape whitespace 681 if (PatternProps.isWhiteSpace(c)) { 682 buf.append('\\'); 683 } 684 break; 685 } 686 appendCodePoint(buf, c); 687 return buf; 688 } catch (IOException e) { 689 throw new ICUUncheckedIOException(e); 690 } 691 } 692 _appendToPat( T result, int start, int end, boolean escapeUnprintable)693 private static <T extends Appendable> T _appendToPat( 694 T result, int start, int end, boolean escapeUnprintable) { 695 _appendToPat(result, start, escapeUnprintable); 696 if (start != end) { 697 if ((start+1) != end || 698 // Avoid writing what looks like a lead+trail surrogate pair. 
699 start == 0xdbff) { 700 try { 701 result.append('-'); 702 } catch (IOException e) { 703 throw new ICUUncheckedIOException(e); 704 } 705 } 706 _appendToPat(result, end, escapeUnprintable); 707 } 708 return result; 709 } 710 711 /** 712 * Returns a string representation of this set. If the result of 713 * calling this function is passed to a UnicodeSet constructor, it 714 * will produce another set that is equal to this one. 715 * @stable ICU 2.0 716 */ 717 @Override toPattern(boolean escapeUnprintable)718 public String toPattern(boolean escapeUnprintable) { 719 if (pat != null && !escapeUnprintable) { 720 return pat; 721 } 722 StringBuilder result = new StringBuilder(); 723 return _toPattern(result, escapeUnprintable).toString(); 724 } 725 726 /** 727 * Append a string representation of this set to result. This will be 728 * a cleaned version of the string passed to applyPattern(), if there 729 * is one. Otherwise it will be generated. 730 */ _toPattern(T result, boolean escapeUnprintable)731 private <T extends Appendable> T _toPattern(T result, 732 boolean escapeUnprintable) { 733 if (pat == null) { 734 return appendNewPattern(result, escapeUnprintable, true); 735 } 736 try { 737 if (!escapeUnprintable) { 738 // TODO: The C++ version does not have this shortcut, and instead 739 // always cleans up the pattern string, 740 // which also escapes Utility.shouldAlwaysBeEscaped(c). 741 // We should sync these implementations. 742 result.append(pat); 743 return result; 744 } 745 boolean oddNumberOfBackslashes = false; 746 for (int i=0; i<pat.length(); ) { 747 int c = pat.codePointAt(i); 748 i += Character.charCount(c); 749 if (Utility.isUnprintable(c)) { 750 // If the unprintable character is preceded by an odd 751 // number of backslashes, then it has been escaped 752 // and we omit the last backslash. 
753 Utility.escape(result, c); 754 oddNumberOfBackslashes = false; 755 } else if (!oddNumberOfBackslashes && c == '\\') { 756 // Temporarily withhold an odd-numbered backslash. 757 oddNumberOfBackslashes = true; 758 } else { 759 if (oddNumberOfBackslashes) { 760 result.append('\\'); 761 } 762 appendCodePoint(result, c); 763 oddNumberOfBackslashes = false; 764 } 765 } 766 if (oddNumberOfBackslashes) { 767 result.append('\\'); 768 } 769 return result; 770 } catch (IOException e) { 771 throw new ICUUncheckedIOException(e); 772 } 773 } 774 775 /** 776 * Generate and append a string representation of this set to result. 777 * This does not use this.pat, the cleaned up copy of the string 778 * passed to applyPattern(). 779 * 780 * @param result the buffer into which to generate the pattern 781 * @param escapeUnprintable escape unprintable characters if true 782 * @stable ICU 2.0 783 */ _generatePattern(StringBuffer result, boolean escapeUnprintable)784 public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) { 785 return _generatePattern(result, escapeUnprintable, true); 786 } 787 788 /** 789 * Generate and append a string representation of this set to result. 790 * This does not use this.pat, the cleaned up copy of the string 791 * passed to applyPattern(). 792 * 793 * @param result the buffer into which to generate the pattern 794 * @param escapeUnprintable escape unprintable characters if true 795 * @param includeStrings if false, doesn't include the strings. 796 * @stable ICU 3.8 797 */ _generatePattern(StringBuffer result, boolean escapeUnprintable, boolean includeStrings)798 public StringBuffer _generatePattern(StringBuffer result, 799 boolean escapeUnprintable, boolean includeStrings) { 800 return appendNewPattern(result, escapeUnprintable, includeStrings); 801 } 802 803 // Implementation of public _generatePattern(). 804 // Allows other callers to use a StringBuilder while the existing API is stuck with StringBuffer. 
/**
 * Writes a freshly generated pattern for this set to result: an opening
 * '[', an optional '^' when the inverse form is used, the code point
 * ranges, optionally each string wrapped in braces, and a closing ']'.
 *
 * @param result the sink that receives the pattern
 * @param escapeUnprintable passed through to the _appendToPat helpers
 * @param includeStrings if false, the strings of this set are not written
 * @return result, for chaining
 * @throws ICUUncheckedIOException wrapping any IOException raised by result
 */
private <T extends Appendable> T appendNewPattern(
        T result, boolean escapeUnprintable, boolean includeStrings) {
    try {
        result.append('[');

        int i = 0;
        int limit = len & ~1;  // = 2 * getRangeCount()

        // If the set contains at least 2 intervals and includes both
        // MIN_VALUE and MAX_VALUE, then the inverse representation will
        // be more economical.
        //     if (getRangeCount() >= 2 &&
        //             getRangeStart(0) == MIN_VALUE &&
        //             getRangeEnd(last) == MAX_VALUE)
        // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
        // If limit == len then len is even and the last range ends with MAX_VALUE.
        //
        // *But* do not write the inverse (complement) if there are strings.
        // Since ICU 70, the '^' performs a code point complement which removes all strings.
        if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
            // Emit the inverse
            result.append('^');
            // Offsetting the inversion list index by one lets us
            // iterate over the ranges of the set complement.
            i = 1;
            --limit;
        }

        // Emit the ranges as pairs.
        while (i < limit) {
            int start = list[i];  // getRangeStart()
            int end = list[i + 1] - 1;  // getRangeEnd() = range limit minus one
            if (!(0xd800 <= end && end <= 0xdbff)) {
                _appendToPat(result, start, end, escapeUnprintable);
                i += 2;
            } else {
                // The range ends with a lead surrogate.
                // Avoid writing what looks like a lead+trail surrogate pair.
                // 1. Postpone ranges that start with a lead surrogate code point.
                int firstLead = i;
                // Empty body on purpose: the loop only advances i past the postponed ranges.
                while ((i += 2) < limit && list[i] <= 0xdbff) {}
                int firstAfterLead = i;
                // 2. Write following ranges that start with a trail surrogate code point.
                while (i < limit && (start = list[i]) <= 0xdfff) {
                    _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
                    i += 2;
                }
                // 3. Now write the postponed ranges.
                for (int j = firstLead; j < firstAfterLead; j += 2) {
                    _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
                }
            }
        }

        if (includeStrings && hasStrings()) {
            for (String s : strings) {
                result.append('{');
                _appendToPat(result, s, escapeUnprintable);
                result.append('}');
            }
        }
        result.append(']');
        return result;
    } catch (IOException e) {
        throw new ICUUncheckedIOException(e);
    }
}

/**
 * Returns the number of elements in this set (its cardinality).
 * Note that the elements of a set may include both individual
 * codepoints and strings.
 *
 * <p>The count is computed on every call by summing the length of each
 * range and adding the number of strings.
 *
 * @return the number of elements in this set (its cardinality).
 * @stable ICU 2.0
 */
public int size() {
    int n = 0;
    int count = getRangeCount();
    for (int i = 0; i < count; ++i) {
        n += getRangeEnd(i) - getRangeStart(i) + 1;
    }
    return n + strings.size();
}

/**
 * Returns <tt>true</tt> if this set contains no elements.
 *
 * @return <tt>true</tt> if this set contains no elements.
 * @stable ICU 2.0
 */
public boolean isEmpty() {
    // len == 1 presumably means the list holds only the terminating HIGH entry,
    // i.e. no code point ranges -- confirm against the list invariant.
    return len == 1 && !hasStrings();
}

/**
 * @return true if this set contains multi-character strings or the empty string.
 * @stable ICU 70
 */
public boolean hasStrings() {
    return !strings.isEmpty();
}

/**
 * Implementation of UnicodeMatcher API.  Returns <tt>true</tt> if
 * this set contains any character whose low byte is the given
 * value.  This is used by <tt>RuleBasedTransliterator</tt> for
 * indexing.
 *
 * @param v the index value to test, in the range [0,255]
 * @return true if some range of this set, or the first code point of
 * some non-empty string of this set, has v as its low byte
 * @stable ICU 2.0
 */
@Override
public boolean matchesIndexValue(int v) {
    /* The index value v, in the range [0,255], is contained in this set if
     * it is contained in any pair of this set.  Pairs either have the high
     * bytes equal, or unequal.  If the high bytes are equal, then we have
     * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
     * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
     * Then v is contained if xx <= v || v <= yy.  (This is identical to the
     * time zone month containment logic.)
     */
    for (int i=0; i<getRangeCount(); ++i) {
        int low = getRangeStart(i);
        int high = getRangeEnd(i);
        if ((low & ~0xFF) == (high & ~0xFF)) {
            if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
                return true;
            }
        } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
            return true;
        }
    }
    if (hasStrings()) {
        for (String s : strings) {
            if (s.isEmpty()) {
                continue;  // skip the empty string
            }
            // Only the first code point of each string is compared.
            int c = UTF16.charAt(s, 0);
            if ((c & 0xFF) == v) {
                return true;
            }
        }
    }
    return false;
}

/**
 * Implementation of UnicodeMatcher.matches().  Always matches the
 * longest possible multichar string.
 *
 * @param text the text to match against
 * @param offset single-element in/out array; on a full string match
 * offset[0] is moved by the matched length (forward or backward)
 * @param limit the limit offset; the direction is forward when
 * offset[0] &lt; limit and backward otherwise
 * @param incremental if true, a string that matches only up to limit
 * yields U_PARTIAL_MATCH
 * @return U_MATCH, U_PARTIAL_MATCH or U_MISMATCH; when no string
 * matches, the result of the superclass implementation
 * @stable ICU 2.0
 */
@Override
public int matches(Replaceable text,
        int[] offset,
        int limit,
        boolean incremental) {

    if (offset[0] == limit) {
        // Nothing left to match: the outcome depends only on whether ETHER is in the set.
        if (contains(UnicodeMatcher.ETHER)) {
            return incremental ? U_PARTIAL_MATCH : U_MATCH;
        } else {
            return U_MISMATCH;
        }
    } else {
        if (hasStrings()) { // try strings first

            // might separate forward and backward loops later
            // for now they are combined

            // TODO Improve efficiency of this, at least in the forward
            // direction, if not in both.  In the forward direction we
            // can assume the strings are sorted.

            boolean forward = offset[0] < limit;

            // firstChar is the leftmost char to match in the
            // forward direction or the rightmost char to match in
            // the reverse direction.
            char firstChar = text.charAt(offset[0]);

            // If there are multiple strings that can match we
            // return the longest match.
            int highWaterLength = 0;

            for (String trial : strings) {
                if (trial.isEmpty()) {
                    continue;  // skip the empty string
                }

                char c = trial.charAt(forward ? 0 : trial.length() - 1);

                // Strings are sorted, so we can optimize in the
                // forward direction.
                if (forward && c > firstChar) break;
                if (c != firstChar) continue;

                int length = matchRest(text, offset[0], limit, trial);

                if (incremental) {
                    int maxLen = forward ? limit-offset[0] : offset[0]-limit;
                    if (length == maxLen) {
                        // We have successfully matched but only up to limit.
                        return U_PARTIAL_MATCH;
                    }
                }

                if (length == trial.length()) {
                    // We have successfully matched the whole string.
                    if (length > highWaterLength) {
                        highWaterLength = length;
                    }
                    // In the forward direction we know strings
                    // are sorted so we can bail early.
                    if (forward && length < highWaterLength) {
                        break;
                    }
                    continue;
                }
            }

            // We've checked all strings without a partial match.
            // If we have full matches, return the longest one.
            if (highWaterLength != 0) {
                offset[0] += forward ? highWaterLength : -highWaterLength;
                return U_MATCH;
            }
        }
        // No string matched: fall back to the superclass implementation.
        return super.matches(text, offset, limit, incremental);
    }
}

/**
 * Returns the longest match for s in text at the given position.
 * If limit > start then match forward from start+1 to limit
 * matching all characters except s.charAt(0).  If limit < start,
 * go backward starting from start-1 matching all characters
 * except s.charAt(s.length()-1).  This method assumes that the
 * first character, text.charAt(start), matches s, so it does not
 * check it.
 * @param text the text to match
 * @param start the first character to match.
In the forward 1045 * direction, text.charAt(start) is matched against s.charAt(0). 1046 * In the reverse direction, it is matched against 1047 * s.charAt(s.length()-1). 1048 * @param limit the limit offset for matching, either last+1 in 1049 * the forward direction, or last-1 in the reverse direction, 1050 * where last is the index of the last character to match. 1051 * @return If part of s matches up to the limit, return |limit - 1052 * start|. If all of s matches before reaching the limit, return 1053 * s.length(). If there is a mismatch between s and text, return 1054 * 0 1055 */ matchRest(Replaceable text, int start, int limit, String s)1056 private static int matchRest (Replaceable text, int start, int limit, String s) { 1057 int maxLen; 1058 int slen = s.length(); 1059 if (start < limit) { 1060 maxLen = limit - start; 1061 if (maxLen > slen) maxLen = slen; 1062 for (int i = 1; i < maxLen; ++i) { 1063 if (text.charAt(start + i) != s.charAt(i)) return 0; 1064 } 1065 } else { 1066 maxLen = start - limit; 1067 if (maxLen > slen) maxLen = slen; 1068 --slen; // <=> slen = s.length() - 1; 1069 for (int i = 1; i < maxLen; ++i) { 1070 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 1071 } 1072 } 1073 return maxLen; 1074 } 1075 1076 /** 1077 * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1. 1078 * @internal 1079 * @deprecated This API is ICU internal only. 
1080 */ 1081 @Deprecated matchesAt(CharSequence text, int offset)1082 public int matchesAt(CharSequence text, int offset) { 1083 int lastLen = -1; 1084 strings: 1085 if (hasStrings()) { 1086 char firstChar = text.charAt(offset); 1087 String trial = null; 1088 // find the first string starting with firstChar 1089 Iterator<String> it = strings.iterator(); 1090 while (it.hasNext()) { 1091 trial = it.next(); 1092 char firstStringChar = trial.charAt(0); 1093 if (firstStringChar < firstChar) continue; 1094 if (firstStringChar > firstChar) break strings; 1095 } 1096 1097 // now keep checking string until we get the longest one 1098 for (;;) { 1099 int tempLen = matchesAt(text, offset, trial); 1100 if (lastLen > tempLen) break strings; 1101 lastLen = tempLen; 1102 if (!it.hasNext()) break; 1103 trial = it.next(); 1104 } 1105 } 1106 1107 if (lastLen < 2) { 1108 int cp = UTF16.charAt(text, offset); 1109 if (contains(cp)) lastLen = UTF16.getCharCount(cp); 1110 } 1111 1112 return offset+lastLen; 1113 } 1114 1115 /** 1116 * Does one string contain another, starting at a specific offset? 1117 * @param text text to match 1118 * @param offsetInText offset within that text 1119 * @param substring substring to match at offset in text 1120 * @return -1 if match fails, otherwise other.length() 1121 */ 1122 // Note: This method was moved from CollectionUtilities matchesAt(CharSequence text, int offsetInText, CharSequence substring)1123 private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) { 1124 int len = substring.length(); 1125 int textLength = text.length(); 1126 if (textLength + offsetInText > len) { 1127 return -1; 1128 } 1129 int i = 0; 1130 for (int j = offsetInText; i < len; ++i, ++j) { 1131 char pc = substring.charAt(i); 1132 char tc = text.charAt(j); 1133 if (pc != tc) return -1; 1134 } 1135 return i; 1136 } 1137 1138 /** 1139 * Implementation of UnicodeMatcher API. 
Union the set of all
 * characters that may be matched by this object into the given
 * set.
 * @param toUnionTo the set into which to union the source characters
 * @stable ICU 2.2
 */
@Override
public void addMatchSetTo(UnicodeSet toUnionTo) {
    toUnionTo.addAll(this);
}

/**
 * Returns the index of the given character within this set, where
 * the set is ordered by ascending code point.  If the character
 * is not in this set, return -1.  The inverse of this method is
 * <code>charAt()</code>.
 * @param c the code point to look up
 * @return an index from 0..size()-1, or -1
 * @throws IllegalArgumentException if c is outside MIN_VALUE..MAX_VALUE
 * @stable ICU 2.0
 */
public int indexOf(int c) {
    if (c < MIN_VALUE || c > MAX_VALUE) {
        throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
    }
    int i = 0;
    int n = 0;  // number of code points in the ranges already passed
    // No explicit bound: the loop presumably relies on the list ending with a
    // HIGH entry that is greater than every valid c -- confirm the invariant.
    for (;;) {
        int start = list[i++];
        if (c < start) {
            return -1;
        }
        int limit = list[i++];
        if (c < limit) {
            return n + c - start;
        }
        n += limit - start;
    }
}

/**
 * Returns the character at the given index within this set, where
 * the set is ordered by ascending code point.  If the index is
 * out of range, return -1.  The inverse of this method is
 * <code>indexOf()</code>.
 * @param index an index from 0..size()-1
 * @return the character at the given index, or -1.
 * @stable ICU 2.0
 */
public int charAt(int index) {
    if (index >= 0) {
        // len2 is the largest even integer <= len, that is, it is len
        // for even values and len-1 for odd values.  With odd values
        // the last entry is UNICODESET_HIGH.
        int len2 = len & ~1;
        for (int i=0; i < len2;) {
            int start = list[i++];
            int count = list[i++] - start;
            if (index < count) {
                return start + index;
            }
            // Not in this range: make index relative to the next one.
            index -= count;
        }
    }
    return -1;
}

/**
 * Adds the specified range to this set if it is not already
 * present.  If this set already contains the specified range,
 * the call leaves this set unchanged.  If <code>start > end</code>
 * then an empty range is added, leaving the set unchanged.
 *
 * @param start first character, inclusive, of range to be added
 * to this set.
 * @param end last character, inclusive, of range to be added
 * to this set.
 * @return this object, for chaining
 * @stable ICU 2.0
 */
public UnicodeSet add(int start, int end) {
    checkFrozen();
    return add_unchecked(start, end);
}

/**
 * Adds all characters in range (uses preferred naming convention).
 * @param start The index of where to start on adding all characters.
 * @param end The index of where to end on adding all characters.
 * @return a reference to this object
 * @stable ICU 4.4
 */
public UnicodeSet addAll(int start, int end) {
    checkFrozen();
    return add_unchecked(start, end);
}

// for internal use, after checkFrozen has been called
// Validates both end points, then adds start..end. A range that lies at or
// after the end of the last range is appended or merged in place; any other
// proper range goes through the general add(range(...)) path; start == end is
// delegated to add(int); start > end changes nothing.
private UnicodeSet add_unchecked(int start, int end) {
    if (start < MIN_VALUE || start > MAX_VALUE) {
        throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
    }
    if (end < MIN_VALUE || end > MAX_VALUE) {
        throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
    }
    if (start < end) {
        int limit = end + 1;
        // Fast path for adding a new range after the last one.
        // Odd list length: [..., lastStart, lastLimit, HIGH]
        if ((len & 1) != 0) {
            // If the list is empty, set lastLimit low enough to not be adjacent to 0.
            int lastLimit = len == 1 ? -2 : list[len - 2];
            if (lastLimit <= start) {
                // NOTE(review): the comment on this method says checkFrozen has
                // already been called, so this call looks redundant -- confirm.
                checkFrozen();
                if (lastLimit == start) {
                    // Extend the last range.
                    list[len - 2] = limit;
                    if (limit == HIGH) {
                        // The extended range now reaches HIGH itself, so the
                        // separate trailing HIGH entry is dropped.
                        --len;
                    }
                } else {
                    // Overwrite the trailing HIGH with the new range start,
                    // then re-append the limit and/or HIGH.
                    list[len - 1] = start;
                    if (limit < HIGH) {
                        ensureCapacity(len + 2);
                        list[len++] = limit;
                        list[len++] = HIGH;
                    } else {  // limit == HIGH
                        ensureCapacity(len + 1);
                        list[len++] = HIGH;
                    }
                }
                pat = null;
                return this;
            }
        }
        // This is slow. Could be much faster using findCodePoint(start)
        // and modifying the list, dealing with adjacent & overlapping ranges.
        add(range(start, end), 2, 0);
    } else if (start == end) {
        add(start);
    }
    return this;
}

//  /**
//   * Format out the inversion list as a string, for debugging.  Uncomment when
//   * needed.
//   */
//  public final String dump() {
//      StringBuffer buf = new StringBuffer("[");
//      for (int i=0; i<len; ++i) {
//          if (i != 0) buf.append(", ");
//          int c = list[i];
//          //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
//          //    buf.append((char) c);
//          //} else {
//              buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
//          //}
//      }
//      buf.append("]");
//      return buf.toString();
//  }

/**
 * Adds the specified character to this set if it is not already
 * present.  If this set already contains the specified character,
 * the call leaves this set unchanged.
* @stable ICU 2.0
 */
public final UnicodeSet add(int c) {
    checkFrozen();
    return add_unchecked(c);
}

// for internal use only, after checkFrozen has been called
// Inserts the single code point c into the inversion list. Depending on where
// c falls it extends the following range downward, extends the preceding
// range upward, joins the two, or inserts a new one-element range.
private final UnicodeSet add_unchecked(int c) {
    if (c < MIN_VALUE || c > MAX_VALUE) {
        throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
    }

    // find smallest i such that c < list[i]
    // if odd, then it is IN the set
    // if even, then it is OUT of the set
    int i = findCodePoint(c);

    // already in set?
    if ((i & 1) != 0) return this;

    // HIGH is 0x110000
    // assert(list[len-1] == HIGH);

    // empty = [HIGH]
    // [start_0, limit_0, start_1, limit_1, HIGH]

    // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
    //                             ^
    //                             list[i]

    // i == 0 means c is before the first range
    // TODO: Is the "list[i]-1" a typo? Even if you pass MAX_VALUE into
    //      add_unchecked, the maximum value that "c" will be compared to
    //      is "MAX_VALUE-1" meaning that "if (c == MAX_VALUE)" will
    //      never be reached according to this logic.
    // NOTE(review): the TODO above looks mistaken. When list[i] is the HIGH
    // entry (0x110000 per the comment above), list[i]-1 is 0x10FFFF, so if
    // MAX_VALUE is 0x10FFFF the c == MAX_VALUE branch is reachable -- confirm.
    if (c == list[i]-1) {
        // c is before start of next range
        list[i] = c;
        // if we touched the HIGH mark, then add a new one
        if (c == MAX_VALUE) {
            ensureCapacity(len+1);
            list[len++] = HIGH;
        }
        if (i > 0 && c == list[i-1]) {
            // collapse adjacent ranges

            // [..., start_k-1, c, c, limit_k, ..., HIGH]
            //                     ^
            //                     list[i]
            // Shift everything after the duplicated pair down by two slots.
            System.arraycopy(list, i+1, list, i-1, len-i-1);
            len -= 2;
        }
    }

    else if (i > 0 && c == list[i-1]) {
        // c is after end of prior range
        list[i-1]++;
        // no need to check for collapse here
    }

    else {
        // At this point we know the new char is not adjacent to
        // any existing ranges, and it is not 10FFFF.


        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
        //                             ^
        //                             list[i]

        // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
        //                             ^
        //                             list[i]

        // Don't use ensureCapacity() to save on copying.
        // NOTE: This has no measurable impact on performance,
        // but it might help in some usage patterns.
        if (len+2 > list.length) {
            // Grow and open the two-slot gap at i in a single pass.
            int[] temp = new int[nextCapacity(len + 2)];
            if (i != 0) System.arraycopy(list, 0, temp, 0, i);
            System.arraycopy(list, i, temp, i+2, len-i);
            list = temp;
        } else {
            System.arraycopy(list, i, list, i+2, len-i);
        }

        list[i] = c;
        list[i+1] = c+1;
        len += 2;
    }

    // The cached pattern no longer describes this set.
    pat = null;
    return this;
}

/**
 * Adds the specified multicharacter to this set if it is not already
 * present.  If this set already contains the multicharacter,
 * the call leaves this set unchanged.
 * Thus "ch" =&gt; {"ch"}
 *
 * <p>A sequence that consists of exactly one code point is added as that
 * code point rather than as a string.
 *
 * @param s the source string
 * @return this object, for chaining
 * @stable ICU 2.0
 */
public final UnicodeSet add(CharSequence s) {
    checkFrozen();
    int cp = getSingleCP(s);
    if (cp < 0) {
        // Not a single code point: store it as a string.
        String str = s.toString();
        if (!strings.contains(str)) {
            addString(str);
            pat = null;
        }
    } else {
        add_unchecked(cp, cp);
    }
    return this;
}

// Adds s to the string collection, first replacing the shared EMPTY_STRINGS
// placeholder with a TreeSet owned by this set.
private void addString(CharSequence s) {
    if (strings == EMPTY_STRINGS) {
        strings = new TreeSet<>();
    }
    strings.add(s.toString());
}

/**
 * Utility for getting code point from single code point CharSequence.
 * See the public UTF16.getSingleCodePoint() (which returns -1 for null rather than throwing NPE).
 *
 * @return a code point IF the string consists of a single one.
 * otherwise returns -1.
1436 * @param s to test 1437 */ getSingleCP(CharSequence s)1438 private static int getSingleCP(CharSequence s) { 1439 if (s.length() == 1) return s.charAt(0); 1440 if (s.length() == 2) { 1441 int cp = Character.codePointAt(s, 0); 1442 if (cp > 0xFFFF) { // is surrogate pair 1443 return cp; 1444 } 1445 } 1446 return -1; 1447 } 1448 1449 /** 1450 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1451 * If this set already any particular character, it has no effect on that character. 1452 * @param s the source string 1453 * @return this object, for chaining 1454 * @stable ICU 2.0 1455 */ addAll(CharSequence s)1456 public final UnicodeSet addAll(CharSequence s) { 1457 checkFrozen(); 1458 int cp; 1459 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1460 cp = UTF16.charAt(s, i); 1461 add_unchecked(cp, cp); 1462 } 1463 return this; 1464 } 1465 1466 /** 1467 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1468 * If this set already any particular character, it has no effect on that character. 1469 * @param s the source string 1470 * @return this object, for chaining 1471 * @stable ICU 2.0 1472 */ retainAll(CharSequence s)1473 public final UnicodeSet retainAll(CharSequence s) { 1474 return retainAll(fromAll(s)); 1475 } 1476 1477 /** 1478 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1479 * If this set already any particular character, it has no effect on that character. 1480 * @param s the source string 1481 * @return this object, for chaining 1482 * @stable ICU 2.0 1483 */ complementAll(CharSequence s)1484 public final UnicodeSet complementAll(CharSequence s) { 1485 return complementAll(fromAll(s)); 1486 } 1487 1488 /** 1489 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1490 * If this set already any particular character, it has no effect on that character. 
1491 * @param s the source string 1492 * @return this object, for chaining 1493 * @stable ICU 2.0 1494 */ removeAll(CharSequence s)1495 public final UnicodeSet removeAll(CharSequence s) { 1496 return removeAll(fromAll(s)); 1497 } 1498 1499 /** 1500 * Remove all strings from this UnicodeSet 1501 * @return this object, for chaining 1502 * @stable ICU 4.2 1503 */ removeAllStrings()1504 public final UnicodeSet removeAllStrings() { 1505 checkFrozen(); 1506 if (hasStrings()) { 1507 strings.clear(); 1508 pat = null; 1509 } 1510 return this; 1511 } 1512 1513 /** 1514 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1515 * 1516 * @param s the source string 1517 * @return a newly created set containing the given string 1518 * @stable ICU 2.0 1519 */ from(CharSequence s)1520 public static UnicodeSet from(CharSequence s) { 1521 return new UnicodeSet().add(s); 1522 } 1523 1524 1525 /** 1526 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1527 * @param s the source string 1528 * @return a newly created set containing the given characters 1529 * @stable ICU 2.0 1530 */ fromAll(CharSequence s)1531 public static UnicodeSet fromAll(CharSequence s) { 1532 return new UnicodeSet().addAll(s); 1533 } 1534 1535 1536 /** 1537 * Retain only the elements in this set that are contained in the 1538 * specified range. If <code>start > end</code> then an empty range is 1539 * retained, leaving the set empty. 
1540 * 1541 * @param start first character, inclusive, of range 1542 * @param end last character, inclusive, of range 1543 * @stable ICU 2.0 1544 */ retain(int start, int end)1545 public UnicodeSet retain(int start, int end) { 1546 checkFrozen(); 1547 if (start < MIN_VALUE || start > MAX_VALUE) { 1548 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1549 } 1550 if (end < MIN_VALUE || end > MAX_VALUE) { 1551 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1552 } 1553 if (start <= end) { 1554 retain(range(start, end), 2, 0); 1555 } else { 1556 clear(); 1557 } 1558 return this; 1559 } 1560 1561 /** 1562 * Retain the specified character from this set if it is present. 1563 * Upon return this set will be empty if it did not contain c, or 1564 * will only contain c if it did contain c. 1565 * @param c the character to be retained 1566 * @return this object, for chaining 1567 * @stable ICU 2.0 1568 */ retain(int c)1569 public final UnicodeSet retain(int c) { 1570 return retain(c, c); 1571 } 1572 1573 /** 1574 * Retain the specified string in this set if it is present. 1575 * Upon return this set will be empty if it did not contain s, or 1576 * will only contain s if it did contain s. 1577 * @param cs the string to be retained 1578 * @return this object, for chaining 1579 * @stable ICU 2.0 1580 */ retain(CharSequence cs)1581 public final UnicodeSet retain(CharSequence cs) { 1582 int cp = getSingleCP(cs); 1583 if (cp < 0) { 1584 checkFrozen(); 1585 String s = cs.toString(); 1586 boolean isIn = strings.contains(s); 1587 // Check for getRangeCount() first to avoid somewhat-expensive size() 1588 // when there are single code points. 
1589 if (isIn && getRangeCount() == 0 && size() == 1) { 1590 return this; 1591 } 1592 clear(); 1593 if (isIn) { 1594 addString(s); 1595 } 1596 pat = null; 1597 } else { 1598 retain(cp, cp); 1599 } 1600 return this; 1601 } 1602 1603 /** 1604 * Removes the specified range from this set if it is present. 1605 * The set will not contain the specified range once the call 1606 * returns. If <code>start > end</code> then an empty range is 1607 * removed, leaving the set unchanged. 1608 * 1609 * @param start first character, inclusive, of range to be removed 1610 * from this set. 1611 * @param end last character, inclusive, of range to be removed 1612 * from this set. 1613 * @stable ICU 2.0 1614 */ remove(int start, int end)1615 public UnicodeSet remove(int start, int end) { 1616 checkFrozen(); 1617 if (start < MIN_VALUE || start > MAX_VALUE) { 1618 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1619 } 1620 if (end < MIN_VALUE || end > MAX_VALUE) { 1621 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1622 } 1623 if (start <= end) { 1624 retain(range(start, end), 2, 2); 1625 } 1626 return this; 1627 } 1628 1629 /** 1630 * Removes the specified character from this set if it is present. 1631 * The set will not contain the specified character once the call 1632 * returns. 1633 * @param c the character to be removed 1634 * @return this object, for chaining 1635 * @stable ICU 2.0 1636 */ remove(int c)1637 public final UnicodeSet remove(int c) { 1638 return remove(c, c); 1639 } 1640 1641 /** 1642 * Removes the specified string from this set if it is present. 1643 * The set will not contain the specified string once the call 1644 * returns. 
1645 * @param s the string to be removed 1646 * @return this object, for chaining 1647 * @stable ICU 2.0 1648 */ remove(CharSequence s)1649 public final UnicodeSet remove(CharSequence s) { 1650 int cp = getSingleCP(s); 1651 if (cp < 0) { 1652 checkFrozen(); 1653 String str = s.toString(); 1654 if (strings.contains(str)) { 1655 strings.remove(str); 1656 pat = null; 1657 } 1658 } else { 1659 remove(cp, cp); 1660 } 1661 return this; 1662 } 1663 1664 /** 1665 * Complements the specified range in this set. Any character in 1666 * the range will be removed if it is in this set, or will be 1667 * added if it is not in this set. If <code>start > end</code> 1668 * then an empty range is complemented, leaving the set unchanged. 1669 * 1670 * @param start first character, inclusive, of range 1671 * @param end last character, inclusive, of range 1672 * @stable ICU 2.0 1673 */ complement(int start, int end)1674 public UnicodeSet complement(int start, int end) { 1675 checkFrozen(); 1676 if (start < MIN_VALUE || start > MAX_VALUE) { 1677 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1678 } 1679 if (end < MIN_VALUE || end > MAX_VALUE) { 1680 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1681 } 1682 if (start <= end) { 1683 xor(range(start, end), 2, 0); 1684 } 1685 pat = null; 1686 return this; 1687 } 1688 1689 /** 1690 * Complements the specified character in this set. The character 1691 * will be removed if it is in this set, or will be added if it is 1692 * not in this set. 1693 * @stable ICU 2.0 1694 */ complement(int c)1695 public final UnicodeSet complement(int c) { 1696 return complement(c, c); 1697 } 1698 1699 /** 1700 * This is equivalent to 1701 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1702 * 1703 * <p><strong>Note:</strong> This performs a symmetric difference with all code points 1704 * <em>and thus retains all multicharacter strings</em>. 
* In order to achieve a “code point complement” (all code points minus this set),
 * the easiest is to .{@link #complement()}.{@link #removeAllStrings()} .
 *
 * @stable ICU 2.0
 */
public UnicodeSet complement() {
    checkFrozen();
    // Toggling a leading LOW entry flips the in/out state of every code
    // point: drop it when it is there, insert it when it is not.
    if (list[0] == LOW) {
        System.arraycopy(list, 1, list, 0, len-1);
        --len;
    } else {
        ensureCapacity(len+1);
        System.arraycopy(list, 0, list, 1, len);
        list[0] = LOW;
        ++len;
    }
    pat = null;
    return this;
}

/**
 * Complement the specified string in this set.
 * The string is removed if this set contains it and added if it
 * does not.  A sequence that consists of exactly one code point is
 * complemented as that code point.
 *
 * @param s the string to complement
 * @return this object, for chaining
 * @stable ICU 2.0
 */
public final UnicodeSet complement(CharSequence s) {
    checkFrozen();
    int cp = getSingleCP(s);
    if (cp < 0) {
        String s2 = s.toString();
        if (strings.contains(s2)) {
            strings.remove(s2);
        } else {
            addString(s2);
        }
        pat = null;
    } else {
        complement(cp, cp);
    }
    return this;
}

/**
 * Returns true if this set contains the given character.
 * @param c character to be checked for containment
 * @return true if the test condition is met
 * @throws IllegalArgumentException if c is outside MIN_VALUE..MAX_VALUE
 * @stable ICU 2.0
 */
@Override
public boolean contains(int c) {
    if (c < MIN_VALUE || c > MAX_VALUE) {
        throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
    }
    // bmpSet and stringSpan, when non-null, answer the query directly;
    // presumably they are the structures built by freeze() -- confirm.
    if (bmpSet != null) {
        return bmpSet.contains(c);
    }
    if (stringSpan != null) {
        return stringSpan.contains(c);
    }

    /*
    // Set i to the index of the start item greater than ch
    // We know we will terminate without length test!
    int i = -1;
    while (true) {
        if (c < list[++i]) break;
    }
     */

    int i = findCodePoint(c);

    return ((i & 1) != 0); // return true if odd
}

/**
 * Returns the smallest value i such that c < list[i].  Caller
 * must ensure that c is a legal value or this method will enter
 * an infinite loop.  This method performs a binary search.
 * @param c a character in the range MIN_VALUE..MAX_VALUE
 * inclusive
 * @return the smallest integer i in the range 0..len-1,
 * inclusive, such that c < list[i]
 */
private final int findCodePoint(int c) {
    /* Examples:
                                       findCodePoint(c)
       set              list[]         c=0 1 3 4 7 8
       ===              ==============   ===========
       []               [110000]         0 0 0 0 0 0
       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
       [:all:]          [0, 110000]      1 1 1 1 1 1
     */

    // Return the smallest i such that c < list[i].  Assume
    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
    if (c < list[0]) return 0;
    // High runner test.  c is often after the last range, so an
    // initial check for this condition pays off.
    if (len >= 2 && c >= list[len-2]) return len-1;
    int lo = 0;
    int hi = len - 1;
    // invariant: c >= list[lo]
    // invariant: c < list[hi]
    for (;;) {
        int i = (lo + hi) >>> 1;
        // lo and hi are adjacent once the midpoint equals lo, so hi is the answer.
        if (i == lo) return hi;
        if (c < list[i]) {
            hi = i;
        } else {
            lo = i;
        }
    }
}

//    //----------------------------------------------------------------
//    // Unrolled binary search
//    //----------------------------------------------------------------
//
//    private int validLen = -1; // validated value of len
//    private int topOfLow;
//    private int topOfHigh;
//    private int power;
//    private int deltaStart;
//
//    private void validate() {
//        if (len <= 1) {
//            throw new IllegalArgumentException("list.len==" + len + "; must be >1");
//        }
//
//        // find greatest power of 2 less than or equal to len
//        for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
//
//        // assert(exp2[power] <= len);
//
//        // determine the starting points
//        topOfLow = exp2[power] - 1;
//        topOfHigh = len - 1;
//        deltaStart = exp2[power-1];
//        validLen = len;
//    }
//
//    private static final int exp2[] = {
//        0x1, 0x2, 0x4, 0x8,
//        0x10, 0x20, 0x40, 0x80,
//        0x100, 0x200, 0x400, 0x800,
//        0x1000, 0x2000, 0x4000, 0x8000,
//        0x10000, 0x20000, 0x40000, 0x80000,
//        0x100000, 0x200000, 0x400000, 0x800000,
//        0x1000000, 0x2000000, 0x4000000, 0x8000000,
//        0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
//    };
//
//    /**
//     * Unrolled lowest index GT.
//     */
//    private final int leastIndexGT(int searchValue) {
//
//        if (len != validLen) {
//            if (len == 1) return 0;
//            validate();
//        }
//        int temp;
//
//        // set up initial range to search.
Each subrange is a power of two in length 1874 // int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh; 1875 // 1876 // // Completely unrolled binary search, folhighing "Programming Pearls" 1877 // // Each case deliberately falls through to the next 1878 // // Logically, list[-1] < all_search_values && list[count] > all_search_values 1879 // // although the values -1 and count are never actually touched. 1880 // 1881 // // The bounds at each point are low & high, 1882 // // where low == high - delta*2 1883 // // so high - delta is the midpoint 1884 // 1885 // // The invariant AFTER each line is that list[low] < searchValue <= list[high] 1886 // 1887 // switch (power) { 1888 // //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java 1889 // case 30: if (searchValue < list[temp = high-0x20000000]) high = temp; 1890 // case 29: if (searchValue < list[temp = high-0x10000000]) high = temp; 1891 // 1892 // case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp; 1893 // case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp; 1894 // case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp; 1895 // case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp; 1896 // 1897 // case 24: if (searchValue < list[temp = high- 0x800000]) high = temp; 1898 // case 23: if (searchValue < list[temp = high- 0x400000]) high = temp; 1899 // case 22: if (searchValue < list[temp = high- 0x200000]) high = temp; 1900 // case 21: if (searchValue < list[temp = high- 0x100000]) high = temp; 1901 // 1902 // case 20: if (searchValue < list[temp = high- 0x80000]) high = temp; 1903 // case 19: if (searchValue < list[temp = high- 0x40000]) high = temp; 1904 // case 18: if (searchValue < list[temp = high- 0x20000]) high = temp; 1905 // case 17: if (searchValue < list[temp = high- 0x10000]) high = temp; 1906 // 1907 // case 16: if (searchValue < list[temp = high- 0x8000]) high = temp; 1908 // case 15: 
if (searchValue < list[temp = high- 0x4000]) high = temp; 1909 // case 14: if (searchValue < list[temp = high- 0x2000]) high = temp; 1910 // case 13: if (searchValue < list[temp = high- 0x1000]) high = temp; 1911 // 1912 // case 12: if (searchValue < list[temp = high- 0x800]) high = temp; 1913 // case 11: if (searchValue < list[temp = high- 0x400]) high = temp; 1914 // case 10: if (searchValue < list[temp = high- 0x200]) high = temp; 1915 // case 9: if (searchValue < list[temp = high- 0x100]) high = temp; 1916 // 1917 // case 8: if (searchValue < list[temp = high- 0x80]) high = temp; 1918 // case 7: if (searchValue < list[temp = high- 0x40]) high = temp; 1919 // case 6: if (searchValue < list[temp = high- 0x20]) high = temp; 1920 // case 5: if (searchValue < list[temp = high- 0x10]) high = temp; 1921 // 1922 // case 4: if (searchValue < list[temp = high- 0x8]) high = temp; 1923 // case 3: if (searchValue < list[temp = high- 0x4]) high = temp; 1924 // case 2: if (searchValue < list[temp = high- 0x2]) high = temp; 1925 // case 1: if (searchValue < list[temp = high- 0x1]) high = temp; 1926 // } 1927 // 1928 // return high; 1929 // } 1930 // 1931 // // For debugging only 1932 // public int len() { 1933 // return len; 1934 // } 1935 // 1936 // //---------------------------------------------------------------- 1937 // //---------------------------------------------------------------- 1938 1939 /** 1940 * Returns true if this set contains every character 1941 * of the given range. 
1942 * @param start first character, inclusive, of the range 1943 * @param end last character, inclusive, of the range 1944 * @return true if the test condition is met 1945 * @stable ICU 2.0 1946 */ contains(int start, int end)1947 public boolean contains(int start, int end) { 1948 if (start < MIN_VALUE || start > MAX_VALUE) { 1949 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1950 } 1951 if (end < MIN_VALUE || end > MAX_VALUE) { 1952 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1953 } 1954 //int i = -1; 1955 //while (true) { 1956 // if (start < list[++i]) break; 1957 //} 1958 int i = findCodePoint(start); 1959 return ((i & 1) != 0 && end < list[i]); 1960 } 1961 1962 /** 1963 * Returns <tt>true</tt> if this set contains the given 1964 * multicharacter string. 1965 * @param s string to be checked for containment 1966 * @return <tt>true</tt> if this set contains the specified string 1967 * @stable ICU 2.0 1968 */ contains(CharSequence s)1969 public final boolean contains(CharSequence s) { 1970 1971 int cp = getSingleCP(s); 1972 if (cp < 0) { 1973 return strings.contains(s.toString()); 1974 } else { 1975 return contains(cp); 1976 } 1977 } 1978 1979 /** 1980 * Returns true if this set contains all the characters and strings 1981 * of the given set. 1982 * @param b set to be checked for containment 1983 * @return true if the test condition is met 1984 * @stable ICU 2.0 1985 */ containsAll(UnicodeSet b)1986 public boolean containsAll(UnicodeSet b) { 1987 // The specified set is a subset if all of its pairs are contained in 1988 // this set. This implementation accesses the lists directly for speed. 1989 // TODO: this could be faster if size() were cached. But that would affect building speed 1990 // so it needs investigation. 
1991 int[] listB = b.list; 1992 boolean needA = true; 1993 boolean needB = true; 1994 int aPtr = 0; 1995 int bPtr = 0; 1996 int aLen = len - 1; 1997 int bLen = b.len - 1; 1998 int startA = 0, startB = 0, limitA = 0, limitB = 0; 1999 while (true) { 2000 // double iterations are such a pain... 2001 if (needA) { 2002 if (aPtr >= aLen) { 2003 // ran out of A. If B is also exhausted, then break; 2004 if (needB && bPtr >= bLen) { 2005 break; 2006 } 2007 return false; 2008 } 2009 startA = list[aPtr++]; 2010 limitA = list[aPtr++]; 2011 } 2012 if (needB) { 2013 if (bPtr >= bLen) { 2014 // ran out of B. Since we got this far, we have an A and we are ok so far 2015 break; 2016 } 2017 startB = listB[bPtr++]; 2018 limitB = listB[bPtr++]; 2019 } 2020 // if B doesn't overlap and is greater than A, get new A 2021 if (startB >= limitA) { 2022 needA = true; 2023 needB = false; 2024 continue; 2025 } 2026 // if B is wholy contained in A, then get a new B 2027 if (startB >= startA && limitB <= limitA) { 2028 needA = false; 2029 needB = true; 2030 continue; 2031 } 2032 // all other combinations mean we fail 2033 return false; 2034 } 2035 2036 if (!strings.containsAll(b.strings)) return false; 2037 return true; 2038 } 2039 2040 // /** 2041 // * Returns true if this set contains all the characters and strings 2042 // * of the given set. 2043 // * @param c set to be checked for containment 2044 // * @return true if the test condition is met 2045 // * @stable ICU 2.0 2046 // */ 2047 // public boolean containsAllOld(UnicodeSet c) { 2048 // // The specified set is a subset if all of its pairs are contained in 2049 // // this set. It's possible to code this more efficiently in terms of 2050 // // direct manipulation of the inversion lists if the need arises. 
2051 // int n = c.getRangeCount(); 2052 // for (int i=0; i<n; ++i) { 2053 // if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 2054 // return false; 2055 // } 2056 // } 2057 // if (!strings.containsAll(c.strings)) return false; 2058 // return true; 2059 // } 2060 2061 /** 2062 * Returns true if there is a partition of the string such that this set contains each of the partitioned strings. 2063 * For example, for the Unicode set [a{bc}{cd}]<br> 2064 * containsAll is true for each of: "a", "bc", ""cdbca"<br> 2065 * containsAll is false for each of: "acb", "bcda", "bcx"<br> 2066 * @param s string containing characters to be checked for containment 2067 * @return true if the test condition is met 2068 * @stable ICU 2.0 2069 */ containsAll(String s)2070 public boolean containsAll(String s) { 2071 int cp; 2072 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 2073 cp = UTF16.charAt(s, i); 2074 if (!contains(cp)) { 2075 if (!hasStrings()) { 2076 return false; 2077 } 2078 return containsAll(s, 0); 2079 } 2080 } 2081 return true; 2082 } 2083 2084 /** 2085 * Recursive routine called if we fail to find a match in containsAll, and there are strings 2086 * @param s source string 2087 * @param i point to match to the end on 2088 * @return true if ok 2089 */ containsAll(String s, int i)2090 private boolean containsAll(String s, int i) { 2091 if (i >= s.length()) { 2092 return true; 2093 } 2094 int cp= UTF16.charAt(s, i); 2095 if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) { 2096 return true; 2097 } 2098 for (String setStr : strings) { 2099 if (!setStr.isEmpty() && // skip the empty string 2100 s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) { 2101 return true; 2102 } 2103 } 2104 return false; 2105 2106 } 2107 2108 /** 2109 * Get the Regex equivalent for this UnicodeSet 2110 * @return regex pattern equivalent to this UnicodeSet 2111 * @internal 2112 * @deprecated This API is ICU internal only. 
2113 */ 2114 @Deprecated getRegexEquivalent()2115 public String getRegexEquivalent() { 2116 if (!hasStrings()) { 2117 return toString(); 2118 } 2119 StringBuilder result = new StringBuilder("(?:"); 2120 appendNewPattern(result, true, false); 2121 for (String s : strings) { 2122 result.append('|'); 2123 _appendToPat(result, s, true); 2124 } 2125 return result.append(")").toString(); 2126 } 2127 2128 /** 2129 * Returns true if this set contains none of the characters 2130 * of the given range. 2131 * @param start first character, inclusive, of the range 2132 * @param end last character, inclusive, of the range 2133 * @return true if the test condition is met 2134 * @stable ICU 2.0 2135 */ containsNone(int start, int end)2136 public boolean containsNone(int start, int end) { 2137 if (start < MIN_VALUE || start > MAX_VALUE) { 2138 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 2139 } 2140 if (end < MIN_VALUE || end > MAX_VALUE) { 2141 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 2142 } 2143 int i = -1; 2144 while (true) { 2145 if (start < list[++i]) break; 2146 } 2147 return ((i & 1) == 0 && end < list[i]); 2148 } 2149 2150 /** 2151 * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2152 * For example, for the Unicode set [a{bc}{cd}]<br> 2153 * containsNone is true for: "xy", "cb"<br> 2154 * containsNone is false for: "a", "bc", "bcd"<br> 2155 * @param b set to be checked for containment 2156 * @return true if the test condition is met 2157 * @stable ICU 2.0 2158 */ containsNone(UnicodeSet b)2159 public boolean containsNone(UnicodeSet b) { 2160 // The specified set is a subset if some of its pairs overlap with some of this set's pairs. 2161 // This implementation accesses the lists directly for speed. 
2162 int[] listB = b.list; 2163 boolean needA = true; 2164 boolean needB = true; 2165 int aPtr = 0; 2166 int bPtr = 0; 2167 int aLen = len - 1; 2168 int bLen = b.len - 1; 2169 int startA = 0, startB = 0, limitA = 0, limitB = 0; 2170 while (true) { 2171 // double iterations are such a pain... 2172 if (needA) { 2173 if (aPtr >= aLen) { 2174 // ran out of A: break so we test strings 2175 break; 2176 } 2177 startA = list[aPtr++]; 2178 limitA = list[aPtr++]; 2179 } 2180 if (needB) { 2181 if (bPtr >= bLen) { 2182 // ran out of B: break so we test strings 2183 break; 2184 } 2185 startB = listB[bPtr++]; 2186 limitB = listB[bPtr++]; 2187 } 2188 // if B is higher than any part of A, get new A 2189 if (startB >= limitA) { 2190 needA = true; 2191 needB = false; 2192 continue; 2193 } 2194 // if A is higher than any part of B, get new B 2195 if (startA >= limitB) { 2196 needA = false; 2197 needB = true; 2198 continue; 2199 } 2200 // all other combinations mean we fail 2201 return false; 2202 } 2203 2204 if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false; 2205 return true; 2206 } 2207 2208 // /** 2209 // * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2210 // * For example, for the Unicode set [a{bc}{cd}]<br> 2211 // * containsNone is true for: "xy", "cb"<br> 2212 // * containsNone is false for: "a", "bc", "bcd"<br> 2213 // * @param c set to be checked for containment 2214 // * @return true if the test condition is met 2215 // * @stable ICU 2.0 2216 // */ 2217 // public boolean containsNoneOld(UnicodeSet c) { 2218 // // The specified set is a subset if all of its pairs are contained in 2219 // // this set. It's possible to code this more efficiently in terms of 2220 // // direct manipulation of the inversion lists if the need arises. 
2221 // int n = c.getRangeCount(); 2222 // for (int i=0; i<n; ++i) { 2223 // if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 2224 // return false; 2225 // } 2226 // } 2227 // if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false; 2228 // return true; 2229 // } 2230 2231 /** 2232 * Returns true if this set contains none of the characters 2233 * of the given string. 2234 * @param s string containing characters to be checked for containment 2235 * @return true if the test condition is met 2236 * @stable ICU 2.0 2237 */ containsNone(CharSequence s)2238 public boolean containsNone(CharSequence s) { 2239 return span(s, SpanCondition.NOT_CONTAINED) == s.length(); 2240 } 2241 2242 /** 2243 * Returns true if this set contains one or more of the characters 2244 * in the given range. 2245 * @param start first character, inclusive, of the range 2246 * @param end last character, inclusive, of the range 2247 * @return true if the condition is met 2248 * @stable ICU 2.0 2249 */ containsSome(int start, int end)2250 public final boolean containsSome(int start, int end) { 2251 return !containsNone(start, end); 2252 } 2253 2254 /** 2255 * Returns true if this set contains one or more of the characters 2256 * and strings of the given set. 2257 * @param s set to be checked for containment 2258 * @return true if the condition is met 2259 * @stable ICU 2.0 2260 */ containsSome(UnicodeSet s)2261 public final boolean containsSome(UnicodeSet s) { 2262 return !containsNone(s); 2263 } 2264 2265 /** 2266 * Returns true if this set contains one or more of the characters 2267 * of the given string. 
2268 * @param s string containing characters to be checked for containment 2269 * @return true if the condition is met 2270 * @stable ICU 2.0 2271 */ containsSome(CharSequence s)2272 public final boolean containsSome(CharSequence s) { 2273 return !containsNone(s); 2274 } 2275 2276 2277 /** 2278 * Adds all of the elements in the specified set to this set if 2279 * they're not already present. This operation effectively 2280 * modifies this set so that its value is the <i>union</i> of the two 2281 * sets. The behavior of this operation is unspecified if the specified 2282 * collection is modified while the operation is in progress. 2283 * 2284 * @param c set whose elements are to be added to this set. 2285 * @stable ICU 2.0 2286 */ addAll(UnicodeSet c)2287 public UnicodeSet addAll(UnicodeSet c) { 2288 checkFrozen(); 2289 add(c.list, c.len, 0); 2290 if (c.hasStrings()) { 2291 if (strings == EMPTY_STRINGS) { 2292 strings = new TreeSet<>(c.strings); 2293 } else { 2294 strings.addAll(c.strings); 2295 } 2296 } 2297 return this; 2298 } 2299 2300 /** 2301 * Retains only the elements in this set that are contained in the 2302 * specified set. In other words, removes from this set all of 2303 * its elements that are not contained in the specified set. This 2304 * operation effectively modifies this set so that its value is 2305 * the <i>intersection</i> of the two sets. 2306 * 2307 * @param c set that defines which elements this set will retain. 2308 * @stable ICU 2.0 2309 */ retainAll(UnicodeSet c)2310 public UnicodeSet retainAll(UnicodeSet c) { 2311 checkFrozen(); 2312 retain(c.list, c.len, 0); 2313 if (hasStrings()) { 2314 if (!c.hasStrings()) { 2315 strings.clear(); 2316 } else { 2317 strings.retainAll(c.strings); 2318 } 2319 } 2320 return this; 2321 } 2322 2323 /** 2324 * Removes from this set all of its elements that are contained in the 2325 * specified set. 
This operation effectively modifies this 2326 * set so that its value is the <i>asymmetric set difference</i> of 2327 * the two sets. 2328 * 2329 * @param c set that defines which elements will be removed from 2330 * this set. 2331 * @stable ICU 2.0 2332 */ removeAll(UnicodeSet c)2333 public UnicodeSet removeAll(UnicodeSet c) { 2334 checkFrozen(); 2335 retain(c.list, c.len, 2); 2336 if (hasStrings() && c.hasStrings()) { 2337 strings.removeAll(c.strings); 2338 } 2339 return this; 2340 } 2341 2342 /** 2343 * Complements in this set all elements contained in the specified 2344 * set. Any character in the other set will be removed if it is 2345 * in this set, or will be added if it is not in this set. 2346 * 2347 * @param c set that defines which elements will be complemented from 2348 * this set. 2349 * @stable ICU 2.0 2350 */ complementAll(UnicodeSet c)2351 public UnicodeSet complementAll(UnicodeSet c) { 2352 checkFrozen(); 2353 xor(c.list, c.len, 0); 2354 if (c.hasStrings()) { 2355 if (strings == EMPTY_STRINGS) { 2356 strings = new TreeSet<>(c.strings); 2357 } else { 2358 SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings); 2359 } 2360 } 2361 return this; 2362 } 2363 2364 /** 2365 * Removes all of the elements from this set. This set will be 2366 * empty after this call returns. 2367 * @stable ICU 2.0 2368 */ clear()2369 public UnicodeSet clear() { 2370 checkFrozen(); 2371 list[0] = HIGH; 2372 len = 1; 2373 pat = null; 2374 if (hasStrings()) { 2375 strings.clear(); 2376 } 2377 return this; 2378 } 2379 2380 /** 2381 * Iteration method that returns the number of ranges contained in 2382 * this set. 2383 * @see #getRangeStart 2384 * @see #getRangeEnd 2385 * @stable ICU 2.0 2386 */ getRangeCount()2387 public int getRangeCount() { 2388 return len/2; 2389 } 2390 2391 /** 2392 * Iteration method that returns the first character in the 2393 * specified range of this set. 
2394 * @exception ArrayIndexOutOfBoundsException if index is outside 2395 * the range <code>0..getRangeCount()-1</code> 2396 * @see #getRangeCount 2397 * @see #getRangeEnd 2398 * @stable ICU 2.0 2399 */ getRangeStart(int index)2400 public int getRangeStart(int index) { 2401 return list[index*2]; 2402 } 2403 2404 /** 2405 * Iteration method that returns the last character in the 2406 * specified range of this set. 2407 * @exception ArrayIndexOutOfBoundsException if index is outside 2408 * the range <code>0..getRangeCount()-1</code> 2409 * @see #getRangeStart 2410 * @see #getRangeEnd 2411 * @stable ICU 2.0 2412 */ getRangeEnd(int index)2413 public int getRangeEnd(int index) { 2414 return (list[index*2 + 1] - 1); 2415 } 2416 2417 /** 2418 * Reallocate this objects internal structures to take up the least 2419 * possible space, without changing this object's value. 2420 * @stable ICU 2.0 2421 */ compact()2422 public UnicodeSet compact() { 2423 checkFrozen(); 2424 if ((len + 7) < list.length) { 2425 // If we have more than a little unused capacity, shrink it to len. 2426 list = Arrays.copyOf(list, len); 2427 } 2428 rangeList = null; 2429 buffer = null; 2430 if (strings != EMPTY_STRINGS && strings.isEmpty()) { 2431 strings = EMPTY_STRINGS; 2432 } 2433 return this; 2434 } 2435 2436 /** 2437 * Compares the specified object with this set for equality. Returns 2438 * <tt>true</tt> if the specified object is also a set, the two sets 2439 * have the same size, and every member of the specified set is 2440 * contained in this set (or equivalently, every member of this set is 2441 * contained in the specified set). 2442 * 2443 * @param o Object to be compared for equality with this set. 2444 * @return <tt>true</tt> if the specified Object is equal to this set. 
2445 * @stable ICU 2.0 2446 */ 2447 @Override equals(Object o)2448 public boolean equals(Object o) { 2449 if (o == null) { 2450 return false; 2451 } 2452 if (this == o) { 2453 return true; 2454 } 2455 try { 2456 UnicodeSet that = (UnicodeSet) o; 2457 if (len != that.len) return false; 2458 for (int i = 0; i < len; ++i) { 2459 if (list[i] != that.list[i]) return false; 2460 } 2461 if (!strings.equals(that.strings)) return false; 2462 } catch (Exception e) { 2463 return false; 2464 } 2465 return true; 2466 } 2467 2468 /** 2469 * Returns the hash code value for this set. 2470 * 2471 * @return the hash code value for this set. 2472 * @see java.lang.Object#hashCode() 2473 * @stable ICU 2.0 2474 */ 2475 @Override hashCode()2476 public int hashCode() { 2477 int result = len; 2478 for (int i = 0; i < len; ++i) { 2479 result *= 1000003; 2480 result += list[i]; 2481 } 2482 return result; 2483 } 2484 2485 /** 2486 * Return a programmer-readable string representation of this object. 2487 * @stable ICU 2.0 2488 */ 2489 @Override toString()2490 public String toString() { 2491 return toPattern(true); 2492 } 2493 2494 //---------------------------------------------------------------- 2495 // Implementation: Pattern parsing 2496 //---------------------------------------------------------------- 2497 2498 /** 2499 * Parses the given pattern, starting at the given position. The character 2500 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. 2501 * Parsing continues until the corresponding closing ']'. If a syntax error 2502 * is encountered between the opening and closing brace, the parse fails. 2503 * Upon return from a successful parse, the ParsePosition is updated to 2504 * point to the character following the closing ']', and an inversion 2505 * list for the parsed pattern is returned. This method 2506 * calls itself recursively to parse embedded subpatterns. 2507 * 2508 * @param pattern the string containing the pattern to be parsed. 
The 2509 * portion of the string from pos.getIndex(), which must be a '[', to the 2510 * corresponding closing ']', is parsed. 2511 * @param pos upon entry, the position at which to being parsing. The 2512 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return 2513 * from a successful parse, pos.getIndex() is either the character after the 2514 * closing ']' of the parsed pattern, or pattern.length() if the closing ']' 2515 * is the last character of the pattern string. 2516 * @return an inversion list for the parsed substring 2517 * of <code>pattern</code> 2518 * @exception java.lang.IllegalArgumentException if the parse fails. 2519 * @internal 2520 * @deprecated This API is ICU internal only. 2521 */ 2522 @Deprecated applyPattern(String pattern, ParsePosition pos, SymbolTable symbols, int options)2523 public UnicodeSet applyPattern(String pattern, 2524 ParsePosition pos, 2525 SymbolTable symbols, 2526 int options) { 2527 2528 // Need to build the pattern in a temporary string because 2529 // _applyPattern calls add() etc., which set pat to empty. 2530 boolean parsePositionWasNull = pos == null; 2531 if (parsePositionWasNull) { 2532 pos = new ParsePosition(0); 2533 } 2534 2535 StringBuilder rebuiltPat = new StringBuilder(); 2536 RuleCharacterIterator chars = 2537 new RuleCharacterIterator(pattern, symbols, pos); 2538 applyPattern(chars, symbols, rebuiltPat, options, 0); 2539 if (chars.inVariable()) { 2540 syntaxError(chars, "Extra chars in variable value"); 2541 } 2542 pat = rebuiltPat.toString(); 2543 if (parsePositionWasNull) { 2544 int i = pos.getIndex(); 2545 2546 // Skip over trailing whitespace 2547 if ((options & IGNORE_SPACE) != 0) { 2548 i = PatternProps.skipWhiteSpace(pattern, i); 2549 } 2550 2551 if (i != pattern.length()) { 2552 throw new IllegalArgumentException("Parse of \"" + pattern + 2553 "\" failed at " + i); 2554 } 2555 } 2556 return this; 2557 } 2558 2559 // Add constants to make the applyPattern() code easier to follow. 
2560 2561 private static final int LAST0_START = 0, 2562 LAST1_RANGE = 1, 2563 LAST2_SET = 2; 2564 2565 private static final int MODE0_NONE = 0, 2566 MODE1_INBRACKET = 1, 2567 MODE2_OUTBRACKET = 2; 2568 2569 private static final int SETMODE0_NONE = 0, 2570 SETMODE1_UNICODESET = 1, 2571 SETMODE2_PROPERTYPAT = 2, 2572 SETMODE3_PREPARSED = 3; 2573 2574 private static final int MAX_DEPTH = 100; 2575 2576 /** 2577 * Parse the pattern from the given RuleCharacterIterator. The 2578 * iterator is advanced over the parsed pattern. 2579 * @param chars iterator over the pattern characters. Upon return 2580 * it will be advanced to the first character after the parsed 2581 * pattern, or the end of the iteration if all characters are 2582 * parsed. 2583 * @param symbols symbol table to use to parse and dereference 2584 * variables, or null if none. 2585 * @param rebuiltPat the pattern that was parsed, rebuilt or 2586 * copied from the input pattern, as appropriate. 2587 * @param options a bit mask of zero or more of the following: 2588 * IGNORE_SPACE, CASE. 
2589 */ applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options, int depth)2590 private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, 2591 Appendable rebuiltPat, int options, int depth) { 2592 if (depth > MAX_DEPTH) { 2593 syntaxError(chars, "Pattern nested too deeply"); 2594 } 2595 2596 // Syntax characters: [ ] ^ - & { } 2597 2598 // Recognized special forms for chars, sets: c-c s-s s&s 2599 2600 int opts = RuleCharacterIterator.PARSE_VARIABLES | 2601 RuleCharacterIterator.PARSE_ESCAPES; 2602 if ((options & IGNORE_SPACE) != 0) { 2603 opts |= RuleCharacterIterator.SKIP_WHITESPACE; 2604 } 2605 2606 StringBuilder patBuf = new StringBuilder(), buf = null; 2607 boolean usePat = false; 2608 UnicodeSet scratch = null; 2609 RuleCharacterIterator.Position backup = null; 2610 2611 // mode: 0=before [, 1=between [...], 2=after ] 2612 // lastItem: 0=none, 1=char, 2=set 2613 int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE; 2614 char op = 0; 2615 2616 boolean invert = false; 2617 2618 clear(); 2619 String lastString = null; 2620 2621 while (mode != MODE2_OUTBRACKET && !chars.atEnd()) { 2622 //Eclipse stated the following is "dead code" 2623 /* 2624 if (false) { 2625 // Debugging assertion 2626 if (!((lastItem == 0 && op == 0) || 2627 (lastItem == 1 && (op == 0 || op == '-')) || 2628 (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { 2629 throw new IllegalArgumentException(); 2630 } 2631 }*/ 2632 2633 int c = 0; 2634 boolean literal = false; 2635 UnicodeSet nested = null; 2636 2637 // -------- Check for property pattern 2638 2639 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 2640 int setMode = SETMODE0_NONE; 2641 if (resemblesPropertyPattern(chars, opts)) { 2642 setMode = SETMODE2_PROPERTYPAT; 2643 } 2644 2645 // -------- Parse '[' of opening delimiter OR nested set. 2646 // If there is a nested set, use `setMode' to define how 2647 // the set should be parsed. 
If the '[' is part of the 2648 // opening delimiter for this pattern, parse special 2649 // strings "[", "[^", "[-", and "[^-". Check for stand-in 2650 // characters representing a nested set in the symbol 2651 // table. 2652 2653 else { 2654 // Prepare to backup if necessary 2655 backup = chars.getPos(backup); 2656 c = chars.next(opts); 2657 literal = chars.isEscaped(); 2658 2659 if (c == '[' && !literal) { 2660 if (mode == MODE1_INBRACKET) { 2661 chars.setPos(backup); // backup 2662 setMode = SETMODE1_UNICODESET; 2663 } else { 2664 // Handle opening '[' delimiter 2665 mode = MODE1_INBRACKET; 2666 patBuf.append('['); 2667 backup = chars.getPos(backup); // prepare to backup 2668 c = chars.next(opts); 2669 literal = chars.isEscaped(); 2670 if (c == '^' && !literal) { 2671 invert = true; 2672 patBuf.append('^'); 2673 backup = chars.getPos(backup); // prepare to backup 2674 c = chars.next(opts); 2675 literal = chars.isEscaped(); 2676 } 2677 // Fall through to handle special leading '-'; 2678 // otherwise restart loop for nested [], \p{}, etc. 2679 if (c == '-') { 2680 literal = true; 2681 // Fall through to handle literal '-' below 2682 } else { 2683 chars.setPos(backup); // backup 2684 continue; 2685 } 2686 } 2687 } else if (symbols != null) { 2688 UnicodeMatcher m = symbols.lookupMatcher(c); // may be null 2689 if (m != null) { 2690 try { 2691 nested = (UnicodeSet) m; 2692 setMode = SETMODE3_PREPARSED; 2693 } catch (ClassCastException e) { 2694 syntaxError(chars, "Syntax error"); 2695 } 2696 } 2697 } 2698 } 2699 2700 // -------- Handle a nested set. This either is inline in 2701 // the pattern or represented by a stand-in that has 2702 // previously been parsed and was looked up in the symbol 2703 // table. 
2704 2705 if (setMode != SETMODE0_NONE) { 2706 if (lastItem == LAST1_RANGE) { 2707 if (op != 0) { 2708 syntaxError(chars, "Char expected after operator"); 2709 } 2710 add_unchecked(lastChar, lastChar); 2711 _appendToPat(patBuf, lastChar, false); 2712 lastItem = LAST0_START; 2713 op = 0; 2714 } 2715 2716 if (op == '-' || op == '&') { 2717 patBuf.append(op); 2718 } 2719 2720 if (nested == null) { 2721 if (scratch == null) scratch = new UnicodeSet(); 2722 nested = scratch; 2723 } 2724 switch (setMode) { 2725 case SETMODE1_UNICODESET: 2726 nested.applyPattern(chars, symbols, patBuf, options, depth + 1); 2727 break; 2728 case SETMODE2_PROPERTYPAT: 2729 chars.skipIgnored(opts); 2730 nested.applyPropertyPattern(chars, patBuf, symbols); 2731 break; 2732 case SETMODE3_PREPARSED: // `nested' already parsed 2733 nested._toPattern(patBuf, false); 2734 break; 2735 } 2736 2737 usePat = true; 2738 2739 if (mode == MODE0_NONE) { 2740 // Entire pattern is a category; leave parse loop 2741 set(nested); 2742 mode = MODE2_OUTBRACKET; 2743 break; 2744 } 2745 2746 switch (op) { 2747 case '-': 2748 removeAll(nested); 2749 break; 2750 case '&': 2751 retainAll(nested); 2752 break; 2753 case 0: 2754 addAll(nested); 2755 break; 2756 } 2757 2758 op = 0; 2759 lastItem = LAST2_SET; 2760 2761 continue; 2762 } 2763 2764 if (mode == MODE0_NONE) { 2765 syntaxError(chars, "Missing '['"); 2766 } 2767 2768 // -------- Parse special (syntax) characters. If the 2769 // current character is not special, or if it is escaped, 2770 // then fall through and handle it below. 
2771 2772 if (!literal) { 2773 switch (c) { 2774 case ']': 2775 if (lastItem == LAST1_RANGE) { 2776 add_unchecked(lastChar, lastChar); 2777 _appendToPat(patBuf, lastChar, false); 2778 } 2779 // Treat final trailing '-' as a literal 2780 if (op == '-') { 2781 add_unchecked(op, op); 2782 patBuf.append(op); 2783 } else if (op == '&') { 2784 syntaxError(chars, "Trailing '&'"); 2785 } 2786 patBuf.append(']'); 2787 mode = MODE2_OUTBRACKET; 2788 continue; 2789 case '-': 2790 if (op == 0) { 2791 if (lastItem != LAST0_START) { 2792 op = (char) c; 2793 continue; 2794 } else if (lastString != null) { 2795 op = (char) c; 2796 continue; 2797 } else { 2798 // Treat final trailing '-' as a literal 2799 add_unchecked(c, c); 2800 c = chars.next(opts); 2801 literal = chars.isEscaped(); 2802 if (c == ']' && !literal) { 2803 patBuf.append("-]"); 2804 mode = MODE2_OUTBRACKET; 2805 continue; 2806 } 2807 } 2808 } 2809 syntaxError(chars, "'-' not after char, string, or set"); 2810 break; 2811 case '&': 2812 if (lastItem == LAST2_SET && op == 0) { 2813 op = (char) c; 2814 continue; 2815 } 2816 syntaxError(chars, "'&' not after set"); 2817 break; 2818 case '^': 2819 syntaxError(chars, "'^' not after '['"); 2820 break; 2821 case '{': 2822 if (op != 0 && op != '-') { 2823 syntaxError(chars, "Missing operand after operator"); 2824 } 2825 if (lastItem == LAST1_RANGE) { 2826 add_unchecked(lastChar, lastChar); 2827 _appendToPat(patBuf, lastChar, false); 2828 } 2829 lastItem = LAST0_START; 2830 if (buf == null) { 2831 buf = new StringBuilder(); 2832 } else { 2833 buf.setLength(0); 2834 } 2835 boolean ok = false; 2836 while (!chars.atEnd()) { 2837 c = chars.next(opts); 2838 literal = chars.isEscaped(); 2839 if (c == '}' && !literal) { 2840 ok = true; 2841 break; 2842 } 2843 appendCodePoint(buf, c); 2844 } 2845 if (!ok) { 2846 syntaxError(chars, "Invalid multicharacter string"); 2847 } 2848 // We have new string. 
Add it to set and continue; 2849 // we don't need to drop through to the further 2850 // processing 2851 String curString = buf.toString(); 2852 if (op == '-') { 2853 int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString); 2854 int curSingle = CharSequences.getSingleCodePoint(curString); 2855 if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) { 2856 add(lastSingle,curSingle); 2857 } else { 2858 if (strings == EMPTY_STRINGS) { 2859 strings = new TreeSet<>(); 2860 } 2861 try { 2862 StringRange.expand(lastString, curString, true, strings); 2863 } catch (Exception e) { 2864 syntaxError(chars, e.getMessage()); 2865 } 2866 } 2867 lastString = null; 2868 op = 0; 2869 } else { 2870 add(curString); 2871 lastString = curString; 2872 } 2873 patBuf.append('{'); 2874 _appendToPat(patBuf, curString, false); 2875 patBuf.append('}'); 2876 continue; 2877 case SymbolTable.SYMBOL_REF: 2878 // symbols nosymbols 2879 // [a-$] error error (ambiguous) 2880 // [a$] anchor anchor 2881 // [a-$x] var "x"* literal '$' 2882 // [a-$.] error literal '$' 2883 // *We won't get here in the case of var "x" 2884 backup = chars.getPos(backup); 2885 c = chars.next(opts); 2886 literal = chars.isEscaped(); 2887 boolean anchor = (c == ']' && !literal); 2888 if (symbols == null && !anchor) { 2889 c = SymbolTable.SYMBOL_REF; 2890 chars.setPos(backup); 2891 break; // literal '$' 2892 } 2893 if (anchor && op == 0) { 2894 if (lastItem == LAST1_RANGE) { 2895 add_unchecked(lastChar, lastChar); 2896 _appendToPat(patBuf, lastChar, false); 2897 } 2898 add_unchecked(UnicodeMatcher.ETHER); 2899 usePat = true; 2900 patBuf.append(SymbolTable.SYMBOL_REF).append(']'); 2901 mode = MODE2_OUTBRACKET; 2902 continue; 2903 } 2904 syntaxError(chars, "Unquoted '$'"); 2905 break; 2906 default: 2907 break; 2908 } 2909 } 2910 2911 // -------- Parse literal characters. This includes both 2912 // escaped chars ("\u4E01") and non-syntax characters 2913 // ("a"). 
2914 2915 switch (lastItem) { 2916 case LAST0_START: 2917 if (op == '-' && lastString != null) { 2918 syntaxError(chars, "Invalid range"); 2919 } 2920 lastItem = LAST1_RANGE; 2921 lastChar = c; 2922 lastString = null; 2923 break; 2924 case LAST1_RANGE: 2925 if (op == '-') { 2926 if (lastString != null) { 2927 syntaxError(chars, "Invalid range"); 2928 } 2929 if (lastChar >= c) { 2930 // Don't allow redundant (a-a) or empty (b-a) ranges; 2931 // these are most likely typos. 2932 syntaxError(chars, "Invalid range"); 2933 } 2934 add_unchecked(lastChar, c); 2935 _appendToPat(patBuf, lastChar, false); 2936 patBuf.append(op); 2937 _appendToPat(patBuf, c, false); 2938 lastItem = LAST0_START; 2939 op = 0; 2940 } else { 2941 add_unchecked(lastChar, lastChar); 2942 _appendToPat(patBuf, lastChar, false); 2943 lastChar = c; 2944 } 2945 break; 2946 case LAST2_SET: 2947 if (op != 0) { 2948 syntaxError(chars, "Set expected after operator"); 2949 } 2950 lastChar = c; 2951 lastItem = LAST1_RANGE; 2952 break; 2953 } 2954 } 2955 2956 if (mode != MODE2_OUTBRACKET) { 2957 syntaxError(chars, "Missing ']'"); 2958 } 2959 2960 chars.skipIgnored(opts); 2961 2962 /** 2963 * Handle global flags (invert, case insensitivity). If this 2964 * pattern should be compiled case-insensitive, then we need 2965 * to close over case BEFORE COMPLEMENTING. This makes 2966 * patterns like /[^abc]/i work. 2967 */ 2968 if ((options & CASE) != 0) { 2969 closeOver(CASE); 2970 } 2971 if (invert) { 2972 complement().removeAllStrings(); // code point complement 2973 } 2974 2975 // Use the rebuilt pattern (pat) only if necessary. Prefer the 2976 // generated pattern. 
if (usePat) {
            append(rebuiltPat, patBuf.toString());
        } else {
            appendNewPattern(rebuiltPat, false, true);
        }
    }

    /**
     * Reports a pattern syntax error. Always throws; never returns normally.
     * The exception message has the form
     * {@code Error: <msg> at "<escaped chars.toString()>"}.
     * @param chars the iterator being parsed; its string form is escaped with
     * Utility.escape and embedded in the message
     * @param msg short description of the problem
     * @throws IllegalArgumentException always
     */
    private static void syntaxError(RuleCharacterIterator chars, String msg) {
        throw new IllegalArgumentException("Error: " + msg + " at \"" +
                Utility.escape(chars.toString()) +
                '"');
    }

    /**
     * Add the contents of the UnicodeSet (as strings) into a collection.
     * Delegates to the static addAllTo helper with this set as the source.
     * @param target collection to add into
     * @return the value returned by the static helper
     * (presumably <code>target</code> itself; confirm against that helper)
     * @stable ICU 4.4
     */
    public <T extends Collection<String>> T addAllTo(T target) {
        return addAllTo(this, target);
    }


    /**
     * Add the contents of the UnicodeSet (as strings) into an array.
     * Delegates to the static addAllTo helper with this set as the source.
     * @param target array to add into
     * @return the value returned by the static helper
     * (presumably <code>target</code> itself; confirm against that helper)
     * @stable ICU 4.4
     */
    public String[] addAllTo(String[] target) {
        return addAllTo(this, target);
    }

    /**
     * Add the contents of the UnicodeSet (as strings) into a newly allocated
     * array of length <code>set.size()</code>.
     * @param set the set whose contents are copied
     * @return the value returned by the static addAllTo helper for the new array
     * @stable ICU 4.4
     */
    public static String[] toArray(UnicodeSet set) {
        return addAllTo(set, new String[set.size()]);
    }

    /**
     * Add the contents of the collection (as strings) into this UnicodeSet.
     * The collection must not contain null.
     * Simply forwards to addAll(source).
     * @param source the collection to add
     * @return a reference to this object
     * @stable ICU 4.4
     */
    public UnicodeSet add(Iterable<?> source) {
        return addAll(source);
    }

    /**
     * Add a collection (as strings) into this UnicodeSet.
     * Uses standard naming convention.
3031 * @param source collection to add into 3032 * @return a reference to this object 3033 * @stable ICU 4.4 3034 */ addAll(Iterable<?> source)3035 public UnicodeSet addAll(Iterable<?> source) { 3036 checkFrozen(); 3037 for (Object o : source) { 3038 add(o.toString()); 3039 } 3040 return this; 3041 } 3042 3043 //---------------------------------------------------------------- 3044 // Implementation: Utility methods 3045 //---------------------------------------------------------------- 3046 nextCapacity(int minCapacity)3047 private int nextCapacity(int minCapacity) { 3048 // Grow exponentially to reduce the frequency of allocations. 3049 if (minCapacity < INITIAL_CAPACITY) { 3050 return minCapacity + INITIAL_CAPACITY; 3051 } else if (minCapacity <= 2500) { 3052 return 5 * minCapacity; 3053 } else { 3054 int newCapacity = 2 * minCapacity; 3055 if (newCapacity > MAX_LENGTH) { 3056 newCapacity = MAX_LENGTH; 3057 } 3058 return newCapacity; 3059 } 3060 } 3061 ensureCapacity(int newLen)3062 private void ensureCapacity(int newLen) { 3063 if (newLen > MAX_LENGTH) { 3064 newLen = MAX_LENGTH; 3065 } 3066 if (newLen <= list.length) return; 3067 int newCapacity = nextCapacity(newLen); 3068 int[] temp = new int[newCapacity]; 3069 // Copy only the actual contents. 3070 System.arraycopy(list, 0, temp, 0, len); 3071 list = temp; 3072 } 3073 ensureBufferCapacity(int newLen)3074 private void ensureBufferCapacity(int newLen) { 3075 if (newLen > MAX_LENGTH) { 3076 newLen = MAX_LENGTH; 3077 } 3078 if (buffer != null && newLen <= buffer.length) return; 3079 int newCapacity = nextCapacity(newLen); 3080 buffer = new int[newCapacity]; 3081 // The buffer has no contents to be copied. 3082 // It is always filled from scratch after this call. 3083 } 3084 3085 /** 3086 * Assumes start <= end. 
* Stores {start, end+1} in the first two slots of the shared rangeList
     * scratch array (allocating it as {start, end+1, HIGH} on first use) and
     * returns that array. The same array instance is returned by every call.
     */
    private int[] range(int start, int end) {
        if (rangeList == null) {
            rangeList = new int[] { start, end+1, HIGH };
        } else {
            rangeList[0] = start;
            rangeList[1] = end+1;
        }
        return rangeList;
    }

    //----------------------------------------------------------------
    // Implementation: Fundamental operations
    //----------------------------------------------------------------

    // polarity = 0, 3 is normal: x xor y
    // polarity = 1, 2: x xor ~y == x === y
    //
    // Merges list[] with other[] into buffer[], keeping every value that
    // occurs in only one of the two inputs and dropping values that occur in
    // both. Afterwards list and buffer are swapped, len is updated and the
    // cached pattern (pat) is cleared. otherLen is only used to size buffer[].

    private UnicodeSet xor(int[] other, int otherLen, int polarity) {
        ensureBufferCapacity(len + otherLen);
        int i = 0, j = 0, k = 0;
        int a = list[i++];
        int b;
        // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used
        //       so the following if statement will not be called.
        ///CLOVER:OFF
        if (polarity == 1 || polarity == 2) {
            b = LOW;
            if (other[j] == LOW) { // skip base if already LOW
                ++j;
                b = other[j];
            }
            ///CLOVER:ON
        } else {
            b = other[j++];
        }
        // simplest of all the routines
        // sort the values, discarding identicals!
        while (true) {
            if (a < b) {
                buffer[k++] = a;
                a = list[i++];
            } else if (b < a) {
                buffer[k++] = b;
                b = other[j++];
            } else if (a != HIGH) { // at this point, a == b
                // discard both values!
                a = list[i++];
                b = other[j++];
            } else { // DONE!
                // Both inputs reached their HIGH terminator: terminate the output.
                buffer[k++] = HIGH;
                len = k;
                break;
            }
        }
        // swap list and buffer
        int[] temp = list;
        list = buffer;
        buffer = temp;
        pat = null;
        return this;
    }

    // polarity = 0 is normal: x union y
    // polarity = 2: x union ~y
    // polarity = 1: ~x union y
    // polarity = 3: ~x union ~y
    //
    // Merges list[] with other[] into buffer[]. While merging, bit 1 of
    // polarity is toggled each time a value is consumed from list[] and bit 2
    // each time one is consumed from other[]. The result is terminated with
    // HIGH, list and buffer are swapped, len is updated and the cached
    // pattern (pat) is cleared. otherLen is only used to size buffer[].

    private UnicodeSet add(int[] other, int otherLen, int polarity) {
        ensureBufferCapacity(len + otherLen);
        int i = 0, j = 0, k = 0;
        int a = list[i++];
        int b = other[j++];
        // change from xor is that we have to check overlapping pairs
        // polarity bit 1 means a is second, bit 2 means b is.
        main:
        while (true) {
            switch (polarity) {
            case 0: // both first; take lower if unequal
                if (a < b) { // take a
                    // Back up over overlapping ranges in buffer[]
                    if (k > 0 && a <= buffer[k-1]) {
                        // Pick latter end value in buffer[] vs. list[]
                        a = max(list[i], buffer[--k]);
                    } else {
                        // No overlap
                        buffer[k++] = a;
                        a = list[i];
                    }
                    i++; // Common if/else code factored out
                    polarity ^= 1;
                } else if (b < a) { // take b
                    if (k > 0 && b <= buffer[k-1]) {
                        b = max(other[j], buffer[--k]);
                    } else {
                        buffer[k++] = b;
                        b = other[j];
                    }
                    j++;
                    polarity ^= 2;
                } else { // a == b, take a, drop b
                    if (a == HIGH) break main;
                    // This is symmetrical; it doesn't matter if
                    // we backtrack with a or b. - liu
                    if (k > 0 && a <= buffer[k-1]) {
                        a = max(list[i], buffer[--k]);
                    } else {
                        // No overlap
                        buffer[k++] = a;
                        a = list[i];
                    }
                    i++;
                    polarity ^= 1;
                    b = other[j++]; polarity ^= 2;
                }
                break;
            case 3: // both second; take higher if unequal, and drop other
                if (b <= a) { // take a
                    if (a == HIGH) break main;
                    buffer[k++] = a;
                } else { // take b
                    if (b == HIGH) break main;
                    buffer[k++] = b;
                }
                a = list[i++]; polarity ^= 1;   // factored common code
                b = other[j++]; polarity ^= 2;
                break;
            case 1: // a second, b first; if b < a, overlap
                if (a < b) { // no overlap, take a
                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
                } else if (b < a) { // OVERLAP, drop b
                    b = other[j++]; polarity ^= 2;
                } else { // a == b, drop both!
                    if (a == HIGH) break main;
                    a = list[i++]; polarity ^= 1;
                    b = other[j++]; polarity ^= 2;
                }
                break;
            case 2: // a first, b second; if a < b, overlap
                if (b < a) { // no overlap, take b
                    buffer[k++] = b; b = other[j++]; polarity ^= 2;
                } else  if (a < b) { // OVERLAP, drop a
                    a = list[i++]; polarity ^= 1;
                } else { // a == b, drop both!
                    if (a == HIGH) break main;
                    a = list[i++]; polarity ^= 1;
                    b = other[j++]; polarity ^= 2;
                }
                break;
            }
        }
        buffer[k++] = HIGH;    // terminate
        len = k;
        // swap list and buffer
        int[] temp = list;
        list = buffer;
        buffer = temp;
        pat = null;
        return this;
    }

    // polarity = 0 is normal: x intersect y
    // polarity = 2: x intersect ~y == set-minus
    // polarity = 1: ~x intersect y
    // polarity = 3: ~x intersect ~y
    //
    // Same merge skeleton and bookkeeping as add(int[], int, int): output
    // goes to buffer[], is terminated with HIGH, then list and buffer are
    // swapped, len is updated and the cached pattern (pat) is cleared.

    private UnicodeSet retain(int[] other, int otherLen, int polarity) {
        ensureBufferCapacity(len + otherLen);
        int i = 0, j = 0, k = 0;
        int a = list[i++];
        int b = other[j++];
        // change from xor is that we have to check overlapping pairs
        // polarity bit 1 means a is second, bit 2 means b is.
        main:
        while (true) {
            switch (polarity) {
            case 0: // both first; drop the smaller
                if (a < b) { // drop a
                    a = list[i++]; polarity ^= 1;
                } else if (b < a) { // drop b
                    b = other[j++]; polarity ^= 2;
                } else { // a == b, take one, drop other
                    if (a == HIGH) break main;
                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
                    b = other[j++]; polarity ^= 2;
                }
                break;
            case 3: // both second; take lower if unequal
                if (a < b) { // take a
                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
                } else if (b < a) { // take b
                    buffer[k++] = b; b = other[j++]; polarity ^= 2;
                } else { // a == b, take one, drop other
                    if (a == HIGH) break main;
                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
                    b = other[j++]; polarity ^= 2;
                }
                break;
            case 1: // a second, b first;
                if (a < b) { // NO OVERLAP, drop a
                    a = list[i++]; polarity ^= 1;
                } else if (b < a) { // OVERLAP, take b
                    buffer[k++] = b; b = other[j++]; polarity ^= 2;
                } else { // a == b, drop both!
                    if (a == HIGH) break main;
                    a = list[i++]; polarity ^= 1;
                    b = other[j++]; polarity ^= 2;
                }
                break;
            case 2: // a first, b second; if a < b, overlap
                if (b < a) { // no overlap, drop b
                    b = other[j++]; polarity ^= 2;
                } else  if (a < b) { // OVERLAP, take a
                    buffer[k++] = a; a = list[i++]; polarity ^= 1;
                } else { // a == b, drop both!
                    if (a == HIGH) break main;
                    a = list[i++]; polarity ^= 1;
                    b = other[j++]; polarity ^= 2;
                }
                break;
            }
        }
        buffer[k++] = HIGH;    // terminate
        len = k;
        // swap list and buffer
        int[] temp = list;
        list = buffer;
        buffer = temp;
        pat = null;
        return this;
    }

    /** Returns the larger of the two int arguments. */
    private static final int max(int a, int b) {
        return (a > b) ? a : b;
    }

    //----------------------------------------------------------------
    // Generic filter-based scanning code
    //----------------------------------------------------------------

    /** Predicate over code points, used by applyFilter() to decide membership. */
    private static interface Filter {
        boolean contains(int codePoint);
    }

    /** Accepts code points whose UCharacter.getUnicodeNumericValue() equals value (exact == on doubles). */
    private static final class NumericValueFilter implements Filter {
        double value;
        NumericValueFilter(double value) { this.value = value; }
        @Override
        public boolean contains(int ch) {
            return UCharacter.getUnicodeNumericValue(ch) == value;
        }
    }

    /** Accepts code points whose general-category bit (1 << UCharacter.getType(ch)) is set in mask. */
    private static final class GeneralCategoryMaskFilter implements Filter {
        int mask;
        GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
        @Override
        public boolean contains(int ch) {
            return ((1 << UCharacter.getType(ch)) & mask) != 0;
        }
    }

    /** Accepts code points for which UCharacter.getIntPropertyValue(ch, prop) equals value. */
    private static final class IntPropertyFilter implements Filter {
        int prop;
        int value;
        IntPropertyFilter(int prop, int value) {
            this.prop = prop;
            this.value = value;
        }
        @Override
        public boolean contains(int ch) {
            return UCharacter.getIntPropertyValue(ch, prop) == value;
        }
    }

    /** Accepts code points for which UScript.hasScript(c, script) is true. */
    private static final class ScriptExtensionsFilter implements Filter {
        int script;
        ScriptExtensionsFilter(int script) { this.script = script; }
        @Override
        public boolean contains(int c) {
            return UScript.hasScript(c, script);
        }
    }

    // VersionInfo for unassigned characters
    private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);

    /**
     * Accepts code points whose UCharacter.getAge() is not NO_VERSION and
     * compares less than or equal to the given version.
     */
    private static final class VersionFilter implements Filter {
        VersionInfo version;
        VersionFilter(VersionInfo version) { this.version = version; }
        @Override
        public boolean contains(int ch) {
            VersionInfo v = UCharacter.getAge(ch);
            // Reference comparison ok; VersionInfo caches and reuses
            // unique objects.
            return !Utility.sameObjects(v, NO_VERSION) &&
                    v.compareTo(version) <= 0;
        }
    }

    /**
     * Generic filter-based scanning code for UCD property UnicodeSets.
     * Clears this set, then adds every run of code points accepted by the
     * filter. Only code points inside the ranges of the inclusions set are
     * tested.
     * @param filter membership test applied to each examined code point
     * @param inclusions set whose ranges list the code points to examine
     */
    private void applyFilter(Filter filter, UnicodeSet inclusions) {
        // Logically, walk through all Unicode characters, noting the start
        // and end of each range for which filter.contains(c) is
        // true.  Add each range to a set.
        //
        // To improve performance, use an inclusions set which
        // encodes information about character ranges that are known
        // to have identical properties.
        // inclusions contains the first characters of
        // same-value ranges for the given property.

        clear();

        // Start of the run currently being accumulated, or -1 when no run is open.
        int startHasProperty = -1;
        int limitRange = inclusions.getRangeCount();

        for (int j=0; j<limitRange; ++j) {
            // get current range
            int start = inclusions.getRangeStart(j);
            int end = inclusions.getRangeEnd(j);

            // for all the code points in the range, process
            for (int ch = start; ch <= end; ++ch) {
                // only add to the unicodeset on inflection points --
                // where the hasProperty value changes to false
                if (filter.contains(ch)) {
                    if (startHasProperty < 0) {
                        startHasProperty = ch;
                    }
                } else if (startHasProperty >= 0) {
                    add_unchecked(startHasProperty, ch-1);
                    startHasProperty = -1;
                }
            }
        }
        // A run still open after the last examined code point extends to U+10FFFF.
        if (startHasProperty >= 0) {
            add_unchecked(startHasProperty, 0x10FFFF);
        }
    }

    /**
     * Remove leading and trailing Pattern_White_Space and compress
     * internal Pattern_White_Space to a single space character.
     * Returns the trimmed string itself when it contains no white space.
     */
    private static String mungeCharName(String source) {
        source = PatternProps.trimWhiteSpace(source);
        // Allocated lazily, on the first white space character found.
        StringBuilder buf = null;
        for (int i=0; i<source.length(); ++i) {
            char ch = source.charAt(i);
            if (PatternProps.isWhiteSpace(ch)) {
                if (buf == null) {
                    buf = new StringBuilder().append(source, 0, i);
                } else if (buf.charAt(buf.length() - 1) == ' ') {
                    // Previous output char is already a space: drop this one.
                    continue;
                }
                ch = ' '; // convert to ' '
            }
            if (buf != null) {
                buf.append(ch);
            }
        }
        return buf == null ? source : buf.toString();
    }

    //----------------------------------------------------------------
    // Property set API
    //----------------------------------------------------------------

    /**
     * Modifies this set to contain those code points which have the
     * given value for the given binary or enumerated property, as
     * returned by UCharacter.getIntPropertyValue.  Prior contents of
     * this set are lost.
3463 * 3464 * @param prop a property in the range 3465 * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or 3466 * UProperty.INT_START..UProperty.INT_LIMIT-1 or. 3467 * UProperty.MASK_START..UProperty.MASK_LIMIT-1. 3468 * 3469 * @param value a value in the range 3470 * UCharacter.getIntPropertyMinValue(prop).. 3471 * UCharacter.getIntPropertyMaxValue(prop), with one exception. 3472 * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be 3473 * a UCharacter.getType() result, but rather a mask value produced 3474 * by logically ORing (1 << UCharacter.getType()) values together. 3475 * This allows grouped categories such as [:L:] to be represented. 3476 * 3477 * @return a reference to this set 3478 * 3479 * @stable ICU 2.4 3480 */ applyIntPropertyValue(int prop, int value)3481 public UnicodeSet applyIntPropertyValue(int prop, int value) { 3482 // All of the following include checkFrozen() before modifying this set. 3483 if (prop == UProperty.GENERAL_CATEGORY_MASK) { 3484 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3485 applyFilter(new GeneralCategoryMaskFilter(value), inclusions); 3486 } else if (prop == UProperty.SCRIPT_EXTENSIONS) { 3487 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3488 applyFilter(new ScriptExtensionsFilter(value), inclusions); 3489 } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) { 3490 if (value == 0 || value == 1) { 3491 set(CharacterProperties.getBinaryPropertySet(prop)); 3492 if (value == 0) { 3493 complement().removeAllStrings(); // code point complement 3494 } 3495 } else { 3496 clear(); 3497 } 3498 } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) { 3499 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3500 applyFilter(new IntPropertyFilter(prop, value), inclusions); 3501 } else { 3502 throw new IllegalArgumentException("unsupported property " + prop); 3503 } 3504 return this; 3505 } 3506 3507 3508 3509 
/**
     * Modifies this set to contain those code points which have the
     * given value for the given property.  Prior contents of this
     * set are lost.
     *
     * <p>This is equivalent to calling the three-argument overload with a
     * null symbol table.
     *
     * @param propertyAlias a property alias, either short or long.
     * The name is matched loosely.  See PropertyAliases.txt for names
     * and a description of loose matching.  If the value string is
     * empty, then this string is interpreted as either a
     * General_Category value alias, a Script value alias, a binary
     * property alias, or a special ID.  Special IDs are matched
     * loosely and correspond to the following sets:
     *
     * "ANY" = [\\u0000-\\U0010FFFF],
     * "ASCII" = [\\u0000-\\u007F].
     *
     * @param valueAlias a value alias, either short or long.  The
     * name is matched loosely.  See PropertyValueAliases.txt for
     * names and a description of loose matching.  In addition to
     * aliases listed, numeric values and canonical combining classes
     * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc",
     * "220").  The value string may also be empty.
     *
     * @return a reference to this set
     *
     * @stable ICU 2.4
     */
    public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) {
        // No symbol table: delegate with null.
        return applyPropertyAlias(propertyAlias, valueAlias, null);
    }

    /**
     * Modifies this set to contain those code points which have the
     * given value for the given property.  Prior contents of this
     * set are lost.
     * @param propertyAlias A string of the property alias.
     * @param valueAlias A string of the value alias.
     * @param symbols if not null, then symbols are first called to see if a property
     * is available.  If true, then everything else is skipped.
3548 * @return this set 3549 * @stable ICU 3.2 3550 */ applyPropertyAlias(String propertyAlias, String valueAlias, SymbolTable symbols)3551 public UnicodeSet applyPropertyAlias(String propertyAlias, 3552 String valueAlias, SymbolTable symbols) { 3553 checkFrozen(); 3554 int p; 3555 int v; 3556 boolean invert = false; 3557 3558 if (symbols != null 3559 && (symbols instanceof XSymbolTable) 3560 && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) { 3561 return this; 3562 } 3563 3564 if (XSYMBOL_TABLE != null) { 3565 if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) { 3566 return this; 3567 } 3568 } 3569 3570 if (valueAlias.length() > 0) { 3571 p = UCharacter.getPropertyEnum(propertyAlias); 3572 3573 // Treat gc as gcm 3574 if (p == UProperty.GENERAL_CATEGORY) { 3575 p = UProperty.GENERAL_CATEGORY_MASK; 3576 } 3577 3578 if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) || 3579 (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) || 3580 (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) { 3581 try { 3582 v = UCharacter.getPropertyValueEnum(p, valueAlias); 3583 } catch (IllegalArgumentException e) { 3584 // Handle numeric CCC 3585 if (p == UProperty.CANONICAL_COMBINING_CLASS || 3586 p == UProperty.LEAD_CANONICAL_COMBINING_CLASS || 3587 p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) { 3588 v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias)); 3589 // Anything between 0 and 255 is valid even if unused. 
3590 if (v < 0 || v > 255) throw e; 3591 } else { 3592 throw e; 3593 } 3594 } 3595 } 3596 3597 else { 3598 switch (p) { 3599 case UProperty.NUMERIC_VALUE: 3600 { 3601 double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias)); 3602 applyFilter(new NumericValueFilter(value), 3603 CharacterPropertiesImpl.getInclusionsForProperty(p)); 3604 return this; 3605 } 3606 case UProperty.NAME: 3607 { 3608 // Must munge name, since 3609 // UCharacter.charFromName() does not do 3610 // 'loose' matching. 3611 String buf = mungeCharName(valueAlias); 3612 int ch = UCharacter.getCharFromExtendedName(buf); 3613 if (ch == -1) { 3614 throw new IllegalArgumentException("Invalid character name"); 3615 } 3616 clear(); 3617 add_unchecked(ch); 3618 return this; 3619 } 3620 case UProperty.UNICODE_1_NAME: 3621 // ICU 49 deprecates the Unicode_1_Name property APIs. 3622 throw new IllegalArgumentException("Unicode_1_Name (na1) not supported"); 3623 case UProperty.AGE: 3624 { 3625 // Must munge name, since 3626 // VersionInfo.getInstance() does not do 3627 // 'loose' matching. 3628 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias)); 3629 applyFilter(new VersionFilter(version), 3630 CharacterPropertiesImpl.getInclusionsForProperty(p)); 3631 return this; 3632 } 3633 case UProperty.SCRIPT_EXTENSIONS: 3634 v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias); 3635 // fall through to calling applyIntPropertyValue() 3636 break; 3637 default: 3638 // p is a non-binary, non-enumerated property that we 3639 // don't support (yet). 3640 throw new IllegalArgumentException("Unsupported property"); 3641 } 3642 } 3643 } 3644 3645 else { 3646 // valueAlias is empty. Interpret as General Category, Script, 3647 // Binary property, or ANY or ASCII. Upon success, p and v will 3648 // be set. 
3649 UPropertyAliases pnames = UPropertyAliases.INSTANCE; 3650 p = UProperty.GENERAL_CATEGORY_MASK; 3651 v = pnames.getPropertyValueEnum(p, propertyAlias); 3652 if (v == UProperty.UNDEFINED) { 3653 p = UProperty.SCRIPT; 3654 v = pnames.getPropertyValueEnum(p, propertyAlias); 3655 if (v == UProperty.UNDEFINED) { 3656 p = pnames.getPropertyEnum(propertyAlias); 3657 if (p == UProperty.UNDEFINED) { 3658 p = -1; 3659 } 3660 if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) { 3661 v = 1; 3662 } else if (p == -1) { 3663 if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) { 3664 set(MIN_VALUE, MAX_VALUE); 3665 return this; 3666 } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) { 3667 set(0, 0x7F); 3668 return this; 3669 } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) { 3670 // [:Assigned:]=[:^Cn:] 3671 p = UProperty.GENERAL_CATEGORY_MASK; 3672 v = (1<<UCharacter.UNASSIGNED); 3673 invert = true; 3674 } else { 3675 // Property name was never matched. 3676 throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias); 3677 } 3678 } else { 3679 // Valid property name, but it isn't binary, so the value 3680 // must be supplied. 3681 throw new IllegalArgumentException("Missing property value"); 3682 } 3683 } 3684 } 3685 } 3686 3687 applyIntPropertyValue(p, v); 3688 if(invert) { 3689 complement().removeAllStrings(); // code point complement 3690 } 3691 3692 return this; 3693 } 3694 3695 //---------------------------------------------------------------- 3696 // Property set patterns 3697 //---------------------------------------------------------------- 3698 3699 /** 3700 * Return true if the given position, in the given pattern, appears 3701 * to be the start of a property set pattern. 
3702 */ resemblesPropertyPattern(String pattern, int pos)3703 private static boolean resemblesPropertyPattern(String pattern, int pos) { 3704 // Patterns are at least 5 characters long 3705 if ((pos+5) > pattern.length()) { 3706 return false; 3707 } 3708 3709 // Look for an opening [:, [:^, \p, or \P 3710 return pattern.regionMatches(pos, "[:", 0, 2) || 3711 pattern.regionMatches(true, pos, "\\p", 0, 2) || 3712 pattern.regionMatches(pos, "\\N", 0, 2); 3713 } 3714 3715 /** 3716 * Return true if the given iterator appears to point at a 3717 * property pattern. Regardless of the result, return with the 3718 * iterator unchanged. 3719 * @param chars iterator over the pattern characters. Upon return 3720 * it will be unchanged. 3721 * @param iterOpts RuleCharacterIterator options 3722 */ resemblesPropertyPattern(RuleCharacterIterator chars, int iterOpts)3723 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, 3724 int iterOpts) { 3725 boolean result = false; 3726 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; 3727 RuleCharacterIterator.Position pos = chars.getPos(null); 3728 int c = chars.next(iterOpts); 3729 if (c == '[' || c == '\\') { 3730 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); 3731 result = (c == '[') ? (d == ':') : 3732 (d == 'N' || d == 'p' || d == 'P'); 3733 } 3734 chars.setPos(pos); 3735 return result; 3736 } 3737 3738 /** 3739 * Parse the given property pattern at the given parse position. 3740 * @param symbols TODO 3741 */ applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols)3742 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { 3743 int pos = ppos.getIndex(); 3744 3745 // On entry, ppos should point to one of the following locations: 3746 3747 // Minimum length is 5 characters, e.g. 
\p{L} 3748 if ((pos+5) > pattern.length()) { 3749 return null; 3750 } 3751 3752 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 3753 boolean isName = false; // true for \N{pat}, o/w false 3754 boolean invert = false; 3755 3756 // Look for an opening [:, [:^, \p, or \P 3757 if (pattern.regionMatches(pos, "[:", 0, 2)) { 3758 posix = true; 3759 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3760 if (pos < pattern.length() && pattern.charAt(pos) == '^') { 3761 ++pos; 3762 invert = true; 3763 } 3764 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || 3765 pattern.regionMatches(pos, "\\N", 0, 2)) { 3766 char c = pattern.charAt(pos+1); 3767 invert = (c == 'P'); 3768 isName = (c == 'N'); 3769 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3770 if (pos == pattern.length() || pattern.charAt(pos++) != '{') { 3771 // Syntax error; "\p" or "\P" not followed by "{" 3772 return null; 3773 } 3774 } else { 3775 // Open delimiter not seen 3776 return null; 3777 } 3778 3779 // Look for the matching close delimiter, either :] or } 3780 int close = pattern.indexOf(posix ? ":]" : "}", pos); 3781 if (close < 0) { 3782 // Syntax error; close delimiter missing 3783 return null; 3784 } 3785 3786 // Look for an '=' sign. If this is present, we will parse a 3787 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 3788 // pattern. 
3789 int equals = pattern.indexOf('=', pos); 3790 String propName, valueName; 3791 if (equals >= 0 && equals < close && !isName) { 3792 // Equals seen; parse medium/long pattern 3793 propName = pattern.substring(pos, equals); 3794 valueName = pattern.substring(equals+1, close); 3795 } 3796 3797 else { 3798 // Handle case where no '=' is seen, and \N{} 3799 propName = pattern.substring(pos, close); 3800 valueName = ""; 3801 3802 // Handle \N{name} 3803 if (isName) { 3804 // This is a little inefficient since it means we have to 3805 // parse "na" back to UProperty.NAME even though we already 3806 // know it's UProperty.NAME. If we refactor the API to 3807 // support args of (int, String) then we can remove 3808 // "na" and make this a little more efficient. 3809 valueName = propName; 3810 propName = "na"; 3811 } 3812 } 3813 3814 applyPropertyAlias(propName, valueName, symbols); 3815 3816 if (invert) { 3817 complement().removeAllStrings(); // code point complement 3818 } 3819 3820 // Move to the limit position after the close delimiter 3821 ppos.setIndex(close + (posix ? 2 : 1)); 3822 3823 return this; 3824 } 3825 3826 /** 3827 * Parse a property pattern. 3828 * @param chars iterator over the pattern characters. Upon return 3829 * it will be advanced to the first character after the parsed 3830 * pattern, or the end of the iteration if all characters are 3831 * parsed. 3832 * @param rebuiltPat the pattern that was parsed, rebuilt or 3833 * copied from the input pattern, as appropriate. 
3834 * @param symbols TODO 3835 */ applyPropertyPattern(RuleCharacterIterator chars, Appendable rebuiltPat, SymbolTable symbols)3836 private void applyPropertyPattern(RuleCharacterIterator chars, 3837 Appendable rebuiltPat, SymbolTable symbols) { 3838 String patStr = chars.getCurrentBuffer(); 3839 int start = chars.getCurrentBufferPos(); 3840 ParsePosition pos = new ParsePosition(start); 3841 applyPropertyPattern(patStr, pos, symbols); 3842 int length = pos.getIndex() - start; 3843 if (length == 0) { 3844 syntaxError(chars, "Invalid property pattern"); 3845 } 3846 chars.jumpahead(length); 3847 append(rebuiltPat, patStr.substring(start, pos.getIndex())); 3848 } 3849 3850 //---------------------------------------------------------------- 3851 // Case folding API 3852 //---------------------------------------------------------------- 3853 3854 /** 3855 * Bitmask for constructor and applyPattern() indicating that 3856 * white space should be ignored. If set, ignore Unicode Pattern_White_Space characters, 3857 * unless they are quoted or escaped. This may be ORed together 3858 * with other selectors. 3859 * @stable ICU 3.8 3860 */ 3861 public static final int IGNORE_SPACE = 1; 3862 3863 /** 3864 * Bitmask for constructor, applyPattern(), and closeOver() 3865 * indicating letter case. This may be ORed together with other 3866 * selectors. 3867 * 3868 * Enable case insensitive matching. E.g., "[ab]" with this flag 3869 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3870 * match all except 'a', 'A', 'b', and 'B'. This performs a full 3871 * closure over case mappings, e.g. U+017F for s. 3872 * 3873 * The resulting set is a superset of the input for the code points but 3874 * not for the strings. 3875 * It performs a case mapping closure of the code points and adds 3876 * full case folding strings for the code points, and reduces strings of 3877 * the original set to their full case folding equivalents. 
3878 * 3879 * This is designed for case-insensitive matches, for example 3880 * in regular expressions. The full code point case closure allows checking of 3881 * an input character directly against the closure set. 3882 * Strings are matched by comparing the case-folded form from the closure 3883 * set with an incremental case folding of the string in question. 3884 * 3885 * The closure set will also contain single code points if the original 3886 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 3887 * This is not necessary (that is, redundant) for the above matching method 3888 * but results in the same closure sets regardless of whether the original 3889 * set contained the code point or a string. 3890 * @stable ICU 3.8 3891 */ 3892 public static final int CASE = 2; 3893 3894 /** 3895 * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C 3896 * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h). 3897 * @see #CASE 3898 * @stable ICU 3.4 3899 */ 3900 public static final int CASE_INSENSITIVE = 2; 3901 3902 /** 3903 * Bitmask for constructor, applyPattern(), and closeOver() 3904 * indicating letter case. This may be ORed together with other 3905 * selectors. 3906 * 3907 * Enable case insensitive matching. E.g., "[ab]" with this flag 3908 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3909 * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, 3910 * title-, and uppercase mappings as well as the case folding 3911 * of each existing element in the set. 
3912 * @stable ICU 3.4 3913 */ 3914 public static final int ADD_CASE_MAPPINGS = 4; 3915 3916 // add the result of a full case mapping to the set 3917 // use str as a temporary string to avoid constructing one addCaseMapping(UnicodeSet set, int result, StringBuilder full)3918 private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) { 3919 if(result >= 0) { 3920 if(result > UCaseProps.MAX_STRING_LENGTH) { 3921 // add a single-code point case mapping 3922 set.add(result); 3923 } else { 3924 // add a string case mapping from full with length result 3925 set.add(full.toString()); 3926 full.setLength(0); 3927 } 3928 } 3929 // result < 0: the code point mapped to itself, no need to add it 3930 // see UCaseProps 3931 } 3932 3933 /** 3934 * Close this set over the given attribute. For the attribute 3935 * CASE, the result is to modify this set so that: 3936 * 3937 * 1. For each character or string 'a' in this set, all strings 3938 * 'b' such that foldCase(a) == foldCase(b) are added to this set. 3939 * (For most 'a' that are single characters, 'b' will have 3940 * b.length() == 1.) 3941 * 3942 * 2. For each string 'e' in the resulting set, if e != 3943 * foldCase(e), 'e' will be removed. 3944 * 3945 * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] 3946 * 3947 * (Here foldCase(x) refers to the operation 3948 * UCharacter.foldCase(x, true), and a == b actually denotes 3949 * a.equals(b), not pointer comparison.) 3950 * 3951 * @param attribute bitmask for attributes to close over. 3952 * Currently only the CASE bit is supported. Any undefined bits 3953 * are ignored. 3954 * @return a reference to this set. 
3955 * @stable ICU 3.8 3956 */ closeOver(int attribute)3957 public UnicodeSet closeOver(int attribute) { 3958 checkFrozen(); 3959 if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) { 3960 UCaseProps csp = UCaseProps.INSTANCE; 3961 UnicodeSet foldSet = new UnicodeSet(this); 3962 ULocale root = ULocale.ROOT; 3963 3964 // start with input set to guarantee inclusion 3965 // CASE: remove strings because the strings will actually be reduced (folded); 3966 // therefore, start with no strings and add only those needed 3967 if((attribute & CASE) != 0 && foldSet.hasStrings()) { 3968 foldSet.strings.clear(); 3969 } 3970 3971 int n = getRangeCount(); 3972 int result; 3973 StringBuilder full = new StringBuilder(); 3974 3975 for (int i=0; i<n; ++i) { 3976 int start = getRangeStart(i); 3977 int end = getRangeEnd(i); 3978 3979 if((attribute & CASE) != 0) { 3980 // full case closure 3981 for (int cp=start; cp<=end; ++cp) { 3982 csp.addCaseClosure(cp, foldSet); 3983 } 3984 } else { 3985 // add case mappings 3986 // (does not add long s for regular s, or Kelvin for k, for example) 3987 for (int cp=start; cp<=end; ++cp) { 3988 result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); 3989 addCaseMapping(foldSet, result, full); 3990 3991 result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); 3992 addCaseMapping(foldSet, result, full); 3993 3994 result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); 3995 addCaseMapping(foldSet, result, full); 3996 3997 result = csp.toFullFolding(cp, full, 0); 3998 addCaseMapping(foldSet, result, full); 3999 } 4000 } 4001 } 4002 if (hasStrings()) { 4003 if ((attribute & CASE) != 0) { 4004 for (String s : strings) { 4005 String str = UCharacter.foldCase(s, 0); 4006 if(!csp.addStringCaseClosure(str, foldSet)) { 4007 foldSet.add(str); // does not map to code points: add the folded string itself 4008 } 4009 } 4010 } else { 4011 BreakIterator bi = BreakIterator.getWordInstance(root); 4012 for (String str : strings) { 4013 // TODO: call 
lower-level functions 4014 foldSet.add(UCharacter.toLowerCase(root, str)); 4015 foldSet.add(UCharacter.toTitleCase(root, str, bi)); 4016 foldSet.add(UCharacter.toUpperCase(root, str)); 4017 foldSet.add(UCharacter.foldCase(str, 0)); 4018 } 4019 } 4020 } 4021 set(foldSet); 4022 } 4023 return this; 4024 } 4025 4026 /** 4027 * Internal class for customizing UnicodeSet parsing of properties. 4028 * TODO: extend to allow customizing of codepoint ranges 4029 * @draft ICU3.8 (retain) 4030 * @author medavis 4031 */ 4032 abstract public static class XSymbolTable implements SymbolTable { 4033 /** 4034 * Default constructor 4035 * @draft ICU3.8 (retain) 4036 */ XSymbolTable()4037 public XSymbolTable(){} 4038 /** 4039 * Supplies default implementation for SymbolTable (no action). 4040 * @draft ICU3.8 (retain) 4041 */ 4042 @Override lookupMatcher(int i)4043 public UnicodeMatcher lookupMatcher(int i) { 4044 return null; 4045 } 4046 4047 /** 4048 * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style 4049 * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be. 4050 * <p> 4051 * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a 4052 * propertyName+propertyValue combination. 4053 * 4054 * @param propertyName 4055 * the name of the property 4056 * @param propertyValue 4057 * the name of the property value 4058 * @param result UnicodeSet value to change 4059 * a set to which the characters having the propertyName+propertyValue are to be added. 4060 * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters 4061 * with that property have been added to the UnicodeSet, and returns false if the 4062 * propertyName+propertyValue combination is not recognized (in which case result is unaltered). 
4063 * @draft ICU3.8 (retain) 4064 */ applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result)4065 public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { 4066 return false; 4067 } 4068 /** 4069 * Supplies default implementation for SymbolTable (no action). 4070 * @draft ICU3.8 (retain) 4071 */ 4072 @Override lookup(String s)4073 public char[] lookup(String s) { 4074 return null; 4075 } 4076 /** 4077 * Supplies default implementation for SymbolTable (no action). 4078 * @draft ICU3.8 (retain) 4079 */ 4080 @Override parseReference(String text, ParsePosition pos, int limit)4081 public String parseReference(String text, ParsePosition pos, int limit) { 4082 return null; 4083 } 4084 } 4085 4086 /** 4087 * Is this frozen, according to the Freezable interface? 4088 * 4089 * @return value 4090 * @stable ICU 3.8 4091 */ 4092 @Override isFrozen()4093 public boolean isFrozen() { 4094 return (bmpSet != null || stringSpan != null); 4095 } 4096 4097 /** 4098 * Freeze this class, according to the Freezable interface. 4099 * 4100 * @return this 4101 * @stable ICU 4.4 4102 */ 4103 @Override freeze()4104 public UnicodeSet freeze() { 4105 if (!isFrozen()) { 4106 compact(); 4107 4108 // Optimize contains() and span() and similar functions. 4109 if (hasStrings()) { 4110 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), UnicodeSetStringSpan.ALL); 4111 } 4112 if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { 4113 // Optimize for code point spans. 4114 // There are no strings, or 4115 // all strings are irrelevant for span() etc. because 4116 // all of each string's code points are contained in this set. 4117 // However, fully contained strings are relevant for spanAndCount(), 4118 // so we create both objects. 4119 bmpSet = new BMPSet(list, len); 4120 } 4121 } 4122 return this; 4123 } 4124 4125 /** 4126 * Span a string using this UnicodeSet. 
4127 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4128 * @param s The string to be spanned 4129 * @param spanCondition The span condition 4130 * @return the length of the span 4131 * @stable ICU 4.4 4132 */ span(CharSequence s, SpanCondition spanCondition)4133 public int span(CharSequence s, SpanCondition spanCondition) { 4134 return span(s, 0, spanCondition); 4135 } 4136 4137 /** 4138 * Span a string using this UnicodeSet. 4139 * If the start index is less than 0, span will start from 0. 4140 * If the start index is greater than the string length, span returns the string length. 4141 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4142 * @param s The string to be spanned 4143 * @param start The start index that the span begins 4144 * @param spanCondition The span condition 4145 * @return the string index which ends the span (i.e. exclusive) 4146 * @stable ICU 4.4 4147 */ span(CharSequence s, int start, SpanCondition spanCondition)4148 public int span(CharSequence s, int start, SpanCondition spanCondition) { 4149 int end = s.length(); 4150 if (start < 0) { 4151 start = 0; 4152 } else if (start >= end) { 4153 return end; 4154 } 4155 if (bmpSet != null) { 4156 // Frozen set without strings, or no string is relevant for span(). 4157 return bmpSet.span(s, start, spanCondition, null); 4158 } 4159 if (stringSpan != null) { 4160 return stringSpan.span(s, start, spanCondition); 4161 } else if (hasStrings()) { 4162 int which = spanCondition == SpanCondition.NOT_CONTAINED ? 
UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4163 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4164 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4165 if (strSpan.needsStringSpanUTF16()) { 4166 return strSpan.span(s, start, spanCondition); 4167 } 4168 } 4169 4170 return spanCodePointsAndCount(s, start, spanCondition, null); 4171 } 4172 4173 /** 4174 * Same as span() but also counts the smallest number of set elements on any path across the span. 4175 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4176 * @param outCount An output-only object (must not be null) for returning the count. 4177 * @return the limit (exclusive end) of the span 4178 * @internal 4179 * @deprecated This API is ICU internal only. 4180 */ 4181 @Deprecated spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4182 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { 4183 if (outCount == null) { 4184 throw new IllegalArgumentException("outCount must not be null"); 4185 } 4186 int end = s.length(); 4187 if (start < 0) { 4188 start = 0; 4189 } else if (start >= end) { 4190 return end; 4191 } 4192 if (stringSpan != null) { 4193 // We might also have bmpSet != null, 4194 // but fully-contained strings are relevant for counting elements. 4195 return stringSpan.spanAndCount(s, start, spanCondition, outCount); 4196 } else if (bmpSet != null) { 4197 return bmpSet.span(s, start, spanCondition, outCount); 4198 } else if (hasStrings()) { 4199 int which = spanCondition == SpanCondition.NOT_CONTAINED ? 
UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4200 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4201 which |= UnicodeSetStringSpan.WITH_COUNT; 4202 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4203 return strSpan.spanAndCount(s, start, spanCondition, outCount); 4204 } 4205 4206 return spanCodePointsAndCount(s, start, spanCondition, outCount); 4207 } 4208 spanCodePointsAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4209 private int spanCodePointsAndCount(CharSequence s, int start, 4210 SpanCondition spanCondition, OutputInt outCount) { 4211 // Pin to 0/1 values. 4212 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4213 4214 int c; 4215 int next = start; 4216 int length = s.length(); 4217 int count = 0; 4218 do { 4219 c = Character.codePointAt(s, next); 4220 if (spanContained != contains(c)) { 4221 break; 4222 } 4223 ++count; 4224 next += Character.charCount(c); 4225 } while (next < length); 4226 if (outCount != null) { outCount.value = count; } 4227 return next; 4228 } 4229 4230 /** 4231 * Span a string backwards (from the end) using this UnicodeSet. 4232 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4233 * @param s The string to be spanned 4234 * @param spanCondition The span condition 4235 * @return The string index which starts the span (i.e. inclusive). 4236 * @stable ICU 4.4 4237 */ spanBack(CharSequence s, SpanCondition spanCondition)4238 public int spanBack(CharSequence s, SpanCondition spanCondition) { 4239 return spanBack(s, s.length(), spanCondition); 4240 } 4241 4242 /** 4243 * Span a string backwards (from the fromIndex) using this UnicodeSet. 4244 * If the fromIndex is less than 0, spanBack will return 0. 4245 * If fromIndex is greater than the string length, spanBack will start from the string length. 
4246 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4247 * @param s The string to be spanned 4248 * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards 4249 * @param spanCondition The span condition 4250 * @return The string index which starts the span (i.e. inclusive). 4251 * @stable ICU 4.4 4252 */ spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition)4253 public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { 4254 if (fromIndex <= 0) { 4255 return 0; 4256 } 4257 if (fromIndex > s.length()) { 4258 fromIndex = s.length(); 4259 } 4260 if (bmpSet != null) { 4261 // Frozen set without strings, or no string is relevant for spanBack(). 4262 return bmpSet.spanBack(s, fromIndex, spanCondition); 4263 } 4264 if (stringSpan != null) { 4265 return stringSpan.spanBack(s, fromIndex, spanCondition); 4266 } else if (hasStrings()) { 4267 int which = (spanCondition == SpanCondition.NOT_CONTAINED) 4268 ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED 4269 : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; 4270 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4271 if (strSpan.needsStringSpanUTF16()) { 4272 return strSpan.spanBack(s, fromIndex, spanCondition); 4273 } 4274 } 4275 4276 // Pin to 0/1 values. 4277 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4278 4279 int c; 4280 int prev = fromIndex; 4281 do { 4282 c = Character.codePointBefore(s, prev); 4283 if (spanContained != contains(c)) { 4284 break; 4285 } 4286 prev -= Character.charCount(c); 4287 } while (prev > 0); 4288 return prev; 4289 } 4290 4291 /** 4292 * Clone a thawed version of this class, according to the Freezable interface. 
4293 * @return the clone, not frozen 4294 * @stable ICU 4.4 4295 */ 4296 @Override cloneAsThawed()4297 public UnicodeSet cloneAsThawed() { 4298 UnicodeSet result = new UnicodeSet(this); 4299 assert !result.isFrozen(); 4300 return result; 4301 } 4302 4303 // internal function checkFrozen()4304 private void checkFrozen() { 4305 if (isFrozen()) { 4306 throw new UnsupportedOperationException("Attempt to modify frozen object"); 4307 } 4308 } 4309 4310 // ************************ 4311 // Additional methods for integration with Generics and Collections 4312 // ************************ 4313 4314 /** 4315 * A struct-like class used for iteration through ranges, for faster iteration than by String. 4316 * Read about the restrictions on usage in {@link UnicodeSet#ranges()}. 4317 * 4318 * @stable ICU 54 4319 */ 4320 public static class EntryRange { 4321 /** 4322 * The starting code point of the range. 4323 * 4324 * @stable ICU 54 4325 */ 4326 public int codepoint; 4327 /** 4328 * The ending code point of the range 4329 * 4330 * @stable ICU 54 4331 */ 4332 public int codepointEnd; 4333 EntryRange()4334 EntryRange() { 4335 } 4336 4337 /** 4338 * {@inheritDoc} 4339 * 4340 * @stable ICU 54 4341 */ 4342 @Override toString()4343 public String toString() { 4344 StringBuilder b = new StringBuilder(); 4345 return ( 4346 codepoint == codepointEnd ? _appendToPat(b, codepoint, false) 4347 : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false)) 4348 .toString(); 4349 } 4350 } 4351 4352 /** 4353 * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points. 4354 * The UnicodeSet must not be altered during the iteration. 4355 * The EntryRange instance is the same each time; the contents are just reset. 4356 * 4357 * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings. 4358 * 4359 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 
4360 * Do not alter the UnicodeSet while iterating. 4361 * 4362 * <pre> 4363 * // Sample code 4364 * for (EntryRange range : us1.ranges()) { 4365 * // do something with code points between range.codepoint and range.codepointEnd; 4366 * } 4367 * for (String s : us1.strings()) { 4368 * // do something with each string; 4369 * } 4370 * </pre> 4371 * 4372 * @stable ICU 54 4373 */ ranges()4374 public Iterable<EntryRange> ranges() { 4375 return new EntryRangeIterable(); 4376 } 4377 4378 private class EntryRangeIterable implements Iterable<EntryRange> { 4379 @Override iterator()4380 public Iterator<EntryRange> iterator() { 4381 return new EntryRangeIterator(); 4382 } 4383 } 4384 4385 private class EntryRangeIterator implements Iterator<EntryRange> { 4386 int pos; 4387 EntryRange result = new EntryRange(); 4388 4389 @Override hasNext()4390 public boolean hasNext() { 4391 return pos < len-1; 4392 } 4393 @Override next()4394 public EntryRange next() { 4395 if (pos < len-1) { 4396 result.codepoint = list[pos++]; 4397 result.codepointEnd = list[pos++]-1; 4398 } else { 4399 throw new NoSuchElementException(); 4400 } 4401 return result; 4402 } 4403 @Override remove()4404 public void remove() { 4405 throw new UnsupportedOperationException(); 4406 } 4407 } 4408 4409 4410 /** 4411 * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}. 4412 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4413 * Do not alter the UnicodeSet while iterating. 4414 * @see java.util.Set#iterator() 4415 * @stable ICU 4.4 4416 */ 4417 @Override iterator()4418 public Iterator<String> iterator() { 4419 return new UnicodeSetIterator2(this); 4420 } 4421 4422 // Cover for string iteration. 
4423 private static class UnicodeSetIterator2 implements Iterator<String> { 4424 // Invariants: 4425 // sourceList != null then sourceList[item] is a valid character 4426 // sourceList == null then delegates to stringIterator 4427 private int[] sourceList; 4428 private int len; 4429 private int item; 4430 private int current; 4431 private int limit; 4432 private SortedSet<String> sourceStrings; 4433 private Iterator<String> stringIterator; 4434 private char[] buffer; 4435 UnicodeSetIterator2(UnicodeSet source)4436 UnicodeSetIterator2(UnicodeSet source) { 4437 // set according to invariants 4438 len = source.len - 1; 4439 if (len > 0) { 4440 sourceStrings = source.strings; 4441 sourceList = source.list; 4442 current = sourceList[item++]; 4443 limit = sourceList[item++]; 4444 } else { 4445 stringIterator = source.strings.iterator(); 4446 sourceList = null; 4447 } 4448 } 4449 4450 /* (non-Javadoc) 4451 * @see java.util.Iterator#hasNext() 4452 */ 4453 @Override hasNext()4454 public boolean hasNext() { 4455 return sourceList != null || stringIterator.hasNext(); 4456 } 4457 4458 /* (non-Javadoc) 4459 * @see java.util.Iterator#next() 4460 */ 4461 @Override next()4462 public String next() { 4463 if (sourceList == null) { 4464 return stringIterator.next(); 4465 } 4466 int codepoint = current++; 4467 // we have the codepoint we need, but we may need to adjust the state 4468 if (current >= limit) { 4469 if (item >= len) { 4470 stringIterator = sourceStrings.iterator(); 4471 sourceList = null; 4472 } else { 4473 current = sourceList[item++]; 4474 limit = sourceList[item++]; 4475 } 4476 } 4477 // Now return. Single code point is easy 4478 if (codepoint <= 0xFFFF) { 4479 return String.valueOf((char)codepoint); 4480 } 4481 // But Java lacks a valueOfCodePoint, so we handle ourselves for speed 4482 // allocate a buffer the first time, to make conversion faster. 
4483 if (buffer == null) { 4484 buffer = new char[2]; 4485 } 4486 // compute ourselves, to save tests and calls 4487 int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT; 4488 buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE); 4489 buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE); 4490 return String.valueOf(buffer); 4491 } 4492 4493 /* (non-Javadoc) 4494 * @see java.util.Iterator#remove() 4495 */ 4496 @Override remove()4497 public void remove() { 4498 throw new UnsupportedOperationException(); 4499 } 4500 } 4501 4502 /** 4503 * @see #containsAll(com.ibm.icu.text.UnicodeSet) 4504 * @stable ICU 4.4 4505 */ containsAll(Iterable<T> collection)4506 public <T extends CharSequence> boolean containsAll(Iterable<T> collection) { 4507 for (T o : collection) { 4508 if (!contains(o)) { 4509 return false; 4510 } 4511 } 4512 return true; 4513 } 4514 4515 /** 4516 * @see #containsNone(com.ibm.icu.text.UnicodeSet) 4517 * @stable ICU 4.4 4518 */ containsNone(Iterable<T> collection)4519 public <T extends CharSequence> boolean containsNone(Iterable<T> collection) { 4520 for (T o : collection) { 4521 if (contains(o)) { 4522 return false; 4523 } 4524 } 4525 return true; 4526 } 4527 4528 /** 4529 * @see #containsAll(com.ibm.icu.text.UnicodeSet) 4530 * @stable ICU 4.4 4531 */ containsSome(Iterable<T> collection)4532 public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) { 4533 return !containsNone(collection); 4534 } 4535 4536 /** 4537 * @see #addAll(com.ibm.icu.text.UnicodeSet) 4538 * @stable ICU 4.4 4539 */ 4540 @SuppressWarnings("unchecked") // See ticket #11395, this is safe. addAll(T... collection)4541 public <T extends CharSequence> UnicodeSet addAll(T... 
collection) { 4542 checkFrozen(); 4543 for (T str : collection) { 4544 add(str); 4545 } 4546 return this; 4547 } 4548 4549 4550 /** 4551 * @see #removeAll(com.ibm.icu.text.UnicodeSet) 4552 * @stable ICU 4.4 4553 */ removeAll(Iterable<T> collection)4554 public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) { 4555 checkFrozen(); 4556 for (T o : collection) { 4557 remove(o); 4558 } 4559 return this; 4560 } 4561 4562 /** 4563 * @see #retainAll(com.ibm.icu.text.UnicodeSet) 4564 * @stable ICU 4.4 4565 */ retainAll(Iterable<T> collection)4566 public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) { 4567 checkFrozen(); 4568 // TODO optimize 4569 UnicodeSet toRetain = new UnicodeSet(); 4570 toRetain.addAll(collection); 4571 retainAll(toRetain); 4572 return this; 4573 } 4574 4575 /** 4576 * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}. 4577 * @stable ICU 4.4 4578 */ 4579 public enum ComparisonStyle { 4580 /** 4581 * @stable ICU 4.4 4582 */ 4583 SHORTER_FIRST, 4584 /** 4585 * @stable ICU 4.4 4586 */ 4587 LEXICOGRAPHIC, 4588 /** 4589 * @stable ICU 4.4 4590 */ 4591 LONGER_FIRST 4592 } 4593 4594 /** 4595 * Compares UnicodeSets, where shorter come first, and otherwise lexicographically 4596 * (according to the comparison of the first characters that differ). 4597 * @see java.lang.Comparable#compareTo(java.lang.Object) 4598 * @stable ICU 4.4 4599 */ 4600 @Override compareTo(UnicodeSet o)4601 public int compareTo(UnicodeSet o) { 4602 return compareTo(o, ComparisonStyle.SHORTER_FIRST); 4603 } 4604 /** 4605 * Compares UnicodeSets, in three different ways. 
4606 * @see java.lang.Comparable#compareTo(java.lang.Object) 4607 * @stable ICU 4.4 4608 */ compareTo(UnicodeSet o, ComparisonStyle style)4609 public int compareTo(UnicodeSet o, ComparisonStyle style) { 4610 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4611 int diff = size() - o.size(); 4612 if (diff != 0) { 4613 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1; 4614 } 4615 } 4616 int result; 4617 for (int i = 0; ; ++i) { 4618 if (0 != (result = list[i] - o.list[i])) { 4619 // if either list ran out, compare to the last string 4620 if (list[i] == HIGH) { 4621 if (!hasStrings()) return 1; 4622 String item = strings.first(); 4623 return compare(item, o.list[i]); 4624 } 4625 if (o.list[i] == HIGH) { 4626 if (!o.hasStrings()) return -1; 4627 String item = o.strings.first(); 4628 int compareResult = compare(item, list[i]); 4629 return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order. 4630 } 4631 // otherwise return the result if even index, or the reversal if not 4632 return (i & 1) == 0 ? result : -result; 4633 } 4634 if (list[i] == HIGH) { 4635 break; 4636 } 4637 } 4638 return compare(strings, o.strings); 4639 } 4640 4641 /** 4642 * @stable ICU 4.4 4643 */ compareTo(Iterable<String> other)4644 public int compareTo(Iterable<String> other) { 4645 return compare(this, other); 4646 } 4647 4648 /** 4649 * Utility to compare a string to a code point. 4650 * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString()) 4651 * and comparing, but much faster (no object creation). 4652 * Actually, there is one difference; a null compares as less. 4653 * Note that this (=String) order is UTF-16 order -- *not* code point order. 
4654 * @stable ICU 4.4 4655 */ 4656 compare(CharSequence string, int codePoint)4657 public static int compare(CharSequence string, int codePoint) { 4658 return CharSequences.compare(string, codePoint); 4659 } 4660 4661 /** 4662 * Utility to compare a string to a code point. 4663 * Same results as turning the code point into a string and comparing, but much faster (no object creation). 4664 * Actually, there is one difference; a null compares as less. 4665 * Note that this (=String) order is UTF-16 order -- *not* code point order. 4666 * @stable ICU 4.4 4667 */ compare(int codePoint, CharSequence string)4668 public static int compare(int codePoint, CharSequence string) { 4669 return -CharSequences.compare(string, codePoint); 4670 } 4671 4672 4673 /** 4674 * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered, 4675 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4676 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4677 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4678 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4679 * @stable ICU 4.4 4680 */ compare(Iterable<T> collection1, Iterable<T> collection2)4681 public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) { 4682 return compare(collection1.iterator(), collection2.iterator()); 4683 } 4684 4685 /** 4686 * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered, 4687 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4688 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4689 * (or with the same) comparator. 
Unfortunately, it is impossible to reliably detect in Java whether subclass of 4690 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4691 * @internal 4692 * @deprecated This API is ICU internal only. 4693 */ 4694 @Deprecated compare(Iterator<T> first, Iterator<T> other)4695 public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) { 4696 while (true) { 4697 if (!first.hasNext()) { 4698 return other.hasNext() ? -1 : 0; 4699 } else if (!other.hasNext()) { 4700 return 1; 4701 } 4702 T item1 = first.next(); 4703 T item2 = other.next(); 4704 int result = item1.compareTo(item2); 4705 if (result != 0) { 4706 return result; 4707 } 4708 } 4709 } 4710 4711 4712 /** 4713 * Utility to compare two collections, optionally by size, and then lexicographically. 4714 * @stable ICU 4.4 4715 */ compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style)4716 public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) { 4717 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4718 int diff = collection1.size() - collection2.size(); 4719 if (diff != 0) { 4720 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1; 4721 } 4722 } 4723 return compare(collection1, collection2); 4724 } 4725 4726 /** 4727 * Utility for adding the contents of an iterable to a collection. 4728 * @stable ICU 4.4 4729 */ addAllTo(Iterable<T> source, U target)4730 public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) { 4731 for (T item : source) { 4732 target.add(item); 4733 } 4734 return target; 4735 } 4736 4737 /** 4738 * Utility for adding the contents of an iterable to a collection. 
4739 * @stable ICU 4.4 4740 */ addAllTo(Iterable<T> source, T[] target)4741 public static <T> T[] addAllTo(Iterable<T> source, T[] target) { 4742 int i = 0; 4743 for (T item : source) { 4744 target[i++] = item; 4745 } 4746 return target; 4747 } 4748 4749 /** 4750 * For iterating through the strings in the set. Example: 4751 * <pre> 4752 * for (String key : myUnicodeSet.strings()) { 4753 * doSomethingWith(key); 4754 * } 4755 * </pre> 4756 * @stable ICU 4.4 4757 */ strings()4758 public Collection<String> strings() { 4759 if (hasStrings()) { 4760 return Collections.unmodifiableSortedSet(strings); 4761 } else { 4762 return EMPTY_STRINGS; 4763 } 4764 } 4765 4766 /** 4767 * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE. 4768 * @internal 4769 * @deprecated This API is ICU internal only. 4770 */ 4771 @Deprecated getSingleCodePoint(CharSequence s)4772 public static int getSingleCodePoint(CharSequence s) { 4773 return CharSequences.getSingleCodePoint(s); 4774 } 4775 4776 /** 4777 * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set. 4778 * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E 4779 * if the dontCare set includes unassigned characters (for a particular version of Unicode). 4780 * @param dontCare Set with the don't-care characters for spanning 4781 * @return the input set, modified 4782 * @internal 4783 * @deprecated This API is ICU internal only. 
4784 */ 4785 @Deprecated addBridges(UnicodeSet dontCare)4786 public UnicodeSet addBridges(UnicodeSet dontCare) { 4787 UnicodeSet notInInput = new UnicodeSet(this).complement().removeAllStrings(); 4788 for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) { 4789 if (it.codepoint != 0 && it.codepointEnd != 0x10FFFF && 4790 dontCare.contains(it.codepoint, it.codepointEnd)) { 4791 add(it.codepoint,it.codepointEnd); 4792 } 4793 } 4794 return this; 4795 } 4796 4797 /** 4798 * Find the first index at or after fromIndex where the UnicodeSet matches at that index. 4799 * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match. 4800 * If there is no match, length is returned. 4801 * @internal 4802 * @deprecated This API is ICU internal only. Use span instead. 4803 */ 4804 @Deprecated findIn(CharSequence value, int fromIndex, boolean findNot)4805 public int findIn(CharSequence value, int fromIndex, boolean findNot) { 4806 //TODO add strings, optimize, using ICU4C algorithms 4807 int cp; 4808 for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) { 4809 cp = UTF16.charAt(value, fromIndex); 4810 if (contains(cp) != findNot) { 4811 break; 4812 } 4813 } 4814 return fromIndex; 4815 } 4816 4817 /** 4818 * Find the last index before fromIndex where the UnicodeSet matches at that index. 4819 * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match. 4820 * If there is no match, -1 is returned. 4821 * BEFORE index is not in the UnicodeSet. 4822 * @internal 4823 * @deprecated This API is ICU internal only. Use spanBack instead. 
4824 */ 4825 @Deprecated findLastIn(CharSequence value, int fromIndex, boolean findNot)4826 public int findLastIn(CharSequence value, int fromIndex, boolean findNot) { 4827 //TODO add strings, optimize, using ICU4C algorithms 4828 int cp; 4829 fromIndex -= 1; 4830 for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) { 4831 cp = UTF16.charAt(value, fromIndex); 4832 if (contains(cp) != findNot) { 4833 break; 4834 } 4835 } 4836 return fromIndex < 0 ? -1 : fromIndex; 4837 } 4838 4839 /** 4840 * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match. 4841 * @param source The source of the CharSequence to strip from. 4842 * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object. 4843 * @return The string after it has been stripped. 4844 * @internal 4845 * @deprecated This API is ICU internal only. Use replaceFrom. 4846 */ 4847 @Deprecated stripFrom(CharSequence source, boolean matches)4848 public String stripFrom(CharSequence source, boolean matches) { 4849 StringBuilder result = new StringBuilder(); 4850 for (int pos = 0; pos < source.length();) { 4851 int inside = findIn(source, pos, !matches); 4852 result.append(source.subSequence(pos, inside)); 4853 pos = findIn(source, inside, matches); // get next start 4854 } 4855 return result.toString(); 4856 } 4857 4858 /** 4859 * Argument values for whether span() and similar functions continue while the current character is contained vs. 4860 * not contained in the set. 4861 * <p> 4862 * The functionality is straightforward for sets with only single code points, without strings (which is the common 4863 * case): 4864 * <ul> 4865 * <li>CONTAINED and SIMPLE work the same. 4866 * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED. 
4867 * <li>span() and spanBack() partition any string the 4868 * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition). 4869 * <li>Using a 4870 * complemented (inverted) set and the opposite span conditions yields the same results. 4871 * </ul> 4872 * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in 4873 * the set (for example, whether they overlap with each other) and the string that is processed. For a set with 4874 * strings: 4875 * <ul> 4876 * <li>The complement of the set contains the opposite set of code points, but the same set of strings. 4877 * Therefore, complementing both the set and the span conditions may yield different results. 4878 * <li>When starting spans 4879 * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 4880 * because a set string may start before the later position. 4881 * <li>span(SIMPLE) may be shorter than 4882 * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which 4883 * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax", 4884 * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED). 4885 * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, 4886 * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield 4887 * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }. 4888 * </ul> 4889 * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then 4890 * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could 4891 * be used. 
4892 * <p> 4893 * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point 4894 * boundaries, never in the middle of a surrogate pair. 4895 * 4896 * @stable ICU 4.4 4897 */ 4898 public enum SpanCondition { 4899 /** 4900 * Continues a span() while there is no set element at the current position. 4901 * Increments by one code point at a time. 4902 * Stops before the first set element (character or string). 4903 * (For code points only, this is like while contains(current)==false). 4904 * <p> 4905 * When span() returns, the substring between where it started and the position it returned consists only of 4906 * characters that are not in the set, and none of its strings overlap with the span. 4907 * 4908 * @stable ICU 4.4 4909 */ 4910 NOT_CONTAINED, 4911 4912 /** 4913 * Spans the longest substring that is a concatenation of set elements (characters or strings). 4914 * (For characters only, this is like while contains(current)==true). 4915 * <p> 4916 * When span() returns, the substring between where it started and the position it returned consists only of set 4917 * elements (characters or strings) that are in the set. 4918 * <p> 4919 * If a set contains strings, then the span will be the longest substring for which there 4920 * exists at least one non-overlapping concatenation of set elements (characters or strings). 4921 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 4922 * (Java/ICU/Perl regex stops at the first match of an OR.) 4923 * 4924 * @stable ICU 4.4 4925 */ 4926 CONTAINED, 4927 4928 /** 4929 * Continues a span() while there is a set element at the current position. 4930 * Increments by the longest matching element at each position. 4931 * (For characters only, this is like while contains(current)==true). 
4932 * <p> 4933 * When span() returns, the substring between where it started and the position it returned consists only of set 4934 * elements (characters or strings) that are in the set. 4935 * <p> 4936 * If a set only contains single characters, then this is the same as CONTAINED. 4937 * <p> 4938 * If a set contains strings, then the span will be the longest substring with a match at each position with the 4939 * longest single set element (character or string). 4940 * <p> 4941 * Use this span condition together with other longest-match algorithms, such as ICU converters 4942 * (ucnv_getUnicodeSet()). 4943 * 4944 * @stable ICU 4.4 4945 */ 4946 SIMPLE, 4947 4948 /** 4949 * One more than the last span condition. 4950 * 4951 * @stable ICU 4.4 4952 */ 4953 CONDITION_COUNT 4954 } 4955 4956 /** 4957 * Get the default symbol table. Null means ordinary processing. For internal use only. 4958 * @return the symbol table 4959 * @internal 4960 * @deprecated This API is ICU internal only. 4961 */ 4962 @Deprecated getDefaultXSymbolTable()4963 public static XSymbolTable getDefaultXSymbolTable() { 4964 return XSYMBOL_TABLE; 4965 } 4966 4967 /** 4968 * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing 4969 * of UnicodeSets. 4970 * <p> 4971 * WARNING: If this function is used with a UnicodeProperty, and the 4972 * Unassigned characters (gc=Cn) are different than in ICU, you MUST call 4973 * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable} 4974 * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}. 4975 * 4976 * @param xSymbolTable the new default symbol table. 4977 * @internal 4978 * @deprecated This API is ICU internal only. 
*/
    @Deprecated
    public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
        // If the properties override inclusions, these have to be regenerated,
        // hence the clear() call before the new table is stored.
        // TODO: Check if the Unicode Tools or Unicode Utilities really need this.
        CharacterPropertiesImpl.clear();

        // Store the table (possibly null) as the new default.
        XSYMBOL_TABLE = xSymbolTable;
    }
}
//eof