1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.io.IOException; 12 import java.text.ParsePosition; 13 import java.util.ArrayList; 14 import java.util.Arrays; 15 import java.util.Collection; 16 import java.util.Collections; 17 import java.util.Iterator; 18 import java.util.NoSuchElementException; 19 import java.util.SortedSet; 20 import java.util.TreeSet; 21 22 import com.ibm.icu.impl.BMPSet; 23 import com.ibm.icu.impl.CharacterPropertiesImpl; 24 import com.ibm.icu.impl.PatternProps; 25 import com.ibm.icu.impl.RuleCharacterIterator; 26 import com.ibm.icu.impl.SortedSetRelation; 27 import com.ibm.icu.impl.StringRange; 28 import com.ibm.icu.impl.UCaseProps; 29 import com.ibm.icu.impl.UPropertyAliases; 30 import com.ibm.icu.impl.UnicodeSetStringSpan; 31 import com.ibm.icu.impl.Utility; 32 import com.ibm.icu.lang.CharSequences; 33 import com.ibm.icu.lang.CharacterProperties; 34 import com.ibm.icu.lang.UCharacter; 35 import com.ibm.icu.lang.UProperty; 36 import com.ibm.icu.lang.UScript; 37 import com.ibm.icu.util.Freezable; 38 import com.ibm.icu.util.ICUUncheckedIOException; 39 import com.ibm.icu.util.OutputInt; 40 import com.ibm.icu.util.ULocale; 41 import com.ibm.icu.util.VersionInfo; 42 43 /** 44 * A mutable set of Unicode characters and multicharacter strings. 45 * Objects of this class represent <em>character classes</em> used 46 * in regular expressions. A character specifies a subset of Unicode 47 * code points. Legal code points are U+0000 to U+10FFFF, inclusive. 48 * 49 * Note: method freeze() will not only make the set immutable, but 50 * also makes important methods much higher performance: 51 * contains(c), containsNone(...), span(...), spanBack(...) etc. 52 * After the object is frozen, any subsequent call that wants to change 53 * the object will throw UnsupportedOperationException. 54 * 55 * <p>The UnicodeSet class is not designed to be subclassed. 56 * 57 * <p><code>UnicodeSet</code> supports two APIs. The first is the 58 * <em>operand</em> API that allows the caller to modify the value of 59 * a <code>UnicodeSet</code> object. It conforms to Java 2's 60 * <code>java.util.Set</code> interface, although 61 * <code>UnicodeSet</code> does not actually implement that 62 * interface. All methods of <code>Set</code> are supported, with the 63 * modification that they take a character range or single character 64 * instead of an <code>Object</code>, and they take a 65 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 66 * operand API may be thought of in terms of boolean logic: a boolean 67 * OR is implemented by <code>add</code>, a boolean AND is implemented 68 * by <code>retain</code>, a boolean XOR is implemented by 69 * <code>complement</code> taking an argument, and a boolean NOT is 70 * implemented by <code>complement</code> with no argument. In terms 71 * of traditional set theory function names, <code>add</code> is a 72 * union, <code>retain</code> is an intersection, <code>remove</code> 73 * is an asymmetric difference, and <code>complement</code> with no 74 * argument is a set complement with respect to the superset range 75 * <code>MIN_VALUE-MAX_VALUE</code> 76 * 77 * <p>The second API is the 78 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 79 * <code>java.text.Format</code>-derived classes. Unlike the 80 * methods that add characters, add categories, and control the logic 81 * of the set, the method <code>applyPattern()</code> sets all 82 * attributes of a <code>UnicodeSet</code> at once, based on a 83 * string pattern. 84 * 85 * <p><b>Pattern syntax</b></p> 86 * 87 * Patterns are accepted by the constructors and the 88 * <code>applyPattern()</code> methods and returned by the 89 * <code>toPattern()</code> method. These patterns follow a syntax 90 * similar to that employed by version 8 regular expression character 91 * classes. Here are some simple examples: 92 * 93 * <blockquote> 94 * <table> 95 * <tr style="vertical-align: top"> 96 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td> 97 * <td style="vertical-align: top;">No characters</td> 98 * </tr><tr style="vertical-align: top"> 99 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td> 100 * <td style="vertical-align: top;">The character 'a'</td> 101 * </tr><tr style="vertical-align: top"> 102 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td> 103 * <td style="vertical-align: top;">The characters 'a' and 'e'</td> 104 * </tr> 105 * <tr> 106 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td> 107 * <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code 108 * point order</td> 109 * </tr> 110 * <tr> 111 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td> 112 * <td style="vertical-align: top;">The character U+4E01</td> 113 * </tr> 114 * <tr> 115 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td> 116 * <td style="vertical-align: top;">The character 'a' and the multicharacter strings "ab" and 117 * "ac"</td> 118 * </tr> 119 * <tr> 120 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td> 121 * <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td> 122 * </tr> 123 * </table> 124 * </blockquote> 125 * 126 * Any character may be preceded by a backslash in order to remove any special 127 * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are 128 * ignored, unless they are escaped. 129 * 130 * <p>Property patterns specify a set of characters having a certain 131 * property as defined by the Unicode standard. Both the POSIX-like 132 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a 133 * complete list of supported property patterns, see the User's Guide 134 * for UnicodeSet at 135 * <a href="http://www.icu-project.org/userguide/unicodeSet.html"> 136 * http://www.icu-project.org/userguide/unicodeSet.html</a>. 137 * Actual determination of property data is defined by the underlying 138 * Unicode database as implemented by UCharacter. 139 * 140 * <p>Patterns specify individual characters, ranges of characters, and 141 * Unicode property sets. When elements are concatenated, they 142 * specify their union. To complement a set, place a '^' immediately 143 * after the opening '['. Property patterns are inverted by modifying 144 * their delimiters; "[:^foo]" and "\P{foo}". In any other location, 145 * '^' has no special meaning. 146 * 147 * <p>Ranges are indicated by placing two a '-' between two 148 * characters, as in "a-z". This specifies the range of all 149 * characters from the left to the right, in Unicode order. If the 150 * left character is greater than or equal to the 151 * right character it is a syntax error. If a '-' occurs as the first 152 * character after the opening '[' or '[^', or if it occurs as the 153 * last character before the closing ']', then it is taken as a 154 * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same 155 * set of three characters, 'a', 'b', and '-'. 156 * 157 * <p>Sets may be intersected using the '&' operator or the asymmetric 158 * set difference may be taken using the '-' operator, for example, 159 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters 160 * with values less than 4096. Operators ('&' and '|') have equal 161 * precedence and bind left-to-right. Thus 162 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to 163 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for 164 * difference; intersection is commutative. 165 * 166 * <table> 167 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a' 168 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a' 169 * through 'z' and all letters in between, in Unicode order 170 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing 171 * all characters but 'a' through 'z', 172 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 173 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 174 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 175 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 176 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 177 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 178 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 179 * <em>pat2</em> 180 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code> 181 * <td>The set of characters having the specified 182 * Unicode property; in 183 * this case, Unicode uppercase letters 184 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code> 185 * <td>The set of characters <em>not</em> having the given 186 * Unicode property 187 * </table> 188 * 189 * <p><b>Formal syntax</b></p> 190 * 191 * <blockquote> 192 * <table> 193 * <tr style="vertical-align: top"> 194 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern := </code></td> 195 * <td style="vertical-align: top;"><code>('[' '^'? item* ']') | 196 * property</code></td> 197 * </tr> 198 * <tr style="vertical-align: top"> 199 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item := </code></td> 200 * <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br> 201 * </code></td> 202 * </tr> 203 * <tr style="vertical-align: top"> 204 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr := </code></td> 205 * <td style="vertical-align: top;"><code>pattern | pattern-expr pattern | 206 * pattern-expr op pattern<br> 207 * </code></td> 208 * </tr> 209 * <tr style="vertical-align: top"> 210 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op := </code></td> 211 * <td style="vertical-align: top;"><code>'&' | '-'<br> 212 * </code></td> 213 * </tr> 214 * <tr style="vertical-align: top"> 215 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special := </code></td> 216 * <td style="vertical-align: top;"><code>'[' | ']' | '-'<br> 217 * </code></td> 218 * </tr> 219 * <tr style="vertical-align: top"> 220 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char := </code></td> 221 * <td style="vertical-align: top;"><em>any character that is not</em><code> special<br> 222 * | ('\\' </code><em>any character</em><code>)<br> 223 * | ('\u' hex hex hex hex)<br> 224 * </code></td> 225 * </tr> 226 * <tr style="vertical-align: top"> 227 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex := </code></td> 228 * <td style="vertical-align: top;"><em>any character for which 229 * </em><code>Character.digit(c, 16)</code><em> 230 * returns a non-negative result</em></td> 231 * </tr> 232 * <tr> 233 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property := </code></td> 234 * <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td> 235 * </tr> 236 * </table> 237 * <br> 238 * <table border="1"> 239 * <tr> 240 * <td>Legend: <table> 241 * <tr> 242 * <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td> 243 * <td style="width: 20; vertical-align: top;"> </td> 244 * <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td> 245 * </tr> 246 * <tr> 247 * <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td> 248 * <td style="vertical-align: top;"></td> 249 * <td style="vertical-align: top;">zero or one instance of <code>a</code><br> 250 * </td> 251 * </tr> 252 * <tr> 253 * <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td> 254 * <td style="vertical-align: top;"></td> 255 * <td style="vertical-align: top;">one or more instances of <code>a</code><br> 256 * </td> 257 * </tr> 258 * <tr> 259 * <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td> 260 * <td style="vertical-align: top;"></td> 261 * <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br> 262 * </td> 263 * </tr> 264 * <tr> 265 * <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td> 266 * <td style="vertical-align: top;"></td> 267 * <td style="vertical-align: top;">the literal string between the quotes </td> 268 * </tr> 269 * </table> 270 * </td> 271 * </tr> 272 * </table> 273 * </blockquote> 274 * <p>To iterate over contents of UnicodeSet, the following are available: 275 * <ul><li>{@link #ranges()} to iterate through the ranges</li> 276 * <li>{@link #strings()} to iterate through the strings</li> 277 * <li>{@link #iterator()} to iterate through the entire contents in a single loop. 278 * That method is, however, not particularly efficient, since it "boxes" each code point into a String. 279 * </ul> 280 * All of the above can be used in <b>for</b> loops. 281 * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops. 282 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 283 * 284 * @author Alan Liu 285 * @stable ICU 2.0 286 * @see UnicodeSetIterator 287 * @see UnicodeSetSpanner 288 */ 289 public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> { 290 private static final SortedSet<String> EMPTY_STRINGS = 291 Collections.unmodifiableSortedSet(new TreeSet<String>()); 292 293 /** 294 * Constant for the empty set. 295 * @stable ICU 4.8 296 */ 297 public static final UnicodeSet EMPTY = new UnicodeSet().freeze(); 298 /** 299 * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.) 300 * @stable ICU 4.8 301 */ 302 public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze(); 303 304 private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the the function processing 305 306 private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints 307 private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. 308 // 110000 for codepoints 309 310 /** 311 * Enough for sets with few ranges. 312 * For example, White_Space has 10 ranges, list length 21. 313 */ 314 private static final int INITIAL_CAPACITY = 25; 315 316 /** Max list [0, 1, 2, ..., max code point, HIGH] */ 317 private static final int MAX_LENGTH = HIGH + 1; 318 319 /** 320 * Minimum value that can be stored in a UnicodeSet. 321 * @stable ICU 2.0 322 */ 323 public static final int MIN_VALUE = LOW; 324 325 /** 326 * Maximum value that can be stored in a UnicodeSet. 327 * @stable ICU 2.0 328 */ 329 public static final int MAX_VALUE = HIGH - 1; 330 331 private int len; // length used; list may be longer to minimize reallocs 332 private int[] list; // MUST be terminated with HIGH 333 private int[] rangeList; // internal buffer 334 private int[] buffer; // internal buffer 335 336 // is not private so that UnicodeSetIterator can get access 337 SortedSet<String> strings = EMPTY_STRINGS; 338 339 /** 340 * The pattern representation of this set. This may not be the 341 * most economical pattern. It is the pattern supplied to 342 * applyPattern(), with variables substituted and whitespace 343 * removed. For sets constructed without applyPattern(), or 344 * modified using the non-pattern API, this string will be null, 345 * indicating that toPattern() must generate a pattern 346 * representation from the inversion list. 347 */ 348 private String pat = null; 349 350 // Special property set IDs 351 private static final String ANY_ID = "ANY"; // [\u0000-\U0010FFFF] 352 private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F] 353 private static final String ASSIGNED = "Assigned"; // [:^Cn:] 354 355 private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. 356 private volatile UnicodeSetStringSpan stringSpan; 357 //---------------------------------------------------------------- 358 // Public API 359 //---------------------------------------------------------------- 360 361 /** 362 * Constructs an empty set. 363 * @stable ICU 2.0 364 */ UnicodeSet()365 public UnicodeSet() { 366 list = new int[INITIAL_CAPACITY]; 367 list[0] = HIGH; 368 len = 1; 369 } 370 371 /** 372 * Constructs a copy of an existing set. 373 * @stable ICU 2.0 374 */ UnicodeSet(UnicodeSet other)375 public UnicodeSet(UnicodeSet other) { 376 set(other); 377 } 378 379 /** 380 * Constructs a set containing the given range. If <code>end > 381 * start</code> then an empty set is created. 382 * 383 * @param start first character, inclusive, of range 384 * @param end last character, inclusive, of range 385 * @stable ICU 2.0 386 */ UnicodeSet(int start, int end)387 public UnicodeSet(int start, int end) { 388 this(); 389 add(start, end); 390 } 391 392 /** 393 * Quickly constructs a set from a set of ranges <s0, e0, s1, e1, s2, e2, ..., sn, en>. 394 * There must be an even number of integers, and they must be all greater than zero, 395 * all less than or equal to Character.MAX_CODE_POINT. 396 * In each pair (..., si, ei, ...) it must be true that si <= ei 397 * Between adjacent pairs (...ei, sj...), it must be true that ei+1 < sj 398 * @param pairs pairs of character representing ranges 399 * @stable ICU 4.4 400 */ UnicodeSet(int... pairs)401 public UnicodeSet(int... pairs) { 402 if ((pairs.length & 1) != 0) { 403 throw new IllegalArgumentException("Must have even number of integers"); 404 } 405 list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set. 406 len = list.length; 407 int last = -1; // used to ensure that the results are monotonically increasing. 408 int i = 0; 409 while (i < pairs.length) { 410 int start = pairs[i]; 411 if (last >= start) { 412 throw new IllegalArgumentException("Must be monotonically increasing."); 413 } 414 list[i++] = start; 415 int limit = pairs[i] + 1; 416 if (start >= limit) { 417 throw new IllegalArgumentException("Must be monotonically increasing."); 418 } 419 list[i++] = last = limit; 420 } 421 list[i] = HIGH; // terminate 422 } 423 424 /** 425 * Constructs a set from the given pattern. See the class description 426 * for the syntax of the pattern language. Whitespace is ignored. 427 * @param pattern a string specifying what characters are in the set 428 * @exception java.lang.IllegalArgumentException if the pattern contains 429 * a syntax error. 430 * @stable ICU 2.0 431 */ UnicodeSet(String pattern)432 public UnicodeSet(String pattern) { 433 this(); 434 applyPattern(pattern, null, null, IGNORE_SPACE); 435 } 436 437 /** 438 * Constructs a set from the given pattern. See the class description 439 * for the syntax of the pattern language. 440 * @param pattern a string specifying what characters are in the set 441 * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters 442 * @exception java.lang.IllegalArgumentException if the pattern contains 443 * a syntax error. 444 * @stable ICU 2.0 445 */ UnicodeSet(String pattern, boolean ignoreWhitespace)446 public UnicodeSet(String pattern, boolean ignoreWhitespace) { 447 this(); 448 applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 449 } 450 451 /** 452 * Constructs a set from the given pattern. See the class description 453 * for the syntax of the pattern language. 454 * @param pattern a string specifying what characters are in the set 455 * @param options a bitmask indicating which options to apply. 456 * Valid options are IGNORE_SPACE and CASE. 457 * @exception java.lang.IllegalArgumentException if the pattern contains 458 * a syntax error. 459 * @stable ICU 3.8 460 */ UnicodeSet(String pattern, int options)461 public UnicodeSet(String pattern, int options) { 462 this(); 463 applyPattern(pattern, null, null, options); 464 } 465 466 /** 467 * Constructs a set from the given pattern. See the class description 468 * for the syntax of the pattern language. 469 * @param pattern a string specifying what characters are in the set 470 * @param pos on input, the position in pattern at which to start parsing. 471 * On output, the position after the last character parsed. 472 * @param symbols a symbol table mapping variables to char[] arrays 473 * and chars to UnicodeSets 474 * @exception java.lang.IllegalArgumentException if the pattern 475 * contains a syntax error. 476 * @stable ICU 2.0 477 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols)478 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) { 479 this(); 480 applyPattern(pattern, pos, symbols, IGNORE_SPACE); 481 } 482 483 /** 484 * Constructs a set from the given pattern. See the class description 485 * for the syntax of the pattern language. 486 * @param pattern a string specifying what characters are in the set 487 * @param pos on input, the position in pattern at which to start parsing. 488 * On output, the position after the last character parsed. 489 * @param symbols a symbol table mapping variables to char[] arrays 490 * and chars to UnicodeSets 491 * @param options a bitmask indicating which options to apply. 492 * Valid options are IGNORE_SPACE and CASE. 493 * @exception java.lang.IllegalArgumentException if the pattern 494 * contains a syntax error. 495 * @stable ICU 3.2 496 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options)497 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) { 498 this(); 499 applyPattern(pattern, pos, symbols, options); 500 } 501 502 503 /** 504 * Return a new set that is equivalent to this one. 505 * @stable ICU 2.0 506 */ 507 @Override clone()508 public Object clone() { 509 if (isFrozen()) { 510 return this; 511 } 512 return new UnicodeSet(this); 513 } 514 515 /** 516 * Make this object represent the range <code>start - end</code>. 517 * If <code>start > end</code> then this object is set to an empty range. 518 * 519 * @param start first character in the set, inclusive 520 * @param end last character in the set, inclusive 521 * @stable ICU 2.0 522 */ set(int start, int end)523 public UnicodeSet set(int start, int end) { 524 checkFrozen(); 525 clear(); 526 complement(start, end); 527 return this; 528 } 529 530 /** 531 * Make this object represent the same set as <code>other</code>. 532 * @param other a <code>UnicodeSet</code> whose value will be 533 * copied to this object 534 * @stable ICU 2.0 535 */ set(UnicodeSet other)536 public UnicodeSet set(UnicodeSet other) { 537 checkFrozen(); 538 list = Arrays.copyOf(other.list, other.len); 539 len = other.len; 540 pat = other.pat; 541 if (other.hasStrings()) { 542 strings = new TreeSet<>(other.strings); 543 } else { 544 strings = EMPTY_STRINGS; 545 } 546 return this; 547 } 548 549 /** 550 * Modifies this set to represent the set specified by the given pattern. 551 * See the class description for the syntax of the pattern language. 552 * Whitespace is ignored. 553 * @param pattern a string specifying what characters are in the set 554 * @exception java.lang.IllegalArgumentException if the pattern 555 * contains a syntax error. 556 * @stable ICU 2.0 557 */ applyPattern(String pattern)558 public final UnicodeSet applyPattern(String pattern) { 559 checkFrozen(); 560 return applyPattern(pattern, null, null, IGNORE_SPACE); 561 } 562 563 /** 564 * Modifies this set to represent the set specified by the given pattern, 565 * optionally ignoring whitespace. 566 * See the class description for the syntax of the pattern language. 567 * @param pattern a string specifying what characters are in the set 568 * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored 569 * @exception java.lang.IllegalArgumentException if the pattern 570 * contains a syntax error. 571 * @stable ICU 2.0 572 */ applyPattern(String pattern, boolean ignoreWhitespace)573 public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) { 574 checkFrozen(); 575 return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 576 } 577 578 /** 579 * Modifies this set to represent the set specified by the given pattern, 580 * optionally ignoring whitespace. 581 * See the class description for the syntax of the pattern language. 582 * @param pattern a string specifying what characters are in the set 583 * @param options a bitmask indicating which options to apply. 584 * Valid options are IGNORE_SPACE and CASE. 585 * @exception java.lang.IllegalArgumentException if the pattern 586 * contains a syntax error. 587 * @stable ICU 3.8 588 */ applyPattern(String pattern, int options)589 public UnicodeSet applyPattern(String pattern, int options) { 590 checkFrozen(); 591 return applyPattern(pattern, null, null, options); 592 } 593 594 /** 595 * Return true if the given position, in the given pattern, appears 596 * to be the start of a UnicodeSet pattern. 597 * @stable ICU 2.0 598 */ resemblesPattern(String pattern, int pos)599 public static boolean resemblesPattern(String pattern, int pos) { 600 return ((pos+1) < pattern.length() && 601 pattern.charAt(pos) == '[') || 602 resemblesPropertyPattern(pattern, pos); 603 } 604 605 /** 606 * TODO: create Appendable version of UTF16.append(buf, c), 607 * maybe in new class Appendables? 608 * @throws IOException 609 */ appendCodePoint(Appendable app, int c)610 private static void appendCodePoint(Appendable app, int c) { 611 assert 0 <= c && c <= 0x10ffff; 612 try { 613 if (c <= 0xffff) { 614 app.append((char) c); 615 } else { 616 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c)); 617 } 618 } catch (IOException e) { 619 throw new ICUUncheckedIOException(e); 620 } 621 } 622 623 /** 624 * TODO: create class Appendables? 625 * @throws IOException 626 */ append(Appendable app, CharSequence s)627 private static void append(Appendable app, CharSequence s) { 628 try { 629 app.append(s); 630 } catch (IOException e) { 631 throw new ICUUncheckedIOException(e); 632 } 633 } 634 635 /** 636 * Append the <code>toPattern()</code> representation of a 637 * string to the given <code>Appendable</code>. 638 */ _appendToPat(T buf, String s, boolean escapeUnprintable)639 private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) { 640 int cp; 641 for (int i = 0; i < s.length(); i += Character.charCount(cp)) { 642 cp = s.codePointAt(i); 643 _appendToPat(buf, cp, escapeUnprintable); 644 } 645 return buf; 646 } 647 648 /** 649 * Append the <code>toPattern()</code> representation of a 650 * character to the given <code>Appendable</code>. 651 */ _appendToPat(T buf, int c, boolean escapeUnprintable)652 private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) { 653 try { 654 if (escapeUnprintable && Utility.isUnprintable(c)) { 655 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything 656 // unprintable 657 if (Utility.escapeUnprintable(buf, c)) { 658 return buf; 659 } 660 } 661 // Okay to let ':' pass through 662 switch (c) { 663 case '[': // SET_OPEN: 664 case ']': // SET_CLOSE: 665 case '-': // HYPHEN: 666 case '^': // COMPLEMENT: 667 case '&': // INTERSECTION: 668 case '\\': //BACKSLASH: 669 case '{': 670 case '}': 671 case '$': 672 case ':': 673 buf.append('\\'); 674 break; 675 default: 676 // Escape whitespace 677 if (PatternProps.isWhiteSpace(c)) { 678 buf.append('\\'); 679 } 680 break; 681 } 682 appendCodePoint(buf, c); 683 return buf; 684 } catch (IOException e) { 685 throw new ICUUncheckedIOException(e); 686 } 687 } 688 689 /** 690 * Returns a string representation of this set. If the result of 691 * calling this function is passed to a UnicodeSet constructor, it 692 * will produce another set that is equal to this one. 693 * @stable ICU 2.0 694 */ 695 @Override toPattern(boolean escapeUnprintable)696 public String toPattern(boolean escapeUnprintable) { 697 if (pat != null && !escapeUnprintable) { 698 return pat; 699 } 700 StringBuilder result = new StringBuilder(); 701 return _toPattern(result, escapeUnprintable).toString(); 702 } 703 704 /** 705 * Append a string representation of this set to result. This will be 706 * a cleaned version of the string passed to applyPattern(), if there 707 * is one. Otherwise it will be generated. 708 */ _toPattern(T result, boolean escapeUnprintable)709 private <T extends Appendable> T _toPattern(T result, 710 boolean escapeUnprintable) { 711 if (pat == null) { 712 return appendNewPattern(result, escapeUnprintable, true); 713 } 714 try { 715 if (!escapeUnprintable) { 716 result.append(pat); 717 return result; 718 } 719 boolean oddNumberOfBackslashes = false; 720 for (int i=0; i<pat.length(); ) { 721 int c = pat.codePointAt(i); 722 i += Character.charCount(c); 723 if (Utility.isUnprintable(c)) { 724 // If the unprintable character is preceded by an odd 725 // number of backslashes, then it has been escaped 726 // and we omit the last backslash. 727 Utility.escapeUnprintable(result, c); 728 oddNumberOfBackslashes = false; 729 } else if (!oddNumberOfBackslashes && c == '\\') { 730 // Temporarily withhold an odd-numbered backslash. 731 oddNumberOfBackslashes = true; 732 } else { 733 if (oddNumberOfBackslashes) { 734 result.append('\\'); 735 } 736 appendCodePoint(result, c); 737 oddNumberOfBackslashes = false; 738 } 739 } 740 if (oddNumberOfBackslashes) { 741 result.append('\\'); 742 } 743 return result; 744 } catch (IOException e) { 745 throw new ICUUncheckedIOException(e); 746 } 747 } 748 749 /** 750 * Generate and append a string representation of this set to result. 751 * This does not use this.pat, the cleaned up copy of the string 752 * passed to applyPattern(). 753 * @param result the buffer into which to generate the pattern 754 * @param escapeUnprintable escape unprintable characters if true 755 * @stable ICU 2.0 756 */ _generatePattern(StringBuffer result, boolean escapeUnprintable)757 public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) { 758 return _generatePattern(result, escapeUnprintable, true); 759 } 760 761 /** 762 * Generate and append a string representation of this set to result. 763 * This does not use this.pat, the cleaned up copy of the string 764 * passed to applyPattern(). 765 * @param includeStrings if false, doesn't include the strings. 766 * @stable ICU 3.8 767 */ _generatePattern(StringBuffer result, boolean escapeUnprintable, boolean includeStrings)768 public StringBuffer _generatePattern(StringBuffer result, 769 boolean escapeUnprintable, boolean includeStrings) { 770 return appendNewPattern(result, escapeUnprintable, includeStrings); 771 } 772 appendNewPattern( T result, boolean escapeUnprintable, boolean includeStrings)773 private <T extends Appendable> T appendNewPattern( 774 T result, boolean escapeUnprintable, boolean includeStrings) { 775 try { 776 result.append('['); 777 778 int count = getRangeCount(); 779 780 // If the set contains at least 2 intervals and includes both 781 // MIN_VALUE and MAX_VALUE, then the inverse representation will 782 // be more economical. 783 if (count > 1 && 784 getRangeStart(0) == MIN_VALUE && 785 getRangeEnd(count-1) == MAX_VALUE) { 786 787 // Emit the inverse 788 result.append('^'); 789 790 for (int i = 1; i < count; ++i) { 791 int start = getRangeEnd(i-1)+1; 792 int end = getRangeStart(i)-1; 793 _appendToPat(result, start, escapeUnprintable); 794 if (start != end) { 795 if ((start+1) != end) { 796 result.append('-'); 797 } 798 _appendToPat(result, end, escapeUnprintable); 799 } 800 } 801 } 802 803 // Default; emit the ranges as pairs 804 else { 805 for (int i = 0; i < count; ++i) { 806 int start = getRangeStart(i); 807 int end = getRangeEnd(i); 808 _appendToPat(result, start, escapeUnprintable); 809 if (start != end) { 810 if ((start+1) != end) { 811 result.append('-'); 812 } 813 _appendToPat(result, end, escapeUnprintable); 814 } 815 } 816 } 817 818 if (includeStrings && hasStrings()) { 819 for (String s : strings) { 820 result.append('{'); 821 _appendToPat(result, s, escapeUnprintable); 822 result.append('}'); 823 } 824 } 825 result.append(']'); 826 return result; 827 } catch (IOException e) { 828 throw new ICUUncheckedIOException(e); 829 } 830 } 831 hasStrings()832 boolean hasStrings() { 833 return !strings.isEmpty(); 834 } 835 836 /** 837 * Returns the number of elements in this set (its cardinality) 838 * Note than the elements of a set may include both individual 839 * codepoints and strings. 840 * 841 * @return the number of elements in this set (its cardinality). 842 * @stable ICU 2.0 843 */ size()844 public int size() { 845 int n = 0; 846 int count = getRangeCount(); 847 for (int i = 0; i < count; ++i) { 848 n += getRangeEnd(i) - getRangeStart(i) + 1; 849 } 850 return n + strings.size(); 851 } 852 853 /** 854 * Returns <tt>true</tt> if this set contains no elements. 855 * 856 * @return <tt>true</tt> if this set contains no elements. 857 * @stable ICU 2.0 858 */ isEmpty()859 public boolean isEmpty() { 860 return len == 1 && !hasStrings(); 861 } 862 863 /** 864 * Implementation of UnicodeMatcher API. Returns <tt>true</tt> if 865 * this set contains any character whose low byte is the given 866 * value. This is used by <tt>RuleBasedTransliterator</tt> for 867 * indexing. 868 * @stable ICU 2.0 869 */ 870 @Override matchesIndexValue(int v)871 public boolean matchesIndexValue(int v) { 872 /* The index value v, in the range [0,255], is contained in this set if 873 * it is contained in any pair of this set. Pairs either have the high 874 * bytes equal, or unequal. If the high bytes are equal, then we have 875 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 876 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 877 * Then v is contained if xx <= v || v <= yy. (This is identical to the 878 * time zone month containment logic.) 879 */ 880 for (int i=0; i<getRangeCount(); ++i) { 881 int low = getRangeStart(i); 882 int high = getRangeEnd(i); 883 if ((low & ~0xFF) == (high & ~0xFF)) { 884 if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 885 return true; 886 } 887 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 888 return true; 889 } 890 } 891 if (hasStrings()) { 892 for (String s : strings) { 893 if (s.isEmpty()) { 894 continue; // skip the empty string 895 } 896 int c = UTF16.charAt(s, 0); 897 if ((c & 0xFF) == v) { 898 return true; 899 } 900 } 901 } 902 return false; 903 } 904 905 /** 906 * Implementation of UnicodeMatcher.matches(). Always matches the 907 * longest possible multichar string. 908 * @stable ICU 2.0 909 */ 910 @Override matches(Replaceable text, int[] offset, int limit, boolean incremental)911 public int matches(Replaceable text, 912 int[] offset, 913 int limit, 914 boolean incremental) { 915 916 if (offset[0] == limit) { 917 if (contains(UnicodeMatcher.ETHER)) { 918 return incremental ? U_PARTIAL_MATCH : U_MATCH; 919 } else { 920 return U_MISMATCH; 921 } 922 } else { 923 if (hasStrings()) { // try strings first 924 925 // might separate forward and backward loops later 926 // for now they are combined 927 928 // TODO Improve efficiency of this, at least in the forward 929 // direction, if not in both. In the forward direction we 930 // can assume the strings are sorted. 931 932 boolean forward = offset[0] < limit; 933 934 // firstChar is the leftmost char to match in the 935 // forward direction or the rightmost char to match in 936 // the reverse direction. 937 char firstChar = text.charAt(offset[0]); 938 939 // If there are multiple strings that can match we 940 // return the longest match. 941 int highWaterLength = 0; 942 943 for (String trial : strings) { 944 if (trial.isEmpty()) { 945 continue; // skip the empty string 946 } 947 948 char c = trial.charAt(forward ? 0 : trial.length() - 1); 949 950 // Strings are sorted, so we can optimize in the 951 // forward direction. 952 if (forward && c > firstChar) break; 953 if (c != firstChar) continue; 954 955 int length = matchRest(text, offset[0], limit, trial); 956 957 if (incremental) { 958 int maxLen = forward ? limit-offset[0] : offset[0]-limit; 959 if (length == maxLen) { 960 // We have successfully matched but only up to limit. 961 return U_PARTIAL_MATCH; 962 } 963 } 964 965 if (length == trial.length()) { 966 // We have successfully matched the whole string. 967 if (length > highWaterLength) { 968 highWaterLength = length; 969 } 970 // In the forward direction we know strings 971 // are sorted so we can bail early. 972 if (forward && length < highWaterLength) { 973 break; 974 } 975 continue; 976 } 977 } 978 979 // We've checked all strings without a partial match. 980 // If we have full matches, return the longest one. 981 if (highWaterLength != 0) { 982 offset[0] += forward ? highWaterLength : -highWaterLength; 983 return U_MATCH; 984 } 985 } 986 return super.matches(text, offset, limit, incremental); 987 } 988 } 989 990 /** 991 * Returns the longest match for s in text at the given position. 992 * If limit > start then match forward from start+1 to limit 993 * matching all characters except s.charAt(0). If limit < start, 994 * go backward starting from start-1 matching all characters 995 * except s.charAt(s.length()-1). This method assumes that the 996 * first character, text.charAt(start), matches s, so it does not 997 * check it. 998 * @param text the text to match 999 * @param start the first character to match. In the forward 1000 * direction, text.charAt(start) is matched against s.charAt(0). 1001 * In the reverse direction, it is matched against 1002 * s.charAt(s.length()-1). 1003 * @param limit the limit offset for matching, either last+1 in 1004 * the forward direction, or last-1 in the reverse direction, 1005 * where last is the index of the last character to match. 1006 * @return If part of s matches up to the limit, return |limit - 1007 * start|. If all of s matches before reaching the limit, return 1008 * s.length(). If there is a mismatch between s and text, return 1009 * 0 1010 */ matchRest(Replaceable text, int start, int limit, String s)1011 private static int matchRest (Replaceable text, int start, int limit, String s) { 1012 int maxLen; 1013 int slen = s.length(); 1014 if (start < limit) { 1015 maxLen = limit - start; 1016 if (maxLen > slen) maxLen = slen; 1017 for (int i = 1; i < maxLen; ++i) { 1018 if (text.charAt(start + i) != s.charAt(i)) return 0; 1019 } 1020 } else { 1021 maxLen = start - limit; 1022 if (maxLen > slen) maxLen = slen; 1023 --slen; // <=> slen = s.length() - 1; 1024 for (int i = 1; i < maxLen; ++i) { 1025 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 1026 } 1027 } 1028 return maxLen; 1029 } 1030 1031 /** 1032 * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1. 1033 * @internal 1034 * @deprecated This API is ICU internal only. 1035 */ 1036 @Deprecated matchesAt(CharSequence text, int offset)1037 public int matchesAt(CharSequence text, int offset) { 1038 int lastLen = -1; 1039 strings: 1040 if (hasStrings()) { 1041 char firstChar = text.charAt(offset); 1042 String trial = null; 1043 // find the first string starting with firstChar 1044 Iterator<String> it = strings.iterator(); 1045 while (it.hasNext()) { 1046 trial = it.next(); 1047 char firstStringChar = trial.charAt(0); 1048 if (firstStringChar < firstChar) continue; 1049 if (firstStringChar > firstChar) break strings; 1050 } 1051 1052 // now keep checking string until we get the longest one 1053 for (;;) { 1054 int tempLen = matchesAt(text, offset, trial); 1055 if (lastLen > tempLen) break strings; 1056 lastLen = tempLen; 1057 if (!it.hasNext()) break; 1058 trial = it.next(); 1059 } 1060 } 1061 1062 if (lastLen < 2) { 1063 int cp = UTF16.charAt(text, offset); 1064 if (contains(cp)) lastLen = UTF16.getCharCount(cp); 1065 } 1066 1067 return offset+lastLen; 1068 } 1069 1070 /** 1071 * Does one string contain another, starting at a specific offset? 1072 * @param text text to match 1073 * @param offsetInText offset within that text 1074 * @param substring substring to match at offset in text 1075 * @return -1 if match fails, otherwise other.length() 1076 */ 1077 // Note: This method was moved from CollectionUtilities matchesAt(CharSequence text, int offsetInText, CharSequence substring)1078 private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) { 1079 int len = substring.length(); 1080 int textLength = text.length(); 1081 if (textLength + offsetInText > len) { 1082 return -1; 1083 } 1084 int i = 0; 1085 for (int j = offsetInText; i < len; ++i, ++j) { 1086 char pc = substring.charAt(i); 1087 char tc = text.charAt(j); 1088 if (pc != tc) return -1; 1089 } 1090 return i; 1091 } 1092 1093 /** 1094 * Implementation of UnicodeMatcher API. Union the set of all 1095 * characters that may be matched by this object into the given 1096 * set. 1097 * @param toUnionTo the set into which to union the source characters 1098 * @stable ICU 2.2 1099 */ 1100 @Override addMatchSetTo(UnicodeSet toUnionTo)1101 public void addMatchSetTo(UnicodeSet toUnionTo) { 1102 toUnionTo.addAll(this); 1103 } 1104 1105 /** 1106 * Returns the index of the given character within this set, where 1107 * the set is ordered by ascending code point. If the character 1108 * is not in this set, return -1. The inverse of this method is 1109 * <code>charAt()</code>. 1110 * @return an index from 0..size()-1, or -1 1111 * @stable ICU 2.0 1112 */ indexOf(int c)1113 public int indexOf(int c) { 1114 if (c < MIN_VALUE || c > MAX_VALUE) { 1115 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1116 } 1117 int i = 0; 1118 int n = 0; 1119 for (;;) { 1120 int start = list[i++]; 1121 if (c < start) { 1122 return -1; 1123 } 1124 int limit = list[i++]; 1125 if (c < limit) { 1126 return n + c - start; 1127 } 1128 n += limit - start; 1129 } 1130 } 1131 1132 /** 1133 * Returns the character at the given index within this set, where 1134 * the set is ordered by ascending code point. If the index is 1135 * out of range, return -1. The inverse of this method is 1136 * <code>indexOf()</code>. 1137 * @param index an index from 0..size()-1 1138 * @return the character at the given index, or -1. 1139 * @stable ICU 2.0 1140 */ charAt(int index)1141 public int charAt(int index) { 1142 if (index >= 0) { 1143 // len2 is the largest even integer <= len, that is, it is len 1144 // for even values and len-1 for odd values. With odd values 1145 // the last entry is UNICODESET_HIGH. 1146 int len2 = len & ~1; 1147 for (int i=0; i < len2;) { 1148 int start = list[i++]; 1149 int count = list[i++] - start; 1150 if (index < count) { 1151 return start + index; 1152 } 1153 index -= count; 1154 } 1155 } 1156 return -1; 1157 } 1158 1159 /** 1160 * Adds the specified range to this set if it is not already 1161 * present. If this set already contains the specified range, 1162 * the call leaves this set unchanged. If <code>start > end</code> 1163 * then an empty range is added, leaving the set unchanged. 1164 * 1165 * @param start first character, inclusive, of range to be added 1166 * to this set. 1167 * @param end last character, inclusive, of range to be added 1168 * to this set. 1169 * @stable ICU 2.0 1170 */ add(int start, int end)1171 public UnicodeSet add(int start, int end) { 1172 checkFrozen(); 1173 return add_unchecked(start, end); 1174 } 1175 1176 /** 1177 * Adds all characters in range (uses preferred naming convention). 1178 * @param start The index of where to start on adding all characters. 1179 * @param end The index of where to end on adding all characters. 1180 * @return a reference to this object 1181 * @stable ICU 4.4 1182 */ addAll(int start, int end)1183 public UnicodeSet addAll(int start, int end) { 1184 checkFrozen(); 1185 return add_unchecked(start, end); 1186 } 1187 1188 // for internal use, after checkFrozen has been called add_unchecked(int start, int end)1189 private UnicodeSet add_unchecked(int start, int end) { 1190 if (start < MIN_VALUE || start > MAX_VALUE) { 1191 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1192 } 1193 if (end < MIN_VALUE || end > MAX_VALUE) { 1194 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1195 } 1196 if (start < end) { 1197 int limit = end + 1; 1198 // Fast path for adding a new range after the last one. 1199 // Odd list length: [..., lastStart, lastLimit, HIGH] 1200 if ((len & 1) != 0) { 1201 // If the list is empty, set lastLimit low enough to not be adjacent to 0. 1202 int lastLimit = len == 1 ? -2 : list[len - 2]; 1203 if (lastLimit <= start) { 1204 checkFrozen(); 1205 if (lastLimit == start) { 1206 // Extend the last range. 1207 list[len - 2] = limit; 1208 if (limit == HIGH) { 1209 --len; 1210 } 1211 } else { 1212 list[len - 1] = start; 1213 if (limit < HIGH) { 1214 ensureCapacity(len + 2); 1215 list[len++] = limit; 1216 list[len++] = HIGH; 1217 } else { // limit == HIGH 1218 ensureCapacity(len + 1); 1219 list[len++] = HIGH; 1220 } 1221 } 1222 pat = null; 1223 return this; 1224 } 1225 } 1226 // This is slow. Could be much faster using findCodePoint(start) 1227 // and modifying the list, dealing with adjacent & overlapping ranges. 1228 add(range(start, end), 2, 0); 1229 } else if (start == end) { 1230 add(start); 1231 } 1232 return this; 1233 } 1234 1235 // /** 1236 // * Format out the inversion list as a string, for debugging. Uncomment when 1237 // * needed. 1238 // */ 1239 // public final String dump() { 1240 // StringBuffer buf = new StringBuffer("["); 1241 // for (int i=0; i<len; ++i) { 1242 // if (i != 0) buf.append(", "); 1243 // int c = list[i]; 1244 // //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') { 1245 // // buf.append((char) c); 1246 // //} else { 1247 // buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6)); 1248 // //} 1249 // } 1250 // buf.append("]"); 1251 // return buf.toString(); 1252 // } 1253 1254 /** 1255 * Adds the specified character to this set if it is not already 1256 * present. If this set already contains the specified character, 1257 * the call leaves this set unchanged. 1258 * @stable ICU 2.0 1259 */ add(int c)1260 public final UnicodeSet add(int c) { 1261 checkFrozen(); 1262 return add_unchecked(c); 1263 } 1264 1265 // for internal use only, after checkFrozen has been called add_unchecked(int c)1266 private final UnicodeSet add_unchecked(int c) { 1267 if (c < MIN_VALUE || c > MAX_VALUE) { 1268 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1269 } 1270 1271 // find smallest i such that c < list[i] 1272 // if odd, then it is IN the set 1273 // if even, then it is OUT of the set 1274 int i = findCodePoint(c); 1275 1276 // already in set? 1277 if ((i & 1) != 0) return this; 1278 1279 // HIGH is 0x110000 1280 // assert(list[len-1] == HIGH); 1281 1282 // empty = [HIGH] 1283 // [start_0, limit_0, start_1, limit_1, HIGH] 1284 1285 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 1286 // ^ 1287 // list[i] 1288 1289 // i == 0 means c is before the first range 1290 // TODO: Is the "list[i]-1" a typo? Even if you pass MAX_VALUE into 1291 // add_unchecked, the maximum value that "c" will be compared to 1292 // is "MAX_VALUE-1" meaning that "if (c == MAX_VALUE)" will 1293 // never be reached according to this logic. 1294 if (c == list[i]-1) { 1295 // c is before start of next range 1296 list[i] = c; 1297 // if we touched the HIGH mark, then add a new one 1298 if (c == MAX_VALUE) { 1299 ensureCapacity(len+1); 1300 list[len++] = HIGH; 1301 } 1302 if (i > 0 && c == list[i-1]) { 1303 // collapse adjacent ranges 1304 1305 // [..., start_k-1, c, c, limit_k, ..., HIGH] 1306 // ^ 1307 // list[i] 1308 System.arraycopy(list, i+1, list, i-1, len-i-1); 1309 len -= 2; 1310 } 1311 } 1312 1313 else if (i > 0 && c == list[i-1]) { 1314 // c is after end of prior range 1315 list[i-1]++; 1316 // no need to check for collapse here 1317 } 1318 1319 else { 1320 // At this point we know the new char is not adjacent to 1321 // any existing ranges, and it is not 10FFFF. 1322 1323 1324 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 1325 // ^ 1326 // list[i] 1327 1328 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] 1329 // ^ 1330 // list[i] 1331 1332 // Don't use ensureCapacity() to save on copying. 1333 // NOTE: This has no measurable impact on performance, 1334 // but it might help in some usage patterns. 1335 if (len+2 > list.length) { 1336 int[] temp = new int[nextCapacity(len + 2)]; 1337 if (i != 0) System.arraycopy(list, 0, temp, 0, i); 1338 System.arraycopy(list, i, temp, i+2, len-i); 1339 list = temp; 1340 } else { 1341 System.arraycopy(list, i, list, i+2, len-i); 1342 } 1343 1344 list[i] = c; 1345 list[i+1] = c+1; 1346 len += 2; 1347 } 1348 1349 pat = null; 1350 return this; 1351 } 1352 1353 /** 1354 * Adds the specified multicharacter to this set if it is not already 1355 * present. If this set already contains the multicharacter, 1356 * the call leaves this set unchanged. 1357 * Thus "ch" => {"ch"} 1358 * 1359 * @param s the source string 1360 * @return this object, for chaining 1361 * @stable ICU 2.0 1362 */ add(CharSequence s)1363 public final UnicodeSet add(CharSequence s) { 1364 checkFrozen(); 1365 int cp = getSingleCP(s); 1366 if (cp < 0) { 1367 String str = s.toString(); 1368 if (!strings.contains(str)) { 1369 addString(str); 1370 pat = null; 1371 } 1372 } else { 1373 add_unchecked(cp, cp); 1374 } 1375 return this; 1376 } 1377 addString(CharSequence s)1378 private void addString(CharSequence s) { 1379 if (strings == EMPTY_STRINGS) { 1380 strings = new TreeSet<>(); 1381 } 1382 strings.add(s.toString()); 1383 } 1384 1385 /** 1386 * Utility for getting code point from single code point CharSequence. 1387 * See the public UTF16.getSingleCodePoint() (which returns -1 for null rather than throwing NPE). 1388 * 1389 * @return a code point IF the string consists of a single one. 1390 * otherwise returns -1. 1391 * @param s to test 1392 */ getSingleCP(CharSequence s)1393 private static int getSingleCP(CharSequence s) { 1394 if (s.length() == 1) return s.charAt(0); 1395 if (s.length() == 2) { 1396 int cp = Character.codePointAt(s, 0); 1397 if (cp > 0xFFFF) { // is surrogate pair 1398 return cp; 1399 } 1400 } 1401 return -1; 1402 } 1403 1404 /** 1405 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1406 * If this set already any particular character, it has no effect on that character. 1407 * @param s the source string 1408 * @return this object, for chaining 1409 * @stable ICU 2.0 1410 */ addAll(CharSequence s)1411 public final UnicodeSet addAll(CharSequence s) { 1412 checkFrozen(); 1413 int cp; 1414 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1415 cp = UTF16.charAt(s, i); 1416 add_unchecked(cp, cp); 1417 } 1418 return this; 1419 } 1420 1421 /** 1422 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1423 * If this set already any particular character, it has no effect on that character. 1424 * @param s the source string 1425 * @return this object, for chaining 1426 * @stable ICU 2.0 1427 */ retainAll(CharSequence s)1428 public final UnicodeSet retainAll(CharSequence s) { 1429 return retainAll(fromAll(s)); 1430 } 1431 1432 /** 1433 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1434 * If this set already any particular character, it has no effect on that character. 1435 * @param s the source string 1436 * @return this object, for chaining 1437 * @stable ICU 2.0 1438 */ complementAll(CharSequence s)1439 public final UnicodeSet complementAll(CharSequence s) { 1440 return complementAll(fromAll(s)); 1441 } 1442 1443 /** 1444 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1445 * If this set already any particular character, it has no effect on that character. 1446 * @param s the source string 1447 * @return this object, for chaining 1448 * @stable ICU 2.0 1449 */ removeAll(CharSequence s)1450 public final UnicodeSet removeAll(CharSequence s) { 1451 return removeAll(fromAll(s)); 1452 } 1453 1454 /** 1455 * Remove all strings from this UnicodeSet 1456 * @return this object, for chaining 1457 * @stable ICU 4.2 1458 */ removeAllStrings()1459 public final UnicodeSet removeAllStrings() { 1460 checkFrozen(); 1461 if (hasStrings()) { 1462 strings.clear(); 1463 pat = null; 1464 } 1465 return this; 1466 } 1467 1468 /** 1469 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1470 * 1471 * @param s the source string 1472 * @return a newly created set containing the given string 1473 * @stable ICU 2.0 1474 */ from(CharSequence s)1475 public static UnicodeSet from(CharSequence s) { 1476 return new UnicodeSet().add(s); 1477 } 1478 1479 1480 /** 1481 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1482 * @param s the source string 1483 * @return a newly created set containing the given characters 1484 * @stable ICU 2.0 1485 */ fromAll(CharSequence s)1486 public static UnicodeSet fromAll(CharSequence s) { 1487 return new UnicodeSet().addAll(s); 1488 } 1489 1490 1491 /** 1492 * Retain only the elements in this set that are contained in the 1493 * specified range. If <code>start > end</code> then an empty range is 1494 * retained, leaving the set empty. 1495 * 1496 * @param start first character, inclusive, of range 1497 * @param end last character, inclusive, of range 1498 * @stable ICU 2.0 1499 */ retain(int start, int end)1500 public UnicodeSet retain(int start, int end) { 1501 checkFrozen(); 1502 if (start < MIN_VALUE || start > MAX_VALUE) { 1503 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1504 } 1505 if (end < MIN_VALUE || end > MAX_VALUE) { 1506 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1507 } 1508 if (start <= end) { 1509 retain(range(start, end), 2, 0); 1510 } else { 1511 clear(); 1512 } 1513 return this; 1514 } 1515 1516 /** 1517 * Retain the specified character from this set if it is present. 1518 * Upon return this set will be empty if it did not contain c, or 1519 * will only contain c if it did contain c. 1520 * @param c the character to be retained 1521 * @return this object, for chaining 1522 * @stable ICU 2.0 1523 */ retain(int c)1524 public final UnicodeSet retain(int c) { 1525 return retain(c, c); 1526 } 1527 1528 /** 1529 * Retain the specified string in this set if it is present. 1530 * Upon return this set will be empty if it did not contain s, or 1531 * will only contain s if it did contain s. 1532 * @param cs the string to be retained 1533 * @return this object, for chaining 1534 * @stable ICU 2.0 1535 */ retain(CharSequence cs)1536 public final UnicodeSet retain(CharSequence cs) { 1537 int cp = getSingleCP(cs); 1538 if (cp < 0) { 1539 checkFrozen(); 1540 String s = cs.toString(); 1541 boolean isIn = strings.contains(s); 1542 // Check for getRangeCount() first to avoid somewhat-expensive size() 1543 // when there are single code points. 1544 if (isIn && getRangeCount() == 0 && size() == 1) { 1545 return this; 1546 } 1547 clear(); 1548 if (isIn) { 1549 addString(s); 1550 } 1551 pat = null; 1552 } else { 1553 retain(cp, cp); 1554 } 1555 return this; 1556 } 1557 1558 /** 1559 * Removes the specified range from this set if it is present. 1560 * The set will not contain the specified range once the call 1561 * returns. If <code>start > end</code> then an empty range is 1562 * removed, leaving the set unchanged. 1563 * 1564 * @param start first character, inclusive, of range to be removed 1565 * from this set. 1566 * @param end last character, inclusive, of range to be removed 1567 * from this set. 1568 * @stable ICU 2.0 1569 */ remove(int start, int end)1570 public UnicodeSet remove(int start, int end) { 1571 checkFrozen(); 1572 if (start < MIN_VALUE || start > MAX_VALUE) { 1573 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1574 } 1575 if (end < MIN_VALUE || end > MAX_VALUE) { 1576 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1577 } 1578 if (start <= end) { 1579 retain(range(start, end), 2, 2); 1580 } 1581 return this; 1582 } 1583 1584 /** 1585 * Removes the specified character from this set if it is present. 1586 * The set will not contain the specified character once the call 1587 * returns. 1588 * @param c the character to be removed 1589 * @return this object, for chaining 1590 * @stable ICU 2.0 1591 */ remove(int c)1592 public final UnicodeSet remove(int c) { 1593 return remove(c, c); 1594 } 1595 1596 /** 1597 * Removes the specified string from this set if it is present. 1598 * The set will not contain the specified string once the call 1599 * returns. 1600 * @param s the string to be removed 1601 * @return this object, for chaining 1602 * @stable ICU 2.0 1603 */ remove(CharSequence s)1604 public final UnicodeSet remove(CharSequence s) { 1605 int cp = getSingleCP(s); 1606 if (cp < 0) { 1607 checkFrozen(); 1608 String str = s.toString(); 1609 if (strings.contains(str)) { 1610 strings.remove(str); 1611 pat = null; 1612 } 1613 } else { 1614 remove(cp, cp); 1615 } 1616 return this; 1617 } 1618 1619 /** 1620 * Complements the specified range in this set. Any character in 1621 * the range will be removed if it is in this set, or will be 1622 * added if it is not in this set. If <code>start > end</code> 1623 * then an empty range is complemented, leaving the set unchanged. 1624 * 1625 * @param start first character, inclusive, of range 1626 * @param end last character, inclusive, of range 1627 * @stable ICU 2.0 1628 */ complement(int start, int end)1629 public UnicodeSet complement(int start, int end) { 1630 checkFrozen(); 1631 if (start < MIN_VALUE || start > MAX_VALUE) { 1632 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1633 } 1634 if (end < MIN_VALUE || end > MAX_VALUE) { 1635 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1636 } 1637 if (start <= end) { 1638 xor(range(start, end), 2, 0); 1639 } 1640 pat = null; 1641 return this; 1642 } 1643 1644 /** 1645 * Complements the specified character in this set. The character 1646 * will be removed if it is in this set, or will be added if it is 1647 * not in this set. 1648 * @stable ICU 2.0 1649 */ complement(int c)1650 public final UnicodeSet complement(int c) { 1651 return complement(c, c); 1652 } 1653 1654 /** 1655 * This is equivalent to 1656 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1657 * @stable ICU 2.0 1658 */ complement()1659 public UnicodeSet complement() { 1660 checkFrozen(); 1661 if (list[0] == LOW) { 1662 System.arraycopy(list, 1, list, 0, len-1); 1663 --len; 1664 } else { 1665 ensureCapacity(len+1); 1666 System.arraycopy(list, 0, list, 1, len); 1667 list[0] = LOW; 1668 ++len; 1669 } 1670 pat = null; 1671 return this; 1672 } 1673 1674 /** 1675 * Complement the specified string in this set. 1676 * The set will not contain the specified string once the call 1677 * returns. 1678 * 1679 * @param s the string to complement 1680 * @return this object, for chaining 1681 * @stable ICU 2.0 1682 */ complement(CharSequence s)1683 public final UnicodeSet complement(CharSequence s) { 1684 checkFrozen(); 1685 int cp = getSingleCP(s); 1686 if (cp < 0) { 1687 String s2 = s.toString(); 1688 if (strings.contains(s2)) { 1689 strings.remove(s2); 1690 } else { 1691 addString(s2); 1692 } 1693 pat = null; 1694 } else { 1695 complement(cp, cp); 1696 } 1697 return this; 1698 } 1699 1700 /** 1701 * Returns true if this set contains the given character. 1702 * @param c character to be checked for containment 1703 * @return true if the test condition is met 1704 * @stable ICU 2.0 1705 */ 1706 @Override contains(int c)1707 public boolean contains(int c) { 1708 if (c < MIN_VALUE || c > MAX_VALUE) { 1709 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1710 } 1711 if (bmpSet != null) { 1712 return bmpSet.contains(c); 1713 } 1714 if (stringSpan != null) { 1715 return stringSpan.contains(c); 1716 } 1717 1718 /* 1719 // Set i to the index of the start item greater than ch 1720 // We know we will terminate without length test! 1721 int i = -1; 1722 while (true) { 1723 if (c < list[++i]) break; 1724 } 1725 */ 1726 1727 int i = findCodePoint(c); 1728 1729 return ((i & 1) != 0); // return true if odd 1730 } 1731 1732 /** 1733 * Returns the smallest value i such that c < list[i]. Caller 1734 * must ensure that c is a legal value or this method will enter 1735 * an infinite loop. This method performs a binary search. 1736 * @param c a character in the range MIN_VALUE..MAX_VALUE 1737 * inclusive 1738 * @return the smallest integer i in the range 0..len-1, 1739 * inclusive, such that c < list[i] 1740 */ findCodePoint(int c)1741 private final int findCodePoint(int c) { 1742 /* Examples: 1743 findCodePoint(c) 1744 set list[] c=0 1 3 4 7 8 1745 === ============== =========== 1746 [] [110000] 0 0 0 0 0 0 1747 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 1748 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 1749 [:all:] [0, 110000] 1 1 1 1 1 1 1750 */ 1751 1752 // Return the smallest i such that c < list[i]. Assume 1753 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 1754 if (c < list[0]) return 0; 1755 // High runner test. c is often after the last range, so an 1756 // initial check for this condition pays off. 1757 if (len >= 2 && c >= list[len-2]) return len-1; 1758 int lo = 0; 1759 int hi = len - 1; 1760 // invariant: c >= list[lo] 1761 // invariant: c < list[hi] 1762 for (;;) { 1763 int i = (lo + hi) >>> 1; 1764 if (i == lo) return hi; 1765 if (c < list[i]) { 1766 hi = i; 1767 } else { 1768 lo = i; 1769 } 1770 } 1771 } 1772 1773 // //---------------------------------------------------------------- 1774 // // Unrolled binary search 1775 // //---------------------------------------------------------------- 1776 // 1777 // private int validLen = -1; // validated value of len 1778 // private int topOfLow; 1779 // private int topOfHigh; 1780 // private int power; 1781 // private int deltaStart; 1782 // 1783 // private void validate() { 1784 // if (len <= 1) { 1785 // throw new IllegalArgumentException("list.len==" + len + "; must be >1"); 1786 // } 1787 // 1788 // // find greatest power of 2 less than or equal to len 1789 // for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {} 1790 // 1791 // // assert(exp2[power] <= len); 1792 // 1793 // // determine the starting points 1794 // topOfLow = exp2[power] - 1; 1795 // topOfHigh = len - 1; 1796 // deltaStart = exp2[power-1]; 1797 // validLen = len; 1798 // } 1799 // 1800 // private static final int exp2[] = { 1801 // 0x1, 0x2, 0x4, 0x8, 1802 // 0x10, 0x20, 0x40, 0x80, 1803 // 0x100, 0x200, 0x400, 0x800, 1804 // 0x1000, 0x2000, 0x4000, 0x8000, 1805 // 0x10000, 0x20000, 0x40000, 0x80000, 1806 // 0x100000, 0x200000, 0x400000, 0x800000, 1807 // 0x1000000, 0x2000000, 0x4000000, 0x8000000, 1808 // 0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java 1809 // }; 1810 // 1811 // /** 1812 // * Unrolled lowest index GT. 1813 // */ 1814 // private final int leastIndexGT(int searchValue) { 1815 // 1816 // if (len != validLen) { 1817 // if (len == 1) return 0; 1818 // validate(); 1819 // } 1820 // int temp; 1821 // 1822 // // set up initial range to search. Each subrange is a power of two in length 1823 // int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh; 1824 // 1825 // // Completely unrolled binary search, folhighing "Programming Pearls" 1826 // // Each case deliberately falls through to the next 1827 // // Logically, list[-1] < all_search_values && list[count] > all_search_values 1828 // // although the values -1 and count are never actually touched. 1829 // 1830 // // The bounds at each point are low & high, 1831 // // where low == high - delta*2 1832 // // so high - delta is the midpoint 1833 // 1834 // // The invariant AFTER each line is that list[low] < searchValue <= list[high] 1835 // 1836 // switch (power) { 1837 // //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java 1838 // case 30: if (searchValue < list[temp = high-0x20000000]) high = temp; 1839 // case 29: if (searchValue < list[temp = high-0x10000000]) high = temp; 1840 // 1841 // case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp; 1842 // case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp; 1843 // case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp; 1844 // case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp; 1845 // 1846 // case 24: if (searchValue < list[temp = high- 0x800000]) high = temp; 1847 // case 23: if (searchValue < list[temp = high- 0x400000]) high = temp; 1848 // case 22: if (searchValue < list[temp = high- 0x200000]) high = temp; 1849 // case 21: if (searchValue < list[temp = high- 0x100000]) high = temp; 1850 // 1851 // case 20: if (searchValue < list[temp = high- 0x80000]) high = temp; 1852 // case 19: if (searchValue < list[temp = high- 0x40000]) high = temp; 1853 // case 18: if (searchValue < list[temp = high- 0x20000]) high = temp; 1854 // case 17: if (searchValue < list[temp = high- 0x10000]) high = temp; 1855 // 1856 // case 16: if (searchValue < list[temp = high- 0x8000]) high = temp; 1857 // case 15: if (searchValue < list[temp = high- 0x4000]) high = temp; 1858 // case 14: if (searchValue < list[temp = high- 0x2000]) high = temp; 1859 // case 13: if (searchValue < list[temp = high- 0x1000]) high = temp; 1860 // 1861 // case 12: if (searchValue < list[temp = high- 0x800]) high = temp; 1862 // case 11: if (searchValue < list[temp = high- 0x400]) high = temp; 1863 // case 10: if (searchValue < list[temp = high- 0x200]) high = temp; 1864 // case 9: if (searchValue < list[temp = high- 0x100]) high = temp; 1865 // 1866 // case 8: if (searchValue < list[temp = high- 0x80]) high = temp; 1867 // case 7: if (searchValue < list[temp = high- 0x40]) high = temp; 1868 // case 6: if (searchValue < list[temp = high- 0x20]) high = temp; 1869 // case 5: if (searchValue < list[temp = high- 0x10]) high = temp; 1870 // 1871 // case 4: if (searchValue < list[temp = high- 0x8]) high = temp; 1872 // case 3: if (searchValue < list[temp = high- 0x4]) high = temp; 1873 // case 2: if (searchValue < list[temp = high- 0x2]) high = temp; 1874 // case 1: if (searchValue < list[temp = high- 0x1]) high = temp; 1875 // } 1876 // 1877 // return high; 1878 // } 1879 // 1880 // // For debugging only 1881 // public int len() { 1882 // return len; 1883 // } 1884 // 1885 // //---------------------------------------------------------------- 1886 // //---------------------------------------------------------------- 1887 1888 /** 1889 * Returns true if this set contains every character 1890 * of the given range. 1891 * @param start first character, inclusive, of the range 1892 * @param end last character, inclusive, of the range 1893 * @return true if the test condition is met 1894 * @stable ICU 2.0 1895 */ contains(int start, int end)1896 public boolean contains(int start, int end) { 1897 if (start < MIN_VALUE || start > MAX_VALUE) { 1898 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1899 } 1900 if (end < MIN_VALUE || end > MAX_VALUE) { 1901 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1902 } 1903 //int i = -1; 1904 //while (true) { 1905 // if (start < list[++i]) break; 1906 //} 1907 int i = findCodePoint(start); 1908 return ((i & 1) != 0 && end < list[i]); 1909 } 1910 1911 /** 1912 * Returns <tt>true</tt> if this set contains the given 1913 * multicharacter string. 1914 * @param s string to be checked for containment 1915 * @return <tt>true</tt> if this set contains the specified string 1916 * @stable ICU 2.0 1917 */ contains(CharSequence s)1918 public final boolean contains(CharSequence s) { 1919 1920 int cp = getSingleCP(s); 1921 if (cp < 0) { 1922 return strings.contains(s.toString()); 1923 } else { 1924 return contains(cp); 1925 } 1926 } 1927 1928 /** 1929 * Returns true if this set contains all the characters and strings 1930 * of the given set. 1931 * @param b set to be checked for containment 1932 * @return true if the test condition is met 1933 * @stable ICU 2.0 1934 */ containsAll(UnicodeSet b)1935 public boolean containsAll(UnicodeSet b) { 1936 // The specified set is a subset if all of its pairs are contained in 1937 // this set. This implementation accesses the lists directly for speed. 1938 // TODO: this could be faster if size() were cached. But that would affect building speed 1939 // so it needs investigation. 1940 int[] listB = b.list; 1941 boolean needA = true; 1942 boolean needB = true; 1943 int aPtr = 0; 1944 int bPtr = 0; 1945 int aLen = len - 1; 1946 int bLen = b.len - 1; 1947 int startA = 0, startB = 0, limitA = 0, limitB = 0; 1948 while (true) { 1949 // double iterations are such a pain... 1950 if (needA) { 1951 if (aPtr >= aLen) { 1952 // ran out of A. If B is also exhausted, then break; 1953 if (needB && bPtr >= bLen) { 1954 break; 1955 } 1956 return false; 1957 } 1958 startA = list[aPtr++]; 1959 limitA = list[aPtr++]; 1960 } 1961 if (needB) { 1962 if (bPtr >= bLen) { 1963 // ran out of B. Since we got this far, we have an A and we are ok so far 1964 break; 1965 } 1966 startB = listB[bPtr++]; 1967 limitB = listB[bPtr++]; 1968 } 1969 // if B doesn't overlap and is greater than A, get new A 1970 if (startB >= limitA) { 1971 needA = true; 1972 needB = false; 1973 continue; 1974 } 1975 // if B is wholy contained in A, then get a new B 1976 if (startB >= startA && limitB <= limitA) { 1977 needA = false; 1978 needB = true; 1979 continue; 1980 } 1981 // all other combinations mean we fail 1982 return false; 1983 } 1984 1985 if (!strings.containsAll(b.strings)) return false; 1986 return true; 1987 } 1988 1989 // /** 1990 // * Returns true if this set contains all the characters and strings 1991 // * of the given set. 1992 // * @param c set to be checked for containment 1993 // * @return true if the test condition is met 1994 // * @stable ICU 2.0 1995 // */ 1996 // public boolean containsAllOld(UnicodeSet c) { 1997 // // The specified set is a subset if all of its pairs are contained in 1998 // // this set. It's possible to code this more efficiently in terms of 1999 // // direct manipulation of the inversion lists if the need arises. 2000 // int n = c.getRangeCount(); 2001 // for (int i=0; i<n; ++i) { 2002 // if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 2003 // return false; 2004 // } 2005 // } 2006 // if (!strings.containsAll(c.strings)) return false; 2007 // return true; 2008 // } 2009 2010 /** 2011 * Returns true if there is a partition of the string such that this set contains each of the partitioned strings. 2012 * For example, for the Unicode set [a{bc}{cd}]<br> 2013 * containsAll is true for each of: "a", "bc", ""cdbca"<br> 2014 * containsAll is false for each of: "acb", "bcda", "bcx"<br> 2015 * @param s string containing characters to be checked for containment 2016 * @return true if the test condition is met 2017 * @stable ICU 2.0 2018 */ containsAll(String s)2019 public boolean containsAll(String s) { 2020 int cp; 2021 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 2022 cp = UTF16.charAt(s, i); 2023 if (!contains(cp)) { 2024 if (!hasStrings()) { 2025 return false; 2026 } 2027 return containsAll(s, 0); 2028 } 2029 } 2030 return true; 2031 } 2032 2033 /** 2034 * Recursive routine called if we fail to find a match in containsAll, and there are strings 2035 * @param s source string 2036 * @param i point to match to the end on 2037 * @return true if ok 2038 */ containsAll(String s, int i)2039 private boolean containsAll(String s, int i) { 2040 if (i >= s.length()) { 2041 return true; 2042 } 2043 int cp= UTF16.charAt(s, i); 2044 if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) { 2045 return true; 2046 } 2047 for (String setStr : strings) { 2048 if (!setStr.isEmpty() && // skip the empty string 2049 s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) { 2050 return true; 2051 } 2052 } 2053 return false; 2054 2055 } 2056 2057 /** 2058 * Get the Regex equivalent for this UnicodeSet 2059 * @return regex pattern equivalent to this UnicodeSet 2060 * @internal 2061 * @deprecated This API is ICU internal only. 2062 */ 2063 @Deprecated getRegexEquivalent()2064 public String getRegexEquivalent() { 2065 if (!hasStrings()) { 2066 return toString(); 2067 } 2068 StringBuilder result = new StringBuilder("(?:"); 2069 appendNewPattern(result, true, false); 2070 for (String s : strings) { 2071 result.append('|'); 2072 _appendToPat(result, s, true); 2073 } 2074 return result.append(")").toString(); 2075 } 2076 2077 /** 2078 * Returns true if this set contains none of the characters 2079 * of the given range. 2080 * @param start first character, inclusive, of the range 2081 * @param end last character, inclusive, of the range 2082 * @return true if the test condition is met 2083 * @stable ICU 2.0 2084 */ containsNone(int start, int end)2085 public boolean containsNone(int start, int end) { 2086 if (start < MIN_VALUE || start > MAX_VALUE) { 2087 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 2088 } 2089 if (end < MIN_VALUE || end > MAX_VALUE) { 2090 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 2091 } 2092 int i = -1; 2093 while (true) { 2094 if (start < list[++i]) break; 2095 } 2096 return ((i & 1) == 0 && end < list[i]); 2097 } 2098 2099 /** 2100 * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2101 * For example, for the Unicode set [a{bc}{cd}]<br> 2102 * containsNone is true for: "xy", "cb"<br> 2103 * containsNone is false for: "a", "bc", "bcd"<br> 2104 * @param b set to be checked for containment 2105 * @return true if the test condition is met 2106 * @stable ICU 2.0 2107 */ containsNone(UnicodeSet b)2108 public boolean containsNone(UnicodeSet b) { 2109 // The specified set is a subset if some of its pairs overlap with some of this set's pairs. 2110 // This implementation accesses the lists directly for speed. 2111 int[] listB = b.list; 2112 boolean needA = true; 2113 boolean needB = true; 2114 int aPtr = 0; 2115 int bPtr = 0; 2116 int aLen = len - 1; 2117 int bLen = b.len - 1; 2118 int startA = 0, startB = 0, limitA = 0, limitB = 0; 2119 while (true) { 2120 // double iterations are such a pain... 2121 if (needA) { 2122 if (aPtr >= aLen) { 2123 // ran out of A: break so we test strings 2124 break; 2125 } 2126 startA = list[aPtr++]; 2127 limitA = list[aPtr++]; 2128 } 2129 if (needB) { 2130 if (bPtr >= bLen) { 2131 // ran out of B: break so we test strings 2132 break; 2133 } 2134 startB = listB[bPtr++]; 2135 limitB = listB[bPtr++]; 2136 } 2137 // if B is higher than any part of A, get new A 2138 if (startB >= limitA) { 2139 needA = true; 2140 needB = false; 2141 continue; 2142 } 2143 // if A is higher than any part of B, get new B 2144 if (startA >= limitB) { 2145 needA = false; 2146 needB = true; 2147 continue; 2148 } 2149 // all other combinations mean we fail 2150 return false; 2151 } 2152 2153 if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false; 2154 return true; 2155 } 2156 2157 // /** 2158 // * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2159 // * For example, for the Unicode set [a{bc}{cd}]<br> 2160 // * containsNone is true for: "xy", "cb"<br> 2161 // * containsNone is false for: "a", "bc", "bcd"<br> 2162 // * @param c set to be checked for containment 2163 // * @return true if the test condition is met 2164 // * @stable ICU 2.0 2165 // */ 2166 // public boolean containsNoneOld(UnicodeSet c) { 2167 // // The specified set is a subset if all of its pairs are contained in 2168 // // this set. It's possible to code this more efficiently in terms of 2169 // // direct manipulation of the inversion lists if the need arises. 2170 // int n = c.getRangeCount(); 2171 // for (int i=0; i<n; ++i) { 2172 // if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 2173 // return false; 2174 // } 2175 // } 2176 // if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false; 2177 // return true; 2178 // } 2179 2180 /** 2181 * Returns true if this set contains none of the characters 2182 * of the given string. 2183 * @param s string containing characters to be checked for containment 2184 * @return true if the test condition is met 2185 * @stable ICU 2.0 2186 */ containsNone(CharSequence s)2187 public boolean containsNone(CharSequence s) { 2188 return span(s, SpanCondition.NOT_CONTAINED) == s.length(); 2189 } 2190 2191 /** 2192 * Returns true if this set contains one or more of the characters 2193 * in the given range. 2194 * @param start first character, inclusive, of the range 2195 * @param end last character, inclusive, of the range 2196 * @return true if the condition is met 2197 * @stable ICU 2.0 2198 */ containsSome(int start, int end)2199 public final boolean containsSome(int start, int end) { 2200 return !containsNone(start, end); 2201 } 2202 2203 /** 2204 * Returns true if this set contains one or more of the characters 2205 * and strings of the given set. 2206 * @param s set to be checked for containment 2207 * @return true if the condition is met 2208 * @stable ICU 2.0 2209 */ containsSome(UnicodeSet s)2210 public final boolean containsSome(UnicodeSet s) { 2211 return !containsNone(s); 2212 } 2213 2214 /** 2215 * Returns true if this set contains one or more of the characters 2216 * of the given string. 2217 * @param s string containing characters to be checked for containment 2218 * @return true if the condition is met 2219 * @stable ICU 2.0 2220 */ containsSome(CharSequence s)2221 public final boolean containsSome(CharSequence s) { 2222 return !containsNone(s); 2223 } 2224 2225 2226 /** 2227 * Adds all of the elements in the specified set to this set if 2228 * they're not already present. This operation effectively 2229 * modifies this set so that its value is the <i>union</i> of the two 2230 * sets. The behavior of this operation is unspecified if the specified 2231 * collection is modified while the operation is in progress. 2232 * 2233 * @param c set whose elements are to be added to this set. 2234 * @stable ICU 2.0 2235 */ addAll(UnicodeSet c)2236 public UnicodeSet addAll(UnicodeSet c) { 2237 checkFrozen(); 2238 add(c.list, c.len, 0); 2239 if (c.hasStrings()) { 2240 if (strings == EMPTY_STRINGS) { 2241 strings = new TreeSet<>(c.strings); 2242 } else { 2243 strings.addAll(c.strings); 2244 } 2245 } 2246 return this; 2247 } 2248 2249 /** 2250 * Retains only the elements in this set that are contained in the 2251 * specified set. In other words, removes from this set all of 2252 * its elements that are not contained in the specified set. This 2253 * operation effectively modifies this set so that its value is 2254 * the <i>intersection</i> of the two sets. 2255 * 2256 * @param c set that defines which elements this set will retain. 2257 * @stable ICU 2.0 2258 */ retainAll(UnicodeSet c)2259 public UnicodeSet retainAll(UnicodeSet c) { 2260 checkFrozen(); 2261 retain(c.list, c.len, 0); 2262 if (hasStrings()) { 2263 if (!c.hasStrings()) { 2264 strings.clear(); 2265 } else { 2266 strings.retainAll(c.strings); 2267 } 2268 } 2269 return this; 2270 } 2271 2272 /** 2273 * Removes from this set all of its elements that are contained in the 2274 * specified set. This operation effectively modifies this 2275 * set so that its value is the <i>asymmetric set difference</i> of 2276 * the two sets. 2277 * 2278 * @param c set that defines which elements will be removed from 2279 * this set. 2280 * @stable ICU 2.0 2281 */ removeAll(UnicodeSet c)2282 public UnicodeSet removeAll(UnicodeSet c) { 2283 checkFrozen(); 2284 retain(c.list, c.len, 2); 2285 if (hasStrings() && c.hasStrings()) { 2286 strings.removeAll(c.strings); 2287 } 2288 return this; 2289 } 2290 2291 /** 2292 * Complements in this set all elements contained in the specified 2293 * set. Any character in the other set will be removed if it is 2294 * in this set, or will be added if it is not in this set. 2295 * 2296 * @param c set that defines which elements will be complemented from 2297 * this set. 2298 * @stable ICU 2.0 2299 */ complementAll(UnicodeSet c)2300 public UnicodeSet complementAll(UnicodeSet c) { 2301 checkFrozen(); 2302 xor(c.list, c.len, 0); 2303 if (c.hasStrings()) { 2304 if (strings == EMPTY_STRINGS) { 2305 strings = new TreeSet<>(c.strings); 2306 } else { 2307 SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings); 2308 } 2309 } 2310 return this; 2311 } 2312 2313 /** 2314 * Removes all of the elements from this set. This set will be 2315 * empty after this call returns. 2316 * @stable ICU 2.0 2317 */ clear()2318 public UnicodeSet clear() { 2319 checkFrozen(); 2320 list[0] = HIGH; 2321 len = 1; 2322 pat = null; 2323 if (hasStrings()) { 2324 strings.clear(); 2325 } 2326 return this; 2327 } 2328 2329 /** 2330 * Iteration method that returns the number of ranges contained in 2331 * this set. 2332 * @see #getRangeStart 2333 * @see #getRangeEnd 2334 * @stable ICU 2.0 2335 */ getRangeCount()2336 public int getRangeCount() { 2337 return len/2; 2338 } 2339 2340 /** 2341 * Iteration method that returns the first character in the 2342 * specified range of this set. 2343 * @exception ArrayIndexOutOfBoundsException if index is outside 2344 * the range <code>0..getRangeCount()-1</code> 2345 * @see #getRangeCount 2346 * @see #getRangeEnd 2347 * @stable ICU 2.0 2348 */ getRangeStart(int index)2349 public int getRangeStart(int index) { 2350 return list[index*2]; 2351 } 2352 2353 /** 2354 * Iteration method that returns the last character in the 2355 * specified range of this set. 2356 * @exception ArrayIndexOutOfBoundsException if index is outside 2357 * the range <code>0..getRangeCount()-1</code> 2358 * @see #getRangeStart 2359 * @see #getRangeEnd 2360 * @stable ICU 2.0 2361 */ getRangeEnd(int index)2362 public int getRangeEnd(int index) { 2363 return (list[index*2 + 1] - 1); 2364 } 2365 2366 /** 2367 * Reallocate this objects internal structures to take up the least 2368 * possible space, without changing this object's value. 2369 * @stable ICU 2.0 2370 */ compact()2371 public UnicodeSet compact() { 2372 checkFrozen(); 2373 if ((len + 7) < list.length) { 2374 // If we have more than a little unused capacity, shrink it to len. 2375 list = Arrays.copyOf(list, len); 2376 } 2377 rangeList = null; 2378 buffer = null; 2379 if (strings != EMPTY_STRINGS && strings.isEmpty()) { 2380 strings = EMPTY_STRINGS; 2381 } 2382 return this; 2383 } 2384 2385 /** 2386 * Compares the specified object with this set for equality. Returns 2387 * <tt>true</tt> if the specified object is also a set, the two sets 2388 * have the same size, and every member of the specified set is 2389 * contained in this set (or equivalently, every member of this set is 2390 * contained in the specified set). 2391 * 2392 * @param o Object to be compared for equality with this set. 2393 * @return <tt>true</tt> if the specified Object is equal to this set. 2394 * @stable ICU 2.0 2395 */ 2396 @Override equals(Object o)2397 public boolean equals(Object o) { 2398 if (o == null) { 2399 return false; 2400 } 2401 if (this == o) { 2402 return true; 2403 } 2404 try { 2405 UnicodeSet that = (UnicodeSet) o; 2406 if (len != that.len) return false; 2407 for (int i = 0; i < len; ++i) { 2408 if (list[i] != that.list[i]) return false; 2409 } 2410 if (!strings.equals(that.strings)) return false; 2411 } catch (Exception e) { 2412 return false; 2413 } 2414 return true; 2415 } 2416 2417 /** 2418 * Returns the hash code value for this set. 2419 * 2420 * @return the hash code value for this set. 2421 * @see java.lang.Object#hashCode() 2422 * @stable ICU 2.0 2423 */ 2424 @Override hashCode()2425 public int hashCode() { 2426 int result = len; 2427 for (int i = 0; i < len; ++i) { 2428 result *= 1000003; 2429 result += list[i]; 2430 } 2431 return result; 2432 } 2433 2434 /** 2435 * Return a programmer-readable string representation of this object. 2436 * @stable ICU 2.0 2437 */ 2438 @Override toString()2439 public String toString() { 2440 return toPattern(true); 2441 } 2442 2443 //---------------------------------------------------------------- 2444 // Implementation: Pattern parsing 2445 //---------------------------------------------------------------- 2446 2447 /** 2448 * Parses the given pattern, starting at the given position. The character 2449 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. 2450 * Parsing continues until the corresponding closing ']'. If a syntax error 2451 * is encountered between the opening and closing brace, the parse fails. 2452 * Upon return from a successful parse, the ParsePosition is updated to 2453 * point to the character following the closing ']', and an inversion 2454 * list for the parsed pattern is returned. This method 2455 * calls itself recursively to parse embedded subpatterns. 2456 * 2457 * @param pattern the string containing the pattern to be parsed. The 2458 * portion of the string from pos.getIndex(), which must be a '[', to the 2459 * corresponding closing ']', is parsed. 2460 * @param pos upon entry, the position at which to being parsing. The 2461 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return 2462 * from a successful parse, pos.getIndex() is either the character after the 2463 * closing ']' of the parsed pattern, or pattern.length() if the closing ']' 2464 * is the last character of the pattern string. 2465 * @return an inversion list for the parsed substring 2466 * of <code>pattern</code> 2467 * @exception java.lang.IllegalArgumentException if the parse fails. 2468 * @internal 2469 * @deprecated This API is ICU internal only. 2470 */ 2471 @Deprecated applyPattern(String pattern, ParsePosition pos, SymbolTable symbols, int options)2472 public UnicodeSet applyPattern(String pattern, 2473 ParsePosition pos, 2474 SymbolTable symbols, 2475 int options) { 2476 2477 // Need to build the pattern in a temporary string because 2478 // _applyPattern calls add() etc., which set pat to empty. 2479 boolean parsePositionWasNull = pos == null; 2480 if (parsePositionWasNull) { 2481 pos = new ParsePosition(0); 2482 } 2483 2484 StringBuilder rebuiltPat = new StringBuilder(); 2485 RuleCharacterIterator chars = 2486 new RuleCharacterIterator(pattern, symbols, pos); 2487 applyPattern(chars, symbols, rebuiltPat, options, 0); 2488 if (chars.inVariable()) { 2489 syntaxError(chars, "Extra chars in variable value"); 2490 } 2491 pat = rebuiltPat.toString(); 2492 if (parsePositionWasNull) { 2493 int i = pos.getIndex(); 2494 2495 // Skip over trailing whitespace 2496 if ((options & IGNORE_SPACE) != 0) { 2497 i = PatternProps.skipWhiteSpace(pattern, i); 2498 } 2499 2500 if (i != pattern.length()) { 2501 throw new IllegalArgumentException("Parse of \"" + pattern + 2502 "\" failed at " + i); 2503 } 2504 } 2505 return this; 2506 } 2507 2508 // Add constants to make the applyPattern() code easier to follow. 2509 2510 private static final int LAST0_START = 0, 2511 LAST1_RANGE = 1, 2512 LAST2_SET = 2; 2513 2514 private static final int MODE0_NONE = 0, 2515 MODE1_INBRACKET = 1, 2516 MODE2_OUTBRACKET = 2; 2517 2518 private static final int SETMODE0_NONE = 0, 2519 SETMODE1_UNICODESET = 1, 2520 SETMODE2_PROPERTYPAT = 2, 2521 SETMODE3_PREPARSED = 3; 2522 2523 private static final int MAX_DEPTH = 100; 2524 2525 /** 2526 * Parse the pattern from the given RuleCharacterIterator. The 2527 * iterator is advanced over the parsed pattern. 2528 * @param chars iterator over the pattern characters. Upon return 2529 * it will be advanced to the first character after the parsed 2530 * pattern, or the end of the iteration if all characters are 2531 * parsed. 2532 * @param symbols symbol table to use to parse and dereference 2533 * variables, or null if none. 2534 * @param rebuiltPat the pattern that was parsed, rebuilt or 2535 * copied from the input pattern, as appropriate. 2536 * @param options a bit mask of zero or more of the following: 2537 * IGNORE_SPACE, CASE. 2538 */ applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options, int depth)2539 private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, 2540 Appendable rebuiltPat, int options, int depth) { 2541 if (depth > MAX_DEPTH) { 2542 syntaxError(chars, "Pattern nested too deeply"); 2543 } 2544 2545 // Syntax characters: [ ] ^ - & { } 2546 2547 // Recognized special forms for chars, sets: c-c s-s s&s 2548 2549 int opts = RuleCharacterIterator.PARSE_VARIABLES | 2550 RuleCharacterIterator.PARSE_ESCAPES; 2551 if ((options & IGNORE_SPACE) != 0) { 2552 opts |= RuleCharacterIterator.SKIP_WHITESPACE; 2553 } 2554 2555 StringBuilder patBuf = new StringBuilder(), buf = null; 2556 boolean usePat = false; 2557 UnicodeSet scratch = null; 2558 Object backup = null; 2559 2560 // mode: 0=before [, 1=between [...], 2=after ] 2561 // lastItem: 0=none, 1=char, 2=set 2562 int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE; 2563 char op = 0; 2564 2565 boolean invert = false; 2566 2567 clear(); 2568 String lastString = null; 2569 2570 while (mode != MODE2_OUTBRACKET && !chars.atEnd()) { 2571 //Eclipse stated the following is "dead code" 2572 /* 2573 if (false) { 2574 // Debugging assertion 2575 if (!((lastItem == 0 && op == 0) || 2576 (lastItem == 1 && (op == 0 || op == '-')) || 2577 (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { 2578 throw new IllegalArgumentException(); 2579 } 2580 }*/ 2581 2582 int c = 0; 2583 boolean literal = false; 2584 UnicodeSet nested = null; 2585 2586 // -------- Check for property pattern 2587 2588 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 2589 int setMode = SETMODE0_NONE; 2590 if (resemblesPropertyPattern(chars, opts)) { 2591 setMode = SETMODE2_PROPERTYPAT; 2592 } 2593 2594 // -------- Parse '[' of opening delimiter OR nested set. 2595 // If there is a nested set, use `setMode' to define how 2596 // the set should be parsed. If the '[' is part of the 2597 // opening delimiter for this pattern, parse special 2598 // strings "[", "[^", "[-", and "[^-". Check for stand-in 2599 // characters representing a nested set in the symbol 2600 // table. 2601 2602 else { 2603 // Prepare to backup if necessary 2604 backup = chars.getPos(backup); 2605 c = chars.next(opts); 2606 literal = chars.isEscaped(); 2607 2608 if (c == '[' && !literal) { 2609 if (mode == MODE1_INBRACKET) { 2610 chars.setPos(backup); // backup 2611 setMode = SETMODE1_UNICODESET; 2612 } else { 2613 // Handle opening '[' delimiter 2614 mode = MODE1_INBRACKET; 2615 patBuf.append('['); 2616 backup = chars.getPos(backup); // prepare to backup 2617 c = chars.next(opts); 2618 literal = chars.isEscaped(); 2619 if (c == '^' && !literal) { 2620 invert = true; 2621 patBuf.append('^'); 2622 backup = chars.getPos(backup); // prepare to backup 2623 c = chars.next(opts); 2624 literal = chars.isEscaped(); 2625 } 2626 // Fall through to handle special leading '-'; 2627 // otherwise restart loop for nested [], \p{}, etc. 2628 if (c == '-') { 2629 literal = true; 2630 // Fall through to handle literal '-' below 2631 } else { 2632 chars.setPos(backup); // backup 2633 continue; 2634 } 2635 } 2636 } else if (symbols != null) { 2637 UnicodeMatcher m = symbols.lookupMatcher(c); // may be null 2638 if (m != null) { 2639 try { 2640 nested = (UnicodeSet) m; 2641 setMode = SETMODE3_PREPARSED; 2642 } catch (ClassCastException e) { 2643 syntaxError(chars, "Syntax error"); 2644 } 2645 } 2646 } 2647 } 2648 2649 // -------- Handle a nested set. This either is inline in 2650 // the pattern or represented by a stand-in that has 2651 // previously been parsed and was looked up in the symbol 2652 // table. 2653 2654 if (setMode != SETMODE0_NONE) { 2655 if (lastItem == LAST1_RANGE) { 2656 if (op != 0) { 2657 syntaxError(chars, "Char expected after operator"); 2658 } 2659 add_unchecked(lastChar, lastChar); 2660 _appendToPat(patBuf, lastChar, false); 2661 lastItem = LAST0_START; 2662 op = 0; 2663 } 2664 2665 if (op == '-' || op == '&') { 2666 patBuf.append(op); 2667 } 2668 2669 if (nested == null) { 2670 if (scratch == null) scratch = new UnicodeSet(); 2671 nested = scratch; 2672 } 2673 switch (setMode) { 2674 case SETMODE1_UNICODESET: 2675 nested.applyPattern(chars, symbols, patBuf, options, depth + 1); 2676 break; 2677 case SETMODE2_PROPERTYPAT: 2678 chars.skipIgnored(opts); 2679 nested.applyPropertyPattern(chars, patBuf, symbols); 2680 break; 2681 case SETMODE3_PREPARSED: // `nested' already parsed 2682 nested._toPattern(patBuf, false); 2683 break; 2684 } 2685 2686 usePat = true; 2687 2688 if (mode == MODE0_NONE) { 2689 // Entire pattern is a category; leave parse loop 2690 set(nested); 2691 mode = MODE2_OUTBRACKET; 2692 break; 2693 } 2694 2695 switch (op) { 2696 case '-': 2697 removeAll(nested); 2698 break; 2699 case '&': 2700 retainAll(nested); 2701 break; 2702 case 0: 2703 addAll(nested); 2704 break; 2705 } 2706 2707 op = 0; 2708 lastItem = LAST2_SET; 2709 2710 continue; 2711 } 2712 2713 if (mode == MODE0_NONE) { 2714 syntaxError(chars, "Missing '['"); 2715 } 2716 2717 // -------- Parse special (syntax) characters. If the 2718 // current character is not special, or if it is escaped, 2719 // then fall through and handle it below. 2720 2721 if (!literal) { 2722 switch (c) { 2723 case ']': 2724 if (lastItem == LAST1_RANGE) { 2725 add_unchecked(lastChar, lastChar); 2726 _appendToPat(patBuf, lastChar, false); 2727 } 2728 // Treat final trailing '-' as a literal 2729 if (op == '-') { 2730 add_unchecked(op, op); 2731 patBuf.append(op); 2732 } else if (op == '&') { 2733 syntaxError(chars, "Trailing '&'"); 2734 } 2735 patBuf.append(']'); 2736 mode = MODE2_OUTBRACKET; 2737 continue; 2738 case '-': 2739 if (op == 0) { 2740 if (lastItem != LAST0_START) { 2741 op = (char) c; 2742 continue; 2743 } else if (lastString != null) { 2744 op = (char) c; 2745 continue; 2746 } else { 2747 // Treat final trailing '-' as a literal 2748 add_unchecked(c, c); 2749 c = chars.next(opts); 2750 literal = chars.isEscaped(); 2751 if (c == ']' && !literal) { 2752 patBuf.append("-]"); 2753 mode = MODE2_OUTBRACKET; 2754 continue; 2755 } 2756 } 2757 } 2758 syntaxError(chars, "'-' not after char, string, or set"); 2759 break; 2760 case '&': 2761 if (lastItem == LAST2_SET && op == 0) { 2762 op = (char) c; 2763 continue; 2764 } 2765 syntaxError(chars, "'&' not after set"); 2766 break; 2767 case '^': 2768 syntaxError(chars, "'^' not after '['"); 2769 break; 2770 case '{': 2771 if (op != 0 && op != '-') { 2772 syntaxError(chars, "Missing operand after operator"); 2773 } 2774 if (lastItem == LAST1_RANGE) { 2775 add_unchecked(lastChar, lastChar); 2776 _appendToPat(patBuf, lastChar, false); 2777 } 2778 lastItem = LAST0_START; 2779 if (buf == null) { 2780 buf = new StringBuilder(); 2781 } else { 2782 buf.setLength(0); 2783 } 2784 boolean ok = false; 2785 while (!chars.atEnd()) { 2786 c = chars.next(opts); 2787 literal = chars.isEscaped(); 2788 if (c == '}' && !literal) { 2789 ok = true; 2790 break; 2791 } 2792 appendCodePoint(buf, c); 2793 } 2794 if (!ok) { 2795 syntaxError(chars, "Invalid multicharacter string"); 2796 } 2797 // We have new string. Add it to set and continue; 2798 // we don't need to drop through to the further 2799 // processing 2800 String curString = buf.toString(); 2801 if (op == '-') { 2802 int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString); 2803 int curSingle = CharSequences.getSingleCodePoint(curString); 2804 if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) { 2805 add(lastSingle,curSingle); 2806 } else { 2807 if (strings == EMPTY_STRINGS) { 2808 strings = new TreeSet<>(); 2809 } 2810 try { 2811 StringRange.expand(lastString, curString, true, strings); 2812 } catch (Exception e) { 2813 syntaxError(chars, e.getMessage()); 2814 } 2815 } 2816 lastString = null; 2817 op = 0; 2818 } else { 2819 add(curString); 2820 lastString = curString; 2821 } 2822 patBuf.append('{'); 2823 _appendToPat(patBuf, curString, false); 2824 patBuf.append('}'); 2825 continue; 2826 case SymbolTable.SYMBOL_REF: 2827 // symbols nosymbols 2828 // [a-$] error error (ambiguous) 2829 // [a$] anchor anchor 2830 // [a-$x] var "x"* literal '$' 2831 // [a-$.] error literal '$' 2832 // *We won't get here in the case of var "x" 2833 backup = chars.getPos(backup); 2834 c = chars.next(opts); 2835 literal = chars.isEscaped(); 2836 boolean anchor = (c == ']' && !literal); 2837 if (symbols == null && !anchor) { 2838 c = SymbolTable.SYMBOL_REF; 2839 chars.setPos(backup); 2840 break; // literal '$' 2841 } 2842 if (anchor && op == 0) { 2843 if (lastItem == LAST1_RANGE) { 2844 add_unchecked(lastChar, lastChar); 2845 _appendToPat(patBuf, lastChar, false); 2846 } 2847 add_unchecked(UnicodeMatcher.ETHER); 2848 usePat = true; 2849 patBuf.append(SymbolTable.SYMBOL_REF).append(']'); 2850 mode = MODE2_OUTBRACKET; 2851 continue; 2852 } 2853 syntaxError(chars, "Unquoted '$'"); 2854 break; 2855 default: 2856 break; 2857 } 2858 } 2859 2860 // -------- Parse literal characters. This includes both 2861 // escaped chars ("\u4E01") and non-syntax characters 2862 // ("a"). 2863 2864 switch (lastItem) { 2865 case LAST0_START: 2866 if (op == '-' && lastString != null) { 2867 syntaxError(chars, "Invalid range"); 2868 } 2869 lastItem = LAST1_RANGE; 2870 lastChar = c; 2871 lastString = null; 2872 break; 2873 case LAST1_RANGE: 2874 if (op == '-') { 2875 if (lastString != null) { 2876 syntaxError(chars, "Invalid range"); 2877 } 2878 if (lastChar >= c) { 2879 // Don't allow redundant (a-a) or empty (b-a) ranges; 2880 // these are most likely typos. 2881 syntaxError(chars, "Invalid range"); 2882 } 2883 add_unchecked(lastChar, c); 2884 _appendToPat(patBuf, lastChar, false); 2885 patBuf.append(op); 2886 _appendToPat(patBuf, c, false); 2887 lastItem = LAST0_START; 2888 op = 0; 2889 } else { 2890 add_unchecked(lastChar, lastChar); 2891 _appendToPat(patBuf, lastChar, false); 2892 lastChar = c; 2893 } 2894 break; 2895 case LAST2_SET: 2896 if (op != 0) { 2897 syntaxError(chars, "Set expected after operator"); 2898 } 2899 lastChar = c; 2900 lastItem = LAST1_RANGE; 2901 break; 2902 } 2903 } 2904 2905 if (mode != MODE2_OUTBRACKET) { 2906 syntaxError(chars, "Missing ']'"); 2907 } 2908 2909 chars.skipIgnored(opts); 2910 2911 /** 2912 * Handle global flags (invert, case insensitivity). If this 2913 * pattern should be compiled case-insensitive, then we need 2914 * to close over case BEFORE COMPLEMENTING. This makes 2915 * patterns like /[^abc]/i work. 2916 */ 2917 if ((options & CASE) != 0) { 2918 closeOver(CASE); 2919 } 2920 if (invert) { 2921 complement(); 2922 } 2923 2924 // Use the rebuilt pattern (pat) only if necessary. Prefer the 2925 // generated pattern. 2926 if (usePat) { 2927 append(rebuiltPat, patBuf.toString()); 2928 } else { 2929 appendNewPattern(rebuiltPat, false, true); 2930 } 2931 } 2932 syntaxError(RuleCharacterIterator chars, String msg)2933 private static void syntaxError(RuleCharacterIterator chars, String msg) { 2934 throw new IllegalArgumentException("Error: " + msg + " at \"" + 2935 Utility.escape(chars.toString()) + 2936 '"'); 2937 } 2938 2939 /** 2940 * Add the contents of the UnicodeSet (as strings) into a collection. 2941 * @param target collection to add into 2942 * @stable ICU 4.4 2943 */ addAllTo(T target)2944 public <T extends Collection<String>> T addAllTo(T target) { 2945 return addAllTo(this, target); 2946 } 2947 2948 2949 /** 2950 * Add the contents of the UnicodeSet (as strings) into a collection. 2951 * @param target collection to add into 2952 * @stable ICU 4.4 2953 */ addAllTo(String[] target)2954 public String[] addAllTo(String[] target) { 2955 return addAllTo(this, target); 2956 } 2957 2958 /** 2959 * Add the contents of the UnicodeSet (as strings) into an array. 2960 * @stable ICU 4.4 2961 */ toArray(UnicodeSet set)2962 public static String[] toArray(UnicodeSet set) { 2963 return addAllTo(set, new String[set.size()]); 2964 } 2965 2966 /** 2967 * Add the contents of the collection (as strings) into this UnicodeSet. 2968 * The collection must not contain null. 2969 * @param source the collection to add 2970 * @return a reference to this object 2971 * @stable ICU 4.4 2972 */ add(Iterable<?> source)2973 public UnicodeSet add(Iterable<?> source) { 2974 return addAll(source); 2975 } 2976 2977 /** 2978 * Add a collection (as strings) into this UnicodeSet. 2979 * Uses standard naming convention. 2980 * @param source collection to add into 2981 * @return a reference to this object 2982 * @stable ICU 4.4 2983 */ addAll(Iterable<?> source)2984 public UnicodeSet addAll(Iterable<?> source) { 2985 checkFrozen(); 2986 for (Object o : source) { 2987 add(o.toString()); 2988 } 2989 return this; 2990 } 2991 2992 //---------------------------------------------------------------- 2993 // Implementation: Utility methods 2994 //---------------------------------------------------------------- 2995 nextCapacity(int minCapacity)2996 private int nextCapacity(int minCapacity) { 2997 // Grow exponentially to reduce the frequency of allocations. 2998 if (minCapacity < INITIAL_CAPACITY) { 2999 return minCapacity + INITIAL_CAPACITY; 3000 } else if (minCapacity <= 2500) { 3001 return 5 * minCapacity; 3002 } else { 3003 int newCapacity = 2 * minCapacity; 3004 if (newCapacity > MAX_LENGTH) { 3005 newCapacity = MAX_LENGTH; 3006 } 3007 return newCapacity; 3008 } 3009 } 3010 ensureCapacity(int newLen)3011 private void ensureCapacity(int newLen) { 3012 if (newLen > MAX_LENGTH) { 3013 newLen = MAX_LENGTH; 3014 } 3015 if (newLen <= list.length) return; 3016 int newCapacity = nextCapacity(newLen); 3017 int[] temp = new int[newCapacity]; 3018 // Copy only the actual contents. 3019 System.arraycopy(list, 0, temp, 0, len); 3020 list = temp; 3021 } 3022 ensureBufferCapacity(int newLen)3023 private void ensureBufferCapacity(int newLen) { 3024 if (newLen > MAX_LENGTH) { 3025 newLen = MAX_LENGTH; 3026 } 3027 if (buffer != null && newLen <= buffer.length) return; 3028 int newCapacity = nextCapacity(newLen); 3029 buffer = new int[newCapacity]; 3030 // The buffer has no contents to be copied. 3031 // It is always filled from scratch after this call. 3032 } 3033 3034 /** 3035 * Assumes start <= end. 3036 */ range(int start, int end)3037 private int[] range(int start, int end) { 3038 if (rangeList == null) { 3039 rangeList = new int[] { start, end+1, HIGH }; 3040 } else { 3041 rangeList[0] = start; 3042 rangeList[1] = end+1; 3043 } 3044 return rangeList; 3045 } 3046 3047 //---------------------------------------------------------------- 3048 // Implementation: Fundamental operations 3049 //---------------------------------------------------------------- 3050 3051 // polarity = 0, 3 is normal: x xor y 3052 // polarity = 1, 2: x xor ~y == x === y 3053 xor(int[] other, int otherLen, int polarity)3054 private UnicodeSet xor(int[] other, int otherLen, int polarity) { 3055 ensureBufferCapacity(len + otherLen); 3056 int i = 0, j = 0, k = 0; 3057 int a = list[i++]; 3058 int b; 3059 // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used 3060 // so the following if statement will not be called. 3061 ///CLOVER:OFF 3062 if (polarity == 1 || polarity == 2) { 3063 b = LOW; 3064 if (other[j] == LOW) { // skip base if already LOW 3065 ++j; 3066 b = other[j]; 3067 } 3068 ///CLOVER:ON 3069 } else { 3070 b = other[j++]; 3071 } 3072 // simplest of all the routines 3073 // sort the values, discarding identicals! 3074 while (true) { 3075 if (a < b) { 3076 buffer[k++] = a; 3077 a = list[i++]; 3078 } else if (b < a) { 3079 buffer[k++] = b; 3080 b = other[j++]; 3081 } else if (a != HIGH) { // at this point, a == b 3082 // discard both values! 3083 a = list[i++]; 3084 b = other[j++]; 3085 } else { // DONE! 3086 buffer[k++] = HIGH; 3087 len = k; 3088 break; 3089 } 3090 } 3091 // swap list and buffer 3092 int[] temp = list; 3093 list = buffer; 3094 buffer = temp; 3095 pat = null; 3096 return this; 3097 } 3098 3099 // polarity = 0 is normal: x union y 3100 // polarity = 2: x union ~y 3101 // polarity = 1: ~x union y 3102 // polarity = 3: ~x union ~y 3103 add(int[] other, int otherLen, int polarity)3104 private UnicodeSet add(int[] other, int otherLen, int polarity) { 3105 ensureBufferCapacity(len + otherLen); 3106 int i = 0, j = 0, k = 0; 3107 int a = list[i++]; 3108 int b = other[j++]; 3109 // change from xor is that we have to check overlapping pairs 3110 // polarity bit 1 means a is second, bit 2 means b is. 3111 main: 3112 while (true) { 3113 switch (polarity) { 3114 case 0: // both first; take lower if unequal 3115 if (a < b) { // take a 3116 // Back up over overlapping ranges in buffer[] 3117 if (k > 0 && a <= buffer[k-1]) { 3118 // Pick latter end value in buffer[] vs. list[] 3119 a = max(list[i], buffer[--k]); 3120 } else { 3121 // No overlap 3122 buffer[k++] = a; 3123 a = list[i]; 3124 } 3125 i++; // Common if/else code factored out 3126 polarity ^= 1; 3127 } else if (b < a) { // take b 3128 if (k > 0 && b <= buffer[k-1]) { 3129 b = max(other[j], buffer[--k]); 3130 } else { 3131 buffer[k++] = b; 3132 b = other[j]; 3133 } 3134 j++; 3135 polarity ^= 2; 3136 } else { // a == b, take a, drop b 3137 if (a == HIGH) break main; 3138 // This is symmetrical; it doesn't matter if 3139 // we backtrack with a or b. - liu 3140 if (k > 0 && a <= buffer[k-1]) { 3141 a = max(list[i], buffer[--k]); 3142 } else { 3143 // No overlap 3144 buffer[k++] = a; 3145 a = list[i]; 3146 } 3147 i++; 3148 polarity ^= 1; 3149 b = other[j++]; polarity ^= 2; 3150 } 3151 break; 3152 case 3: // both second; take higher if unequal, and drop other 3153 if (b <= a) { // take a 3154 if (a == HIGH) break main; 3155 buffer[k++] = a; 3156 } else { // take b 3157 if (b == HIGH) break main; 3158 buffer[k++] = b; 3159 } 3160 a = list[i++]; polarity ^= 1; // factored common code 3161 b = other[j++]; polarity ^= 2; 3162 break; 3163 case 1: // a second, b first; if b < a, overlap 3164 if (a < b) { // no overlap, take a 3165 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3166 } else if (b < a) { // OVERLAP, drop b 3167 b = other[j++]; polarity ^= 2; 3168 } else { // a == b, drop both! 3169 if (a == HIGH) break main; 3170 a = list[i++]; polarity ^= 1; 3171 b = other[j++]; polarity ^= 2; 3172 } 3173 break; 3174 case 2: // a first, b second; if a < b, overlap 3175 if (b < a) { // no overlap, take b 3176 buffer[k++] = b; b = other[j++]; polarity ^= 2; 3177 } else if (a < b) { // OVERLAP, drop a 3178 a = list[i++]; polarity ^= 1; 3179 } else { // a == b, drop both! 3180 if (a == HIGH) break main; 3181 a = list[i++]; polarity ^= 1; 3182 b = other[j++]; polarity ^= 2; 3183 } 3184 break; 3185 } 3186 } 3187 buffer[k++] = HIGH; // terminate 3188 len = k; 3189 // swap list and buffer 3190 int[] temp = list; 3191 list = buffer; 3192 buffer = temp; 3193 pat = null; 3194 return this; 3195 } 3196 3197 // polarity = 0 is normal: x intersect y 3198 // polarity = 2: x intersect ~y == set-minus 3199 // polarity = 1: ~x intersect y 3200 // polarity = 3: ~x intersect ~y 3201 retain(int[] other, int otherLen, int polarity)3202 private UnicodeSet retain(int[] other, int otherLen, int polarity) { 3203 ensureBufferCapacity(len + otherLen); 3204 int i = 0, j = 0, k = 0; 3205 int a = list[i++]; 3206 int b = other[j++]; 3207 // change from xor is that we have to check overlapping pairs 3208 // polarity bit 1 means a is second, bit 2 means b is. 3209 main: 3210 while (true) { 3211 switch (polarity) { 3212 case 0: // both first; drop the smaller 3213 if (a < b) { // drop a 3214 a = list[i++]; polarity ^= 1; 3215 } else if (b < a) { // drop b 3216 b = other[j++]; polarity ^= 2; 3217 } else { // a == b, take one, drop other 3218 if (a == HIGH) break main; 3219 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3220 b = other[j++]; polarity ^= 2; 3221 } 3222 break; 3223 case 3: // both second; take lower if unequal 3224 if (a < b) { // take a 3225 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3226 } else if (b < a) { // take b 3227 buffer[k++] = b; b = other[j++]; polarity ^= 2; 3228 } else { // a == b, take one, drop other 3229 if (a == HIGH) break main; 3230 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3231 b = other[j++]; polarity ^= 2; 3232 } 3233 break; 3234 case 1: // a second, b first; 3235 if (a < b) { // NO OVERLAP, drop a 3236 a = list[i++]; polarity ^= 1; 3237 } else if (b < a) { // OVERLAP, take b 3238 buffer[k++] = b; b = other[j++]; polarity ^= 2; 3239 } else { // a == b, drop both! 3240 if (a == HIGH) break main; 3241 a = list[i++]; polarity ^= 1; 3242 b = other[j++]; polarity ^= 2; 3243 } 3244 break; 3245 case 2: // a first, b second; if a < b, overlap 3246 if (b < a) { // no overlap, drop b 3247 b = other[j++]; polarity ^= 2; 3248 } else if (a < b) { // OVERLAP, take a 3249 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3250 } else { // a == b, drop both! 3251 if (a == HIGH) break main; 3252 a = list[i++]; polarity ^= 1; 3253 b = other[j++]; polarity ^= 2; 3254 } 3255 break; 3256 } 3257 } 3258 buffer[k++] = HIGH; // terminate 3259 len = k; 3260 // swap list and buffer 3261 int[] temp = list; 3262 list = buffer; 3263 buffer = temp; 3264 pat = null; 3265 return this; 3266 } 3267 max(int a, int b)3268 private static final int max(int a, int b) { 3269 return (a > b) ? a : b; 3270 } 3271 3272 //---------------------------------------------------------------- 3273 // Generic filter-based scanning code 3274 //---------------------------------------------------------------- 3275 3276 private static interface Filter { contains(int codePoint)3277 boolean contains(int codePoint); 3278 } 3279 3280 private static final class NumericValueFilter implements Filter { 3281 double value; NumericValueFilter(double value)3282 NumericValueFilter(double value) { this.value = value; } 3283 @Override contains(int ch)3284 public boolean contains(int ch) { 3285 return UCharacter.getUnicodeNumericValue(ch) == value; 3286 } 3287 } 3288 3289 private static final class GeneralCategoryMaskFilter implements Filter { 3290 int mask; GeneralCategoryMaskFilter(int mask)3291 GeneralCategoryMaskFilter(int mask) { this.mask = mask; } 3292 @Override contains(int ch)3293 public boolean contains(int ch) { 3294 return ((1 << UCharacter.getType(ch)) & mask) != 0; 3295 } 3296 } 3297 3298 private static final class IntPropertyFilter implements Filter { 3299 int prop; 3300 int value; IntPropertyFilter(int prop, int value)3301 IntPropertyFilter(int prop, int value) { 3302 this.prop = prop; 3303 this.value = value; 3304 } 3305 @Override contains(int ch)3306 public boolean contains(int ch) { 3307 return UCharacter.getIntPropertyValue(ch, prop) == value; 3308 } 3309 } 3310 3311 private static final class ScriptExtensionsFilter implements Filter { 3312 int script; ScriptExtensionsFilter(int script)3313 ScriptExtensionsFilter(int script) { this.script = script; } 3314 @Override contains(int c)3315 public boolean contains(int c) { 3316 return UScript.hasScript(c, script); 3317 } 3318 } 3319 3320 // VersionInfo for unassigned characters 3321 private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); 3322 3323 private static final class VersionFilter implements Filter { 3324 VersionInfo version; VersionFilter(VersionInfo version)3325 VersionFilter(VersionInfo version) { this.version = version; } 3326 @Override contains(int ch)3327 public boolean contains(int ch) { 3328 VersionInfo v = UCharacter.getAge(ch); 3329 // Reference comparison ok; VersionInfo caches and reuses 3330 // unique objects. 3331 return !Utility.sameObjects(v, NO_VERSION) && 3332 v.compareTo(version) <= 0; 3333 } 3334 } 3335 3336 /** 3337 * Generic filter-based scanning code for UCD property UnicodeSets. 3338 */ applyFilter(Filter filter, UnicodeSet inclusions)3339 private void applyFilter(Filter filter, UnicodeSet inclusions) { 3340 // Logically, walk through all Unicode characters, noting the start 3341 // and end of each range for which filter.contain(c) is 3342 // true. Add each range to a set. 3343 // 3344 // To improve performance, use an inclusions set which 3345 // encodes information about character ranges that are known 3346 // to have identical properties. 3347 // inclusions contains the first characters of 3348 // same-value ranges for the given property. 3349 3350 clear(); 3351 3352 int startHasProperty = -1; 3353 int limitRange = inclusions.getRangeCount(); 3354 3355 for (int j=0; j<limitRange; ++j) { 3356 // get current range 3357 int start = inclusions.getRangeStart(j); 3358 int end = inclusions.getRangeEnd(j); 3359 3360 // for all the code points in the range, process 3361 for (int ch = start; ch <= end; ++ch) { 3362 // only add to the unicodeset on inflection points -- 3363 // where the hasProperty value changes to false 3364 if (filter.contains(ch)) { 3365 if (startHasProperty < 0) { 3366 startHasProperty = ch; 3367 } 3368 } else if (startHasProperty >= 0) { 3369 add_unchecked(startHasProperty, ch-1); 3370 startHasProperty = -1; 3371 } 3372 } 3373 } 3374 if (startHasProperty >= 0) { 3375 add_unchecked(startHasProperty, 0x10FFFF); 3376 } 3377 } 3378 3379 /** 3380 * Remove leading and trailing Pattern_White_Space and compress 3381 * internal Pattern_White_Space to a single space character. 3382 */ mungeCharName(String source)3383 private static String mungeCharName(String source) { 3384 source = PatternProps.trimWhiteSpace(source); 3385 StringBuilder buf = null; 3386 for (int i=0; i<source.length(); ++i) { 3387 char ch = source.charAt(i); 3388 if (PatternProps.isWhiteSpace(ch)) { 3389 if (buf == null) { 3390 buf = new StringBuilder().append(source, 0, i); 3391 } else if (buf.charAt(buf.length() - 1) == ' ') { 3392 continue; 3393 } 3394 ch = ' '; // convert to ' ' 3395 } 3396 if (buf != null) { 3397 buf.append(ch); 3398 } 3399 } 3400 return buf == null ? source : buf.toString(); 3401 } 3402 3403 //---------------------------------------------------------------- 3404 // Property set API 3405 //---------------------------------------------------------------- 3406 3407 /** 3408 * Modifies this set to contain those code points which have the 3409 * given value for the given binary or enumerated property, as 3410 * returned by UCharacter.getIntPropertyValue. Prior contents of 3411 * this set are lost. 3412 * 3413 * @param prop a property in the range 3414 * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or 3415 * UProperty.INT_START..UProperty.INT_LIMIT-1 or. 3416 * UProperty.MASK_START..UProperty.MASK_LIMIT-1. 3417 * 3418 * @param value a value in the range 3419 * UCharacter.getIntPropertyMinValue(prop).. 3420 * UCharacter.getIntPropertyMaxValue(prop), with one exception. 3421 * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be 3422 * a UCharacter.getType() result, but rather a mask value produced 3423 * by logically ORing (1 << UCharacter.getType()) values together. 3424 * This allows grouped categories such as [:L:] to be represented. 3425 * 3426 * @return a reference to this set 3427 * 3428 * @stable ICU 2.4 3429 */ applyIntPropertyValue(int prop, int value)3430 public UnicodeSet applyIntPropertyValue(int prop, int value) { 3431 // All of the following include checkFrozen() before modifying this set. 3432 if (prop == UProperty.GENERAL_CATEGORY_MASK) { 3433 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3434 applyFilter(new GeneralCategoryMaskFilter(value), inclusions); 3435 } else if (prop == UProperty.SCRIPT_EXTENSIONS) { 3436 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3437 applyFilter(new ScriptExtensionsFilter(value), inclusions); 3438 } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) { 3439 if (value == 0 || value == 1) { 3440 set(CharacterProperties.getBinaryPropertySet(prop)); 3441 if (value == 0) { 3442 complement(); 3443 } 3444 } else { 3445 clear(); 3446 } 3447 } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) { 3448 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3449 applyFilter(new IntPropertyFilter(prop, value), inclusions); 3450 } else { 3451 throw new IllegalArgumentException("unsupported property " + prop); 3452 } 3453 return this; 3454 } 3455 3456 3457 3458 /** 3459 * Modifies this set to contain those code points which have the 3460 * given value for the given property. Prior contents of this 3461 * set are lost. 3462 * 3463 * @param propertyAlias a property alias, either short or long. 3464 * The name is matched loosely. See PropertyAliases.txt for names 3465 * and a description of loose matching. If the value string is 3466 * empty, then this string is interpreted as either a 3467 * General_Category value alias, a Script value alias, a binary 3468 * property alias, or a special ID. Special IDs are matched 3469 * loosely and correspond to the following sets: 3470 * 3471 * "ANY" = [\\u0000-\\U0010FFFF], 3472 * "ASCII" = [\\u0000-\\u007F]. 3473 * 3474 * @param valueAlias a value alias, either short or long. The 3475 * name is matched loosely. See PropertyValueAliases.txt for 3476 * names and a description of loose matching. In addition to 3477 * aliases listed, numeric values and canonical combining classes 3478 * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc", 3479 * "220"). The value string may also be empty. 3480 * 3481 * @return a reference to this set 3482 * 3483 * @stable ICU 2.4 3484 */ applyPropertyAlias(String propertyAlias, String valueAlias)3485 public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) { 3486 return applyPropertyAlias(propertyAlias, valueAlias, null); 3487 } 3488 3489 /** 3490 * Modifies this set to contain those code points which have the 3491 * given value for the given property. Prior contents of this 3492 * set are lost. 3493 * @param propertyAlias A string of the property alias. 3494 * @param valueAlias A string of the value alias. 3495 * @param symbols if not null, then symbols are first called to see if a property 3496 * is available. If true, then everything else is skipped. 3497 * @return this set 3498 * @stable ICU 3.2 3499 */ applyPropertyAlias(String propertyAlias, String valueAlias, SymbolTable symbols)3500 public UnicodeSet applyPropertyAlias(String propertyAlias, 3501 String valueAlias, SymbolTable symbols) { 3502 checkFrozen(); 3503 int p; 3504 int v; 3505 boolean invert = false; 3506 3507 if (symbols != null 3508 && (symbols instanceof XSymbolTable) 3509 && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) { 3510 return this; 3511 } 3512 3513 if (XSYMBOL_TABLE != null) { 3514 if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) { 3515 return this; 3516 } 3517 } 3518 3519 if (valueAlias.length() > 0) { 3520 p = UCharacter.getPropertyEnum(propertyAlias); 3521 3522 // Treat gc as gcm 3523 if (p == UProperty.GENERAL_CATEGORY) { 3524 p = UProperty.GENERAL_CATEGORY_MASK; 3525 } 3526 3527 if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) || 3528 (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) || 3529 (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) { 3530 try { 3531 v = UCharacter.getPropertyValueEnum(p, valueAlias); 3532 } catch (IllegalArgumentException e) { 3533 // Handle numeric CCC 3534 if (p == UProperty.CANONICAL_COMBINING_CLASS || 3535 p == UProperty.LEAD_CANONICAL_COMBINING_CLASS || 3536 p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) { 3537 v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias)); 3538 // Anything between 0 and 255 is valid even if unused. 3539 if (v < 0 || v > 255) throw e; 3540 } else { 3541 throw e; 3542 } 3543 } 3544 } 3545 3546 else { 3547 switch (p) { 3548 case UProperty.NUMERIC_VALUE: 3549 { 3550 double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias)); 3551 applyFilter(new NumericValueFilter(value), 3552 CharacterPropertiesImpl.getInclusionsForProperty(p)); 3553 return this; 3554 } 3555 case UProperty.NAME: 3556 { 3557 // Must munge name, since 3558 // UCharacter.charFromName() does not do 3559 // 'loose' matching. 3560 String buf = mungeCharName(valueAlias); 3561 int ch = UCharacter.getCharFromExtendedName(buf); 3562 if (ch == -1) { 3563 throw new IllegalArgumentException("Invalid character name"); 3564 } 3565 clear(); 3566 add_unchecked(ch); 3567 return this; 3568 } 3569 case UProperty.UNICODE_1_NAME: 3570 // ICU 49 deprecates the Unicode_1_Name property APIs. 3571 throw new IllegalArgumentException("Unicode_1_Name (na1) not supported"); 3572 case UProperty.AGE: 3573 { 3574 // Must munge name, since 3575 // VersionInfo.getInstance() does not do 3576 // 'loose' matching. 3577 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias)); 3578 applyFilter(new VersionFilter(version), 3579 CharacterPropertiesImpl.getInclusionsForProperty(p)); 3580 return this; 3581 } 3582 case UProperty.SCRIPT_EXTENSIONS: 3583 v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias); 3584 // fall through to calling applyIntPropertyValue() 3585 break; 3586 default: 3587 // p is a non-binary, non-enumerated property that we 3588 // don't support (yet). 3589 throw new IllegalArgumentException("Unsupported property"); 3590 } 3591 } 3592 } 3593 3594 else { 3595 // valueAlias is empty. Interpret as General Category, Script, 3596 // Binary property, or ANY or ASCII. Upon success, p and v will 3597 // be set. 3598 UPropertyAliases pnames = UPropertyAliases.INSTANCE; 3599 p = UProperty.GENERAL_CATEGORY_MASK; 3600 v = pnames.getPropertyValueEnum(p, propertyAlias); 3601 if (v == UProperty.UNDEFINED) { 3602 p = UProperty.SCRIPT; 3603 v = pnames.getPropertyValueEnum(p, propertyAlias); 3604 if (v == UProperty.UNDEFINED) { 3605 p = pnames.getPropertyEnum(propertyAlias); 3606 if (p == UProperty.UNDEFINED) { 3607 p = -1; 3608 } 3609 if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) { 3610 v = 1; 3611 } else if (p == -1) { 3612 if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) { 3613 set(MIN_VALUE, MAX_VALUE); 3614 return this; 3615 } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) { 3616 set(0, 0x7F); 3617 return this; 3618 } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) { 3619 // [:Assigned:]=[:^Cn:] 3620 p = UProperty.GENERAL_CATEGORY_MASK; 3621 v = (1<<UCharacter.UNASSIGNED); 3622 invert = true; 3623 } else { 3624 // Property name was never matched. 3625 throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias); 3626 } 3627 } else { 3628 // Valid propery name, but it isn't binary, so the value 3629 // must be supplied. 3630 throw new IllegalArgumentException("Missing property value"); 3631 } 3632 } 3633 } 3634 } 3635 3636 applyIntPropertyValue(p, v); 3637 if(invert) { 3638 complement(); 3639 } 3640 3641 return this; 3642 } 3643 3644 //---------------------------------------------------------------- 3645 // Property set patterns 3646 //---------------------------------------------------------------- 3647 3648 /** 3649 * Return true if the given position, in the given pattern, appears 3650 * to be the start of a property set pattern. 3651 */ resemblesPropertyPattern(String pattern, int pos)3652 private static boolean resemblesPropertyPattern(String pattern, int pos) { 3653 // Patterns are at least 5 characters long 3654 if ((pos+5) > pattern.length()) { 3655 return false; 3656 } 3657 3658 // Look for an opening [:, [:^, \p, or \P 3659 return pattern.regionMatches(pos, "[:", 0, 2) || 3660 pattern.regionMatches(true, pos, "\\p", 0, 2) || 3661 pattern.regionMatches(pos, "\\N", 0, 2); 3662 } 3663 3664 /** 3665 * Return true if the given iterator appears to point at a 3666 * property pattern. Regardless of the result, return with the 3667 * iterator unchanged. 3668 * @param chars iterator over the pattern characters. Upon return 3669 * it will be unchanged. 3670 * @param iterOpts RuleCharacterIterator options 3671 */ resemblesPropertyPattern(RuleCharacterIterator chars, int iterOpts)3672 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, 3673 int iterOpts) { 3674 boolean result = false; 3675 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; 3676 Object pos = chars.getPos(null); 3677 int c = chars.next(iterOpts); 3678 if (c == '[' || c == '\\') { 3679 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); 3680 result = (c == '[') ? (d == ':') : 3681 (d == 'N' || d == 'p' || d == 'P'); 3682 } 3683 chars.setPos(pos); 3684 return result; 3685 } 3686 3687 /** 3688 * Parse the given property pattern at the given parse position. 3689 * @param symbols TODO 3690 */ applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols)3691 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { 3692 int pos = ppos.getIndex(); 3693 3694 // On entry, ppos should point to one of the following locations: 3695 3696 // Minimum length is 5 characters, e.g. \p{L} 3697 if ((pos+5) > pattern.length()) { 3698 return null; 3699 } 3700 3701 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 3702 boolean isName = false; // true for \N{pat}, o/w false 3703 boolean invert = false; 3704 3705 // Look for an opening [:, [:^, \p, or \P 3706 if (pattern.regionMatches(pos, "[:", 0, 2)) { 3707 posix = true; 3708 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3709 if (pos < pattern.length() && pattern.charAt(pos) == '^') { 3710 ++pos; 3711 invert = true; 3712 } 3713 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || 3714 pattern.regionMatches(pos, "\\N", 0, 2)) { 3715 char c = pattern.charAt(pos+1); 3716 invert = (c == 'P'); 3717 isName = (c == 'N'); 3718 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3719 if (pos == pattern.length() || pattern.charAt(pos++) != '{') { 3720 // Syntax error; "\p" or "\P" not followed by "{" 3721 return null; 3722 } 3723 } else { 3724 // Open delimiter not seen 3725 return null; 3726 } 3727 3728 // Look for the matching close delimiter, either :] or } 3729 int close = pattern.indexOf(posix ? ":]" : "}", pos); 3730 if (close < 0) { 3731 // Syntax error; close delimiter missing 3732 return null; 3733 } 3734 3735 // Look for an '=' sign. If this is present, we will parse a 3736 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 3737 // pattern. 3738 int equals = pattern.indexOf('=', pos); 3739 String propName, valueName; 3740 if (equals >= 0 && equals < close && !isName) { 3741 // Equals seen; parse medium/long pattern 3742 propName = pattern.substring(pos, equals); 3743 valueName = pattern.substring(equals+1, close); 3744 } 3745 3746 else { 3747 // Handle case where no '=' is seen, and \N{} 3748 propName = pattern.substring(pos, close); 3749 valueName = ""; 3750 3751 // Handle \N{name} 3752 if (isName) { 3753 // This is a little inefficient since it means we have to 3754 // parse "na" back to UProperty.NAME even though we already 3755 // know it's UProperty.NAME. If we refactor the API to 3756 // support args of (int, String) then we can remove 3757 // "na" and make this a little more efficient. 3758 valueName = propName; 3759 propName = "na"; 3760 } 3761 } 3762 3763 applyPropertyAlias(propName, valueName, symbols); 3764 3765 if (invert) { 3766 complement(); 3767 } 3768 3769 // Move to the limit position after the close delimiter 3770 ppos.setIndex(close + (posix ? 2 : 1)); 3771 3772 return this; 3773 } 3774 3775 /** 3776 * Parse a property pattern. 3777 * @param chars iterator over the pattern characters. Upon return 3778 * it will be advanced to the first character after the parsed 3779 * pattern, or the end of the iteration if all characters are 3780 * parsed. 3781 * @param rebuiltPat the pattern that was parsed, rebuilt or 3782 * copied from the input pattern, as appropriate. 3783 * @param symbols TODO 3784 */ applyPropertyPattern(RuleCharacterIterator chars, Appendable rebuiltPat, SymbolTable symbols)3785 private void applyPropertyPattern(RuleCharacterIterator chars, 3786 Appendable rebuiltPat, SymbolTable symbols) { 3787 String patStr = chars.lookahead(); 3788 ParsePosition pos = new ParsePosition(0); 3789 applyPropertyPattern(patStr, pos, symbols); 3790 if (pos.getIndex() == 0) { 3791 syntaxError(chars, "Invalid property pattern"); 3792 } 3793 chars.jumpahead(pos.getIndex()); 3794 append(rebuiltPat, patStr.substring(0, pos.getIndex())); 3795 } 3796 3797 //---------------------------------------------------------------- 3798 // Case folding API 3799 //---------------------------------------------------------------- 3800 3801 /** 3802 * Bitmask for constructor and applyPattern() indicating that 3803 * white space should be ignored. If set, ignore Unicode Pattern_White_Space characters, 3804 * unless they are quoted or escaped. This may be ORed together 3805 * with other selectors. 3806 * @stable ICU 3.8 3807 */ 3808 public static final int IGNORE_SPACE = 1; 3809 3810 /** 3811 * Bitmask for constructor, applyPattern(), and closeOver() 3812 * indicating letter case. This may be ORed together with other 3813 * selectors. 3814 * 3815 * Enable case insensitive matching. E.g., "[ab]" with this flag 3816 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3817 * match all except 'a', 'A', 'b', and 'B'. This performs a full 3818 * closure over case mappings, e.g. U+017F for s. 3819 * 3820 * The resulting set is a superset of the input for the code points but 3821 * not for the strings. 3822 * It performs a case mapping closure of the code points and adds 3823 * full case folding strings for the code points, and reduces strings of 3824 * the original set to their full case folding equivalents. 3825 * 3826 * This is designed for case-insensitive matches, for example 3827 * in regular expressions. The full code point case closure allows checking of 3828 * an input character directly against the closure set. 3829 * Strings are matched by comparing the case-folded form from the closure 3830 * set with an incremental case folding of the string in question. 3831 * 3832 * The closure set will also contain single code points if the original 3833 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 3834 * This is not necessary (that is, redundant) for the above matching method 3835 * but results in the same closure sets regardless of whether the original 3836 * set contained the code point or a string. 3837 * @stable ICU 3.8 3838 */ 3839 public static final int CASE = 2; 3840 3841 /** 3842 * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C 3843 * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h). 3844 * @see #CASE 3845 * @stable ICU 3.4 3846 */ 3847 public static final int CASE_INSENSITIVE = 2; 3848 3849 /** 3850 * Bitmask for constructor, applyPattern(), and closeOver() 3851 * indicating letter case. This may be ORed together with other 3852 * selectors. 3853 * 3854 * Enable case insensitive matching. E.g., "[ab]" with this flag 3855 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3856 * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, 3857 * title-, and uppercase mappings as well as the case folding 3858 * of each existing element in the set. 3859 * @stable ICU 3.4 3860 */ 3861 public static final int ADD_CASE_MAPPINGS = 4; 3862 3863 // add the result of a full case mapping to the set 3864 // use str as a temporary string to avoid constructing one addCaseMapping(UnicodeSet set, int result, StringBuilder full)3865 private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) { 3866 if(result >= 0) { 3867 if(result > UCaseProps.MAX_STRING_LENGTH) { 3868 // add a single-code point case mapping 3869 set.add(result); 3870 } else { 3871 // add a string case mapping from full with length result 3872 set.add(full.toString()); 3873 full.setLength(0); 3874 } 3875 } 3876 // result < 0: the code point mapped to itself, no need to add it 3877 // see UCaseProps 3878 } 3879 3880 /** 3881 * Close this set over the given attribute. For the attribute 3882 * CASE, the result is to modify this set so that: 3883 * 3884 * 1. For each character or string 'a' in this set, all strings 3885 * 'b' such that foldCase(a) == foldCase(b) are added to this set. 3886 * (For most 'a' that are single characters, 'b' will have 3887 * b.length() == 1.) 3888 * 3889 * 2. For each string 'e' in the resulting set, if e != 3890 * foldCase(e), 'e' will be removed. 3891 * 3892 * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] 3893 * 3894 * (Here foldCase(x) refers to the operation 3895 * UCharacter.foldCase(x, true), and a == b actually denotes 3896 * a.equals(b), not pointer comparison.) 3897 * 3898 * @param attribute bitmask for attributes to close over. 3899 * Currently only the CASE bit is supported. Any undefined bits 3900 * are ignored. 3901 * @return a reference to this set. 3902 * @stable ICU 3.8 3903 */ closeOver(int attribute)3904 public UnicodeSet closeOver(int attribute) { 3905 checkFrozen(); 3906 if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) { 3907 UCaseProps csp = UCaseProps.INSTANCE; 3908 UnicodeSet foldSet = new UnicodeSet(this); 3909 ULocale root = ULocale.ROOT; 3910 3911 // start with input set to guarantee inclusion 3912 // CASE: remove strings because the strings will actually be reduced (folded); 3913 // therefore, start with no strings and add only those needed 3914 if((attribute & CASE) != 0 && foldSet.hasStrings()) { 3915 foldSet.strings.clear(); 3916 } 3917 3918 int n = getRangeCount(); 3919 int result; 3920 StringBuilder full = new StringBuilder(); 3921 3922 for (int i=0; i<n; ++i) { 3923 int start = getRangeStart(i); 3924 int end = getRangeEnd(i); 3925 3926 if((attribute & CASE) != 0) { 3927 // full case closure 3928 for (int cp=start; cp<=end; ++cp) { 3929 csp.addCaseClosure(cp, foldSet); 3930 } 3931 } else { 3932 // add case mappings 3933 // (does not add long s for regular s, or Kelvin for k, for example) 3934 for (int cp=start; cp<=end; ++cp) { 3935 result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); 3936 addCaseMapping(foldSet, result, full); 3937 3938 result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); 3939 addCaseMapping(foldSet, result, full); 3940 3941 result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); 3942 addCaseMapping(foldSet, result, full); 3943 3944 result = csp.toFullFolding(cp, full, 0); 3945 addCaseMapping(foldSet, result, full); 3946 } 3947 } 3948 } 3949 if (hasStrings()) { 3950 if ((attribute & CASE) != 0) { 3951 for (String s : strings) { 3952 String str = UCharacter.foldCase(s, 0); 3953 if(!csp.addStringCaseClosure(str, foldSet)) { 3954 foldSet.add(str); // does not map to code points: add the folded string itself 3955 } 3956 } 3957 } else { 3958 BreakIterator bi = BreakIterator.getWordInstance(root); 3959 for (String str : strings) { 3960 // TODO: call lower-level functions 3961 foldSet.add(UCharacter.toLowerCase(root, str)); 3962 foldSet.add(UCharacter.toTitleCase(root, str, bi)); 3963 foldSet.add(UCharacter.toUpperCase(root, str)); 3964 foldSet.add(UCharacter.foldCase(str, 0)); 3965 } 3966 } 3967 } 3968 set(foldSet); 3969 } 3970 return this; 3971 } 3972 3973 /** 3974 * Internal class for customizing UnicodeSet parsing of properties. 3975 * TODO: extend to allow customizing of codepoint ranges 3976 * @draft ICU3.8 (retain) 3977 * @author medavis 3978 */ 3979 abstract public static class XSymbolTable implements SymbolTable { 3980 /** 3981 * Default constructor 3982 * @draft ICU3.8 (retain) 3983 */ XSymbolTable()3984 public XSymbolTable(){} 3985 /** 3986 * Supplies default implementation for SymbolTable (no action). 3987 * @draft ICU3.8 (retain) 3988 */ 3989 @Override lookupMatcher(int i)3990 public UnicodeMatcher lookupMatcher(int i) { 3991 return null; 3992 } 3993 3994 /** 3995 * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style 3996 * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be. 3997 * <p> 3998 * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a 3999 * propertyName+propertyValue combination. 4000 * 4001 * @param propertyName 4002 * the name of the property 4003 * @param propertyValue 4004 * the name of the property value 4005 * @param result UnicodeSet value to change 4006 * a set to which the characters having the propertyName+propertyValue are to be added. 4007 * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters 4008 * with that property have been added to the UnicodeSet, and returns false if the 4009 * propertyName+propertyValue combination is not recognized (in which case result is unaltered). 4010 * @draft ICU3.8 (retain) 4011 */ applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result)4012 public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { 4013 return false; 4014 } 4015 /** 4016 * Supplies default implementation for SymbolTable (no action). 4017 * @draft ICU3.8 (retain) 4018 */ 4019 @Override lookup(String s)4020 public char[] lookup(String s) { 4021 return null; 4022 } 4023 /** 4024 * Supplies default implementation for SymbolTable (no action). 4025 * @draft ICU3.8 (retain) 4026 */ 4027 @Override parseReference(String text, ParsePosition pos, int limit)4028 public String parseReference(String text, ParsePosition pos, int limit) { 4029 return null; 4030 } 4031 } 4032 4033 /** 4034 * Is this frozen, according to the Freezable interface? 4035 * 4036 * @return value 4037 * @stable ICU 3.8 4038 */ 4039 @Override isFrozen()4040 public boolean isFrozen() { 4041 return (bmpSet != null || stringSpan != null); 4042 } 4043 4044 /** 4045 * Freeze this class, according to the Freezable interface. 4046 * 4047 * @return this 4048 * @stable ICU 4.4 4049 */ 4050 @Override freeze()4051 public UnicodeSet freeze() { 4052 if (!isFrozen()) { 4053 compact(); 4054 4055 // Optimize contains() and span() and similar functions. 4056 if (hasStrings()) { 4057 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), UnicodeSetStringSpan.ALL); 4058 } 4059 if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { 4060 // Optimize for code point spans. 4061 // There are no strings, or 4062 // all strings are irrelevant for span() etc. because 4063 // all of each string's code points are contained in this set. 4064 // However, fully contained strings are relevant for spanAndCount(), 4065 // so we create both objects. 4066 bmpSet = new BMPSet(list, len); 4067 } 4068 } 4069 return this; 4070 } 4071 4072 /** 4073 * Span a string using this UnicodeSet. 4074 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4075 * @param s The string to be spanned 4076 * @param spanCondition The span condition 4077 * @return the length of the span 4078 * @stable ICU 4.4 4079 */ span(CharSequence s, SpanCondition spanCondition)4080 public int span(CharSequence s, SpanCondition spanCondition) { 4081 return span(s, 0, spanCondition); 4082 } 4083 4084 /** 4085 * Span a string using this UnicodeSet. 4086 * If the start index is less than 0, span will start from 0. 4087 * If the start index is greater than the string length, span returns the string length. 4088 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4089 * @param s The string to be spanned 4090 * @param start The start index that the span begins 4091 * @param spanCondition The span condition 4092 * @return the string index which ends the span (i.e. exclusive) 4093 * @stable ICU 4.4 4094 */ span(CharSequence s, int start, SpanCondition spanCondition)4095 public int span(CharSequence s, int start, SpanCondition spanCondition) { 4096 int end = s.length(); 4097 if (start < 0) { 4098 start = 0; 4099 } else if (start >= end) { 4100 return end; 4101 } 4102 if (bmpSet != null) { 4103 // Frozen set without strings, or no string is relevant for span(). 4104 return bmpSet.span(s, start, spanCondition, null); 4105 } 4106 if (stringSpan != null) { 4107 return stringSpan.span(s, start, spanCondition); 4108 } else if (hasStrings()) { 4109 int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4110 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4111 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4112 if (strSpan.needsStringSpanUTF16()) { 4113 return strSpan.span(s, start, spanCondition); 4114 } 4115 } 4116 4117 return spanCodePointsAndCount(s, start, spanCondition, null); 4118 } 4119 4120 /** 4121 * Same as span() but also counts the smallest number of set elements on any path across the span. 4122 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4123 * @param outCount An output-only object (must not be null) for returning the count. 4124 * @return the limit (exclusive end) of the span 4125 * @internal 4126 * @deprecated This API is ICU internal only. 4127 */ 4128 @Deprecated spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4129 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { 4130 if (outCount == null) { 4131 throw new IllegalArgumentException("outCount must not be null"); 4132 } 4133 int end = s.length(); 4134 if (start < 0) { 4135 start = 0; 4136 } else if (start >= end) { 4137 return end; 4138 } 4139 if (stringSpan != null) { 4140 // We might also have bmpSet != null, 4141 // but fully-contained strings are relevant for counting elements. 4142 return stringSpan.spanAndCount(s, start, spanCondition, outCount); 4143 } else if (bmpSet != null) { 4144 return bmpSet.span(s, start, spanCondition, outCount); 4145 } else if (hasStrings()) { 4146 int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4147 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4148 which |= UnicodeSetStringSpan.WITH_COUNT; 4149 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4150 return strSpan.spanAndCount(s, start, spanCondition, outCount); 4151 } 4152 4153 return spanCodePointsAndCount(s, start, spanCondition, outCount); 4154 } 4155 spanCodePointsAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4156 private int spanCodePointsAndCount(CharSequence s, int start, 4157 SpanCondition spanCondition, OutputInt outCount) { 4158 // Pin to 0/1 values. 4159 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4160 4161 int c; 4162 int next = start; 4163 int length = s.length(); 4164 int count = 0; 4165 do { 4166 c = Character.codePointAt(s, next); 4167 if (spanContained != contains(c)) { 4168 break; 4169 } 4170 ++count; 4171 next += Character.charCount(c); 4172 } while (next < length); 4173 if (outCount != null) { outCount.value = count; } 4174 return next; 4175 } 4176 4177 /** 4178 * Span a string backwards (from the end) using this UnicodeSet. 4179 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4180 * @param s The string to be spanned 4181 * @param spanCondition The span condition 4182 * @return The string index which starts the span (i.e. inclusive). 4183 * @stable ICU 4.4 4184 */ spanBack(CharSequence s, SpanCondition spanCondition)4185 public int spanBack(CharSequence s, SpanCondition spanCondition) { 4186 return spanBack(s, s.length(), spanCondition); 4187 } 4188 4189 /** 4190 * Span a string backwards (from the fromIndex) using this UnicodeSet. 4191 * If the fromIndex is less than 0, spanBack will return 0. 4192 * If fromIndex is greater than the string length, spanBack will start from the string length. 4193 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4194 * @param s The string to be spanned 4195 * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards 4196 * @param spanCondition The span condition 4197 * @return The string index which starts the span (i.e. inclusive). 4198 * @stable ICU 4.4 4199 */ spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition)4200 public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { 4201 if (fromIndex <= 0) { 4202 return 0; 4203 } 4204 if (fromIndex > s.length()) { 4205 fromIndex = s.length(); 4206 } 4207 if (bmpSet != null) { 4208 // Frozen set without strings, or no string is relevant for spanBack(). 4209 return bmpSet.spanBack(s, fromIndex, spanCondition); 4210 } 4211 if (stringSpan != null) { 4212 return stringSpan.spanBack(s, fromIndex, spanCondition); 4213 } else if (hasStrings()) { 4214 int which = (spanCondition == SpanCondition.NOT_CONTAINED) 4215 ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED 4216 : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; 4217 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4218 if (strSpan.needsStringSpanUTF16()) { 4219 return strSpan.spanBack(s, fromIndex, spanCondition); 4220 } 4221 } 4222 4223 // Pin to 0/1 values. 4224 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4225 4226 int c; 4227 int prev = fromIndex; 4228 do { 4229 c = Character.codePointBefore(s, prev); 4230 if (spanContained != contains(c)) { 4231 break; 4232 } 4233 prev -= Character.charCount(c); 4234 } while (prev > 0); 4235 return prev; 4236 } 4237 4238 /** 4239 * Clone a thawed version of this class, according to the Freezable interface. 4240 * @return the clone, not frozen 4241 * @stable ICU 4.4 4242 */ 4243 @Override cloneAsThawed()4244 public UnicodeSet cloneAsThawed() { 4245 UnicodeSet result = new UnicodeSet(this); 4246 assert !result.isFrozen(); 4247 return result; 4248 } 4249 4250 // internal function checkFrozen()4251 private void checkFrozen() { 4252 if (isFrozen()) { 4253 throw new UnsupportedOperationException("Attempt to modify frozen object"); 4254 } 4255 } 4256 4257 // ************************ 4258 // Additional methods for integration with Generics and Collections 4259 // ************************ 4260 4261 /** 4262 * A struct-like class used for iteration through ranges, for faster iteration than by String. 4263 * Read about the restrictions on usage in {@link UnicodeSet#ranges()}. 4264 * 4265 * @stable ICU 54 4266 */ 4267 public static class EntryRange { 4268 /** 4269 * The starting code point of the range. 4270 * 4271 * @stable ICU 54 4272 */ 4273 public int codepoint; 4274 /** 4275 * The ending code point of the range 4276 * 4277 * @stable ICU 54 4278 */ 4279 public int codepointEnd; 4280 EntryRange()4281 EntryRange() { 4282 } 4283 4284 /** 4285 * {@inheritDoc} 4286 * 4287 * @stable ICU 54 4288 */ 4289 @Override toString()4290 public String toString() { 4291 StringBuilder b = new StringBuilder(); 4292 return ( 4293 codepoint == codepointEnd ? _appendToPat(b, codepoint, false) 4294 : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false)) 4295 .toString(); 4296 } 4297 } 4298 4299 /** 4300 * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points. 4301 * The UnicodeSet must not be altered during the iteration. 4302 * The EntryRange instance is the same each time; the contents are just reset. 4303 * 4304 * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings. 4305 * 4306 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4307 * Do not alter the UnicodeSet while iterating. 4308 * 4309 * <pre> 4310 * // Sample code 4311 * for (EntryRange range : us1.ranges()) { 4312 * // do something with code points between range.codepoint and range.codepointEnd; 4313 * } 4314 * for (String s : us1.strings()) { 4315 * // do something with each string; 4316 * } 4317 * </pre> 4318 * 4319 * @stable ICU 54 4320 */ ranges()4321 public Iterable<EntryRange> ranges() { 4322 return new EntryRangeIterable(); 4323 } 4324 4325 private class EntryRangeIterable implements Iterable<EntryRange> { 4326 @Override iterator()4327 public Iterator<EntryRange> iterator() { 4328 return new EntryRangeIterator(); 4329 } 4330 } 4331 4332 private class EntryRangeIterator implements Iterator<EntryRange> { 4333 int pos; 4334 EntryRange result = new EntryRange(); 4335 4336 @Override hasNext()4337 public boolean hasNext() { 4338 return pos < len-1; 4339 } 4340 @Override next()4341 public EntryRange next() { 4342 if (pos < len-1) { 4343 result.codepoint = list[pos++]; 4344 result.codepointEnd = list[pos++]-1; 4345 } else { 4346 throw new NoSuchElementException(); 4347 } 4348 return result; 4349 } 4350 @Override remove()4351 public void remove() { 4352 throw new UnsupportedOperationException(); 4353 } 4354 } 4355 4356 4357 /** 4358 * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}. 4359 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4360 * Do not alter the UnicodeSet while iterating. 4361 * @see java.util.Set#iterator() 4362 * @stable ICU 4.4 4363 */ 4364 @Override iterator()4365 public Iterator<String> iterator() { 4366 return new UnicodeSetIterator2(this); 4367 } 4368 4369 // Cover for string iteration. 4370 private static class UnicodeSetIterator2 implements Iterator<String> { 4371 // Invariants: 4372 // sourceList != null then sourceList[item] is a valid character 4373 // sourceList == null then delegates to stringIterator 4374 private int[] sourceList; 4375 private int len; 4376 private int item; 4377 private int current; 4378 private int limit; 4379 private SortedSet<String> sourceStrings; 4380 private Iterator<String> stringIterator; 4381 private char[] buffer; 4382 UnicodeSetIterator2(UnicodeSet source)4383 UnicodeSetIterator2(UnicodeSet source) { 4384 // set according to invariants 4385 len = source.len - 1; 4386 if (len > 0) { 4387 sourceStrings = source.strings; 4388 sourceList = source.list; 4389 current = sourceList[item++]; 4390 limit = sourceList[item++]; 4391 } else { 4392 stringIterator = source.strings.iterator(); 4393 sourceList = null; 4394 } 4395 } 4396 4397 /* (non-Javadoc) 4398 * @see java.util.Iterator#hasNext() 4399 */ 4400 @Override hasNext()4401 public boolean hasNext() { 4402 return sourceList != null || stringIterator.hasNext(); 4403 } 4404 4405 /* (non-Javadoc) 4406 * @see java.util.Iterator#next() 4407 */ 4408 @Override next()4409 public String next() { 4410 if (sourceList == null) { 4411 return stringIterator.next(); 4412 } 4413 int codepoint = current++; 4414 // we have the codepoint we need, but we may need to adjust the state 4415 if (current >= limit) { 4416 if (item >= len) { 4417 stringIterator = sourceStrings.iterator(); 4418 sourceList = null; 4419 } else { 4420 current = sourceList[item++]; 4421 limit = sourceList[item++]; 4422 } 4423 } 4424 // Now return. Single code point is easy 4425 if (codepoint <= 0xFFFF) { 4426 return String.valueOf((char)codepoint); 4427 } 4428 // But Java lacks a valueOfCodePoint, so we handle ourselves for speed 4429 // allocate a buffer the first time, to make conversion faster. 4430 if (buffer == null) { 4431 buffer = new char[2]; 4432 } 4433 // compute ourselves, to save tests and calls 4434 int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT; 4435 buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE); 4436 buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE); 4437 return String.valueOf(buffer); 4438 } 4439 4440 /* (non-Javadoc) 4441 * @see java.util.Iterator#remove() 4442 */ 4443 @Override remove()4444 public void remove() { 4445 throw new UnsupportedOperationException(); 4446 } 4447 } 4448 4449 /** 4450 * @see #containsAll(com.ibm.icu.text.UnicodeSet) 4451 * @stable ICU 4.4 4452 */ containsAll(Iterable<T> collection)4453 public <T extends CharSequence> boolean containsAll(Iterable<T> collection) { 4454 for (T o : collection) { 4455 if (!contains(o)) { 4456 return false; 4457 } 4458 } 4459 return true; 4460 } 4461 4462 /** 4463 * @see #containsNone(com.ibm.icu.text.UnicodeSet) 4464 * @stable ICU 4.4 4465 */ containsNone(Iterable<T> collection)4466 public <T extends CharSequence> boolean containsNone(Iterable<T> collection) { 4467 for (T o : collection) { 4468 if (contains(o)) { 4469 return false; 4470 } 4471 } 4472 return true; 4473 } 4474 4475 /** 4476 * @see #containsAll(com.ibm.icu.text.UnicodeSet) 4477 * @stable ICU 4.4 4478 */ containsSome(Iterable<T> collection)4479 public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) { 4480 return !containsNone(collection); 4481 } 4482 4483 /** 4484 * @see #addAll(com.ibm.icu.text.UnicodeSet) 4485 * @stable ICU 4.4 4486 */ 4487 @SuppressWarnings("unchecked") // See ticket #11395, this is safe. addAll(T... collection)4488 public <T extends CharSequence> UnicodeSet addAll(T... collection) { 4489 checkFrozen(); 4490 for (T str : collection) { 4491 add(str); 4492 } 4493 return this; 4494 } 4495 4496 4497 /** 4498 * @see #removeAll(com.ibm.icu.text.UnicodeSet) 4499 * @stable ICU 4.4 4500 */ removeAll(Iterable<T> collection)4501 public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) { 4502 checkFrozen(); 4503 for (T o : collection) { 4504 remove(o); 4505 } 4506 return this; 4507 } 4508 4509 /** 4510 * @see #retainAll(com.ibm.icu.text.UnicodeSet) 4511 * @stable ICU 4.4 4512 */ retainAll(Iterable<T> collection)4513 public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) { 4514 checkFrozen(); 4515 // TODO optimize 4516 UnicodeSet toRetain = new UnicodeSet(); 4517 toRetain.addAll(collection); 4518 retainAll(toRetain); 4519 return this; 4520 } 4521 4522 /** 4523 * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}. 4524 * @stable ICU 4.4 4525 */ 4526 public enum ComparisonStyle { 4527 /** 4528 * @stable ICU 4.4 4529 */ 4530 SHORTER_FIRST, 4531 /** 4532 * @stable ICU 4.4 4533 */ 4534 LEXICOGRAPHIC, 4535 /** 4536 * @stable ICU 4.4 4537 */ 4538 LONGER_FIRST 4539 } 4540 4541 /** 4542 * Compares UnicodeSets, where shorter come first, and otherwise lexigraphically 4543 * (according to the comparison of the first characters that differ). 4544 * @see java.lang.Comparable#compareTo(java.lang.Object) 4545 * @stable ICU 4.4 4546 */ 4547 @Override compareTo(UnicodeSet o)4548 public int compareTo(UnicodeSet o) { 4549 return compareTo(o, ComparisonStyle.SHORTER_FIRST); 4550 } 4551 /** 4552 * Compares UnicodeSets, in three different ways. 4553 * @see java.lang.Comparable#compareTo(java.lang.Object) 4554 * @stable ICU 4.4 4555 */ compareTo(UnicodeSet o, ComparisonStyle style)4556 public int compareTo(UnicodeSet o, ComparisonStyle style) { 4557 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4558 int diff = size() - o.size(); 4559 if (diff != 0) { 4560 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1; 4561 } 4562 } 4563 int result; 4564 for (int i = 0; ; ++i) { 4565 if (0 != (result = list[i] - o.list[i])) { 4566 // if either list ran out, compare to the last string 4567 if (list[i] == HIGH) { 4568 if (!hasStrings()) return 1; 4569 String item = strings.first(); 4570 return compare(item, o.list[i]); 4571 } 4572 if (o.list[i] == HIGH) { 4573 if (!o.hasStrings()) return -1; 4574 String item = o.strings.first(); 4575 int compareResult = compare(item, list[i]); 4576 return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order. 4577 } 4578 // otherwise return the result if even index, or the reversal if not 4579 return (i & 1) == 0 ? result : -result; 4580 } 4581 if (list[i] == HIGH) { 4582 break; 4583 } 4584 } 4585 return compare(strings, o.strings); 4586 } 4587 4588 /** 4589 * @stable ICU 4.4 4590 */ compareTo(Iterable<String> other)4591 public int compareTo(Iterable<String> other) { 4592 return compare(this, other); 4593 } 4594 4595 /** 4596 * Utility to compare a string to a code point. 4597 * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString()) 4598 * and comparing, but much faster (no object creation). 4599 * Actually, there is one difference; a null compares as less. 4600 * Note that this (=String) order is UTF-16 order -- *not* code point order. 4601 * @stable ICU 4.4 4602 */ 4603 compare(CharSequence string, int codePoint)4604 public static int compare(CharSequence string, int codePoint) { 4605 return CharSequences.compare(string, codePoint); 4606 } 4607 4608 /** 4609 * Utility to compare a string to a code point. 4610 * Same results as turning the code point into a string and comparing, but much faster (no object creation). 4611 * Actually, there is one difference; a null compares as less. 4612 * Note that this (=String) order is UTF-16 order -- *not* code point order. 4613 * @stable ICU 4.4 4614 */ compare(int codePoint, CharSequence string)4615 public static int compare(int codePoint, CharSequence string) { 4616 return -CharSequences.compare(string, codePoint); 4617 } 4618 4619 4620 /** 4621 * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered, 4622 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4623 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4624 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4625 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4626 * @stable ICU 4.4 4627 */ compare(Iterable<T> collection1, Iterable<T> collection2)4628 public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) { 4629 return compare(collection1.iterator(), collection2.iterator()); 4630 } 4631 4632 /** 4633 * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered, 4634 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4635 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4636 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4637 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4638 * @internal 4639 * @deprecated This API is ICU internal only. 4640 */ 4641 @Deprecated compare(Iterator<T> first, Iterator<T> other)4642 public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) { 4643 while (true) { 4644 if (!first.hasNext()) { 4645 return other.hasNext() ? -1 : 0; 4646 } else if (!other.hasNext()) { 4647 return 1; 4648 } 4649 T item1 = first.next(); 4650 T item2 = other.next(); 4651 int result = item1.compareTo(item2); 4652 if (result != 0) { 4653 return result; 4654 } 4655 } 4656 } 4657 4658 4659 /** 4660 * Utility to compare two collections, optionally by size, and then lexicographically. 4661 * @stable ICU 4.4 4662 */ compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style)4663 public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) { 4664 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4665 int diff = collection1.size() - collection2.size(); 4666 if (diff != 0) { 4667 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1; 4668 } 4669 } 4670 return compare(collection1, collection2); 4671 } 4672 4673 /** 4674 * Utility for adding the contents of an iterable to a collection. 4675 * @stable ICU 4.4 4676 */ addAllTo(Iterable<T> source, U target)4677 public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) { 4678 for (T item : source) { 4679 target.add(item); 4680 } 4681 return target; 4682 } 4683 4684 /** 4685 * Utility for adding the contents of an iterable to a collection. 4686 * @stable ICU 4.4 4687 */ addAllTo(Iterable<T> source, T[] target)4688 public static <T> T[] addAllTo(Iterable<T> source, T[] target) { 4689 int i = 0; 4690 for (T item : source) { 4691 target[i++] = item; 4692 } 4693 return target; 4694 } 4695 4696 /** 4697 * For iterating through the strings in the set. Example: 4698 * <pre> 4699 * for (String key : myUnicodeSet.strings()) { 4700 * doSomethingWith(key); 4701 * } 4702 * </pre> 4703 * @stable ICU 4.4 4704 */ strings()4705 public Collection<String> strings() { 4706 if (hasStrings()) { 4707 return Collections.unmodifiableSortedSet(strings); 4708 } else { 4709 return EMPTY_STRINGS; 4710 } 4711 } 4712 4713 /** 4714 * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE. 4715 * @internal 4716 * @deprecated This API is ICU internal only. 4717 */ 4718 @Deprecated getSingleCodePoint(CharSequence s)4719 public static int getSingleCodePoint(CharSequence s) { 4720 return CharSequences.getSingleCodePoint(s); 4721 } 4722 4723 /** 4724 * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set. 4725 * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E 4726 * if the dontCare set includes unassigned characters (for a particular version of Unicode). 4727 * @param dontCare Set with the don't-care characters for spanning 4728 * @return the input set, modified 4729 * @internal 4730 * @deprecated This API is ICU internal only. 4731 */ 4732 @Deprecated addBridges(UnicodeSet dontCare)4733 public UnicodeSet addBridges(UnicodeSet dontCare) { 4734 UnicodeSet notInInput = new UnicodeSet(this).complement(); 4735 for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) { 4736 if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) { 4737 add(it.codepoint,it.codepointEnd); 4738 } 4739 } 4740 return this; 4741 } 4742 4743 /** 4744 * Find the first index at or after fromIndex where the UnicodeSet matches at that index. 4745 * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match. 4746 * If there is no match, length is returned. 4747 * @internal 4748 * @deprecated This API is ICU internal only. Use span instead. 4749 */ 4750 @Deprecated findIn(CharSequence value, int fromIndex, boolean findNot)4751 public int findIn(CharSequence value, int fromIndex, boolean findNot) { 4752 //TODO add strings, optimize, using ICU4C algorithms 4753 int cp; 4754 for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) { 4755 cp = UTF16.charAt(value, fromIndex); 4756 if (contains(cp) != findNot) { 4757 break; 4758 } 4759 } 4760 return fromIndex; 4761 } 4762 4763 /** 4764 * Find the last index before fromIndex where the UnicodeSet matches at that index. 4765 * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match. 4766 * If there is no match, -1 is returned. 4767 * BEFORE index is not in the UnicodeSet. 4768 * @internal 4769 * @deprecated This API is ICU internal only. Use spanBack instead. 4770 */ 4771 @Deprecated findLastIn(CharSequence value, int fromIndex, boolean findNot)4772 public int findLastIn(CharSequence value, int fromIndex, boolean findNot) { 4773 //TODO add strings, optimize, using ICU4C algorithms 4774 int cp; 4775 fromIndex -= 1; 4776 for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) { 4777 cp = UTF16.charAt(value, fromIndex); 4778 if (contains(cp) != findNot) { 4779 break; 4780 } 4781 } 4782 return fromIndex < 0 ? -1 : fromIndex; 4783 } 4784 4785 /** 4786 * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match. 4787 * @param source The source of the CharSequence to strip from. 4788 * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object. 4789 * @return The string after it has been stripped. 4790 * @internal 4791 * @deprecated This API is ICU internal only. Use replaceFrom. 4792 */ 4793 @Deprecated stripFrom(CharSequence source, boolean matches)4794 public String stripFrom(CharSequence source, boolean matches) { 4795 StringBuilder result = new StringBuilder(); 4796 for (int pos = 0; pos < source.length();) { 4797 int inside = findIn(source, pos, !matches); 4798 result.append(source.subSequence(pos, inside)); 4799 pos = findIn(source, inside, matches); // get next start 4800 } 4801 return result.toString(); 4802 } 4803 4804 /** 4805 * Argument values for whether span() and similar functions continue while the current character is contained vs. 4806 * not contained in the set. 4807 * <p> 4808 * The functionality is straightforward for sets with only single code points, without strings (which is the common 4809 * case): 4810 * <ul> 4811 * <li>CONTAINED and SIMPLE work the same. 4812 * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED. 4813 * <li>span() and spanBack() partition any string the 4814 * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition). 4815 * <li>Using a 4816 * complemented (inverted) set and the opposite span conditions yields the same results. 4817 * </ul> 4818 * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in 4819 * the set (for example, whether they overlap with each other) and the string that is processed. For a set with 4820 * strings: 4821 * <ul> 4822 * <li>The complement of the set contains the opposite set of code points, but the same set of strings. 4823 * Therefore, complementing both the set and the span conditions may yield different results. 4824 * <li>When starting spans 4825 * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 4826 * because a set string may start before the later position. 4827 * <li>span(SIMPLE) may be shorter than 4828 * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which 4829 * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax", 4830 * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED). 4831 * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, 4832 * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield 4833 * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }. 4834 * </ul> 4835 * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then 4836 * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could 4837 * be used. 4838 * <p> 4839 * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point 4840 * boundaries, never in the middle of a surrogate pair. 4841 * 4842 * @stable ICU 4.4 4843 */ 4844 public enum SpanCondition { 4845 /** 4846 * Continues a span() while there is no set element at the current position. 4847 * Increments by one code point at a time. 4848 * Stops before the first set element (character or string). 4849 * (For code points only, this is like while contains(current)==false). 4850 * <p> 4851 * When span() returns, the substring between where it started and the position it returned consists only of 4852 * characters that are not in the set, and none of its strings overlap with the span. 4853 * 4854 * @stable ICU 4.4 4855 */ 4856 NOT_CONTAINED, 4857 4858 /** 4859 * Spans the longest substring that is a concatenation of set elements (characters or strings). 4860 * (For characters only, this is like while contains(current)==true). 4861 * <p> 4862 * When span() returns, the substring between where it started and the position it returned consists only of set 4863 * elements (characters or strings) that are in the set. 4864 * <p> 4865 * If a set contains strings, then the span will be the longest substring for which there 4866 * exists at least one non-overlapping concatenation of set elements (characters or strings). 4867 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 4868 * (Java/ICU/Perl regex stops at the first match of an OR.) 4869 * 4870 * @stable ICU 4.4 4871 */ 4872 CONTAINED, 4873 4874 /** 4875 * Continues a span() while there is a set element at the current position. 4876 * Increments by the longest matching element at each position. 4877 * (For characters only, this is like while contains(current)==true). 4878 * <p> 4879 * When span() returns, the substring between where it started and the position it returned consists only of set 4880 * elements (characters or strings) that are in the set. 4881 * <p> 4882 * If a set only contains single characters, then this is the same as CONTAINED. 4883 * <p> 4884 * If a set contains strings, then the span will be the longest substring with a match at each position with the 4885 * longest single set element (character or string). 4886 * <p> 4887 * Use this span condition together with other longest-match algorithms, such as ICU converters 4888 * (ucnv_getUnicodeSet()). 4889 * 4890 * @stable ICU 4.4 4891 */ 4892 SIMPLE, 4893 4894 /** 4895 * One more than the last span condition. 4896 * 4897 * @stable ICU 4.4 4898 */ 4899 CONDITION_COUNT 4900 } 4901 4902 /** 4903 * Get the default symbol table. Null means ordinary processing. For internal use only. 4904 * @return the symbol table 4905 * @internal 4906 * @deprecated This API is ICU internal only. 4907 */ 4908 @Deprecated getDefaultXSymbolTable()4909 public static XSymbolTable getDefaultXSymbolTable() { 4910 return XSYMBOL_TABLE; 4911 } 4912 4913 /** 4914 * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing 4915 * of UnicodeSets. 4916 * <p> 4917 * WARNING: If this function is used with a UnicodeProperty, and the 4918 * Unassigned characters (gc=Cn) are different than in ICU, you MUST call 4919 * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable} 4920 * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}. 4921 * 4922 * @param xSymbolTable the new default symbol table. 4923 * @internal 4924 * @deprecated This API is ICU internal only. 4925 */ 4926 @Deprecated setDefaultXSymbolTable(XSymbolTable xSymbolTable)4927 public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) { 4928 // If the properties override inclusions, these have to be regenerated. 4929 // TODO: Check if the Unicode Tools or Unicode Utilities really need this. 4930 CharacterPropertiesImpl.clear(); 4931 XSYMBOL_TABLE = xSymbolTable; 4932 } 4933 } 4934 //eof 4935