1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.text; 11 12 import java.io.IOException; 13 import java.text.ParsePosition; 14 import java.util.ArrayList; 15 import java.util.Arrays; 16 import java.util.Collection; 17 import java.util.Collections; 18 import java.util.Iterator; 19 import java.util.NoSuchElementException; 20 import java.util.SortedSet; 21 import java.util.TreeSet; 22 23 import ohos.global.icu.impl.BMPSet; 24 import ohos.global.icu.impl.CharacterPropertiesImpl; 25 import ohos.global.icu.impl.PatternProps; 26 import ohos.global.icu.impl.RuleCharacterIterator; 27 import ohos.global.icu.impl.SortedSetRelation; 28 import ohos.global.icu.impl.StringRange; 29 import ohos.global.icu.impl.UCaseProps; 30 import ohos.global.icu.impl.UPropertyAliases; 31 import ohos.global.icu.impl.UnicodeSetStringSpan; 32 import ohos.global.icu.impl.Utility; 33 import ohos.global.icu.lang.CharSequences; 34 import ohos.global.icu.lang.CharacterProperties; 35 import ohos.global.icu.lang.UCharacter; 36 import ohos.global.icu.lang.UProperty; 37 import ohos.global.icu.lang.UScript; 38 import ohos.global.icu.util.Freezable; 39 import ohos.global.icu.util.ICUUncheckedIOException; 40 import ohos.global.icu.util.OutputInt; 41 import ohos.global.icu.util.ULocale; 42 import ohos.global.icu.util.VersionInfo; 43 44 /** 45 * A mutable set of Unicode characters and multicharacter strings. 46 * Objects of this class represent <em>character classes</em> used 47 * in regular expressions. A character specifies a subset of Unicode 48 * code points. 
Legal code points are U+0000 to U+10FFFF, inclusive. 49 * 50 * Note: method freeze() will not only make the set immutable, but 51 * also makes important methods much higher performance: 52 * contains(c), containsNone(...), span(...), spanBack(...) etc. 53 * After the object is frozen, any subsequent call that wants to change 54 * the object will throw UnsupportedOperationException. 55 * 56 * <p>The UnicodeSet class is not designed to be subclassed. 57 * 58 * <p><code>UnicodeSet</code> supports two APIs. The first is the 59 * <em>operand</em> API that allows the caller to modify the value of 60 * a <code>UnicodeSet</code> object. It conforms to Java 2's 61 * <code>java.util.Set</code> interface, although 62 * <code>UnicodeSet</code> does not actually implement that 63 * interface. All methods of <code>Set</code> are supported, with the 64 * modification that they take a character range or single character 65 * instead of an <code>Object</code>, and they take a 66 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 67 * operand API may be thought of in terms of boolean logic: a boolean 68 * OR is implemented by <code>add</code>, a boolean AND is implemented 69 * by <code>retain</code>, a boolean XOR is implemented by 70 * <code>complement</code> taking an argument, and a boolean NOT is 71 * implemented by <code>complement</code> with no argument. In terms 72 * of traditional set theory function names, <code>add</code> is a 73 * union, <code>retain</code> is an intersection, <code>remove</code> 74 * is an asymmetric difference, and <code>complement</code> with no 75 * argument is a set complement with respect to the superset range 76 * <code>MIN_VALUE-MAX_VALUE</code> 77 * 78 * <p>The second API is the 79 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 80 * <code>java.text.Format</code>-derived classes. 
Unlike the 81 * methods that add characters, add categories, and control the logic 82 * of the set, the method <code>applyPattern()</code> sets all 83 * attributes of a <code>UnicodeSet</code> at once, based on a 84 * string pattern. 85 * 86 * <p><b>Pattern syntax</b></p> 87 * 88 * Patterns are accepted by the constructors and the 89 * <code>applyPattern()</code> methods and returned by the 90 * <code>toPattern()</code> method. These patterns follow a syntax 91 * similar to that employed by version 8 regular expression character 92 * classes. Here are some simple examples: 93 * 94 * <blockquote> 95 * <table> 96 * <tr style="vertical-align: top"> 97 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td> 98 * <td style="vertical-align: top;">No characters</td> 99 * </tr><tr style="vertical-align: top"> 100 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td> 101 * <td style="vertical-align: top;">The character 'a'</td> 102 * </tr><tr style="vertical-align: top"> 103 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td> 104 * <td style="vertical-align: top;">The characters 'a' and 'e'</td> 105 * </tr> 106 * <tr> 107 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td> 108 * <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code 109 * point order</td> 110 * </tr> 111 * <tr> 112 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td> 113 * <td style="vertical-align: top;">The character U+4E01</td> 114 * </tr> 115 * <tr> 116 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td> 117 * <td style="vertical-align: top;">The character 'a' and the multicharacter strings "ab" and 118 * "ac"</td> 119 * </tr> 120 * <tr> 121 * <td 
style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td> 122 * <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td> 123 * </tr> 124 * </table> 125 * </blockquote> 126 * 127 * Any character may be preceded by a backslash in order to remove any special 128 * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are 129 * ignored, unless they are escaped. 130 * 131 * <p>Property patterns specify a set of characters having a certain 132 * property as defined by the Unicode standard. Both the POSIX-like 133 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a 134 * complete list of supported property patterns, see the User's Guide 135 * for UnicodeSet at 136 * <a href="http://www.icu-project.org/userguide/unicodeSet.html"> 137 * http://www.icu-project.org/userguide/unicodeSet.html</a>. 138 * Actual determination of property data is defined by the underlying 139 * Unicode database as implemented by UCharacter. 140 * 141 * <p>Patterns specify individual characters, ranges of characters, and 142 * Unicode property sets. When elements are concatenated, they 143 * specify their union. To complement a set, place a '^' immediately 144 * after the opening '['. Property patterns are inverted by modifying 145 * their delimiters; "[:^foo:]" and "\P{foo}". In any other location, 146 * '^' has no special meaning. 147 * 148 * <p>Ranges are indicated by placing a '-' between two 149 * characters, as in "a-z". This specifies the range of all 150 * characters from the left to the right, in Unicode order. If the 151 * left character is greater than or equal to the 152 * right character it is a syntax error. If a '-' occurs as the first 153 * character after the opening '[' or '[^', or if it occurs as the 154 * last character before the closing ']', then it is taken as a 155 * literal. 
Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same 156 * set of three characters, 'a', 'b', and '-'. 157 * 158 * <p>Sets may be intersected using the '&' operator or the asymmetric 159 * set difference may be taken using the '-' operator, for example, 160 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters 161 * with values less than 4096. Operators ('&' and '-') have equal 162 * precedence and bind left-to-right. Thus 163 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to 164 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for 165 * difference; intersection is commutative. 166 * 167 * <table> 168 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a' 169 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a' 170 * through 'z' and all letters in between, in Unicode order 171 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing 172 * all characters but 'a' through 'z', 173 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 174 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 175 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 176 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 177 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 178 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 179 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 180 * <em>pat2</em> 181 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code> 182 * <td>The set of characters having the specified 183 * Unicode property; in 184 * this case, Unicode uppercase letters 185 * <tr 
style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code> 186 * <td>The set of characters <em>not</em> having the given 187 * Unicode property 188 * </table> 189 * 190 * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p> 191 * 192 * <p><b>Formal syntax</b></p> 193 * 194 * <blockquote> 195 * <table> 196 * <tr style="vertical-align: top"> 197 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern := </code></td> 198 * <td style="vertical-align: top;"><code>('[' '^'? item* ']') | 199 * property</code></td> 200 * </tr> 201 * <tr style="vertical-align: top"> 202 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item := </code></td> 203 * <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br> 204 * </code></td> 205 * </tr> 206 * <tr style="vertical-align: top"> 207 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr := </code></td> 208 * <td style="vertical-align: top;"><code>pattern | pattern-expr pattern | 209 * pattern-expr op pattern<br> 210 * </code></td> 211 * </tr> 212 * <tr style="vertical-align: top"> 213 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op := </code></td> 214 * <td style="vertical-align: top;"><code>'&' | '-'<br> 215 * </code></td> 216 * </tr> 217 * <tr style="vertical-align: top"> 218 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special := </code></td> 219 * <td style="vertical-align: top;"><code>'[' | ']' | '-'<br> 220 * </code></td> 221 * </tr> 222 * <tr style="vertical-align: top"> 223 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char := </code></td> 224 * <td style="vertical-align: top;"><em>any character that is not</em><code> special<br> 225 * | ('\\' </code><em>any character</em><code>)<br> 226 * | ('\u' hex hex hex hex)<br> 227 * </code></td> 228 * </tr> 229 * <tr 
style="vertical-align: top"> 230 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex := </code></td> 231 * <td style="vertical-align: top;"><em>any character for which 232 * </em><code>Character.digit(c, 16)</code><em> 233 * returns a non-negative result</em></td> 234 * </tr> 235 * <tr> 236 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property := </code></td> 237 * <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td> 238 * </tr> 239 * </table> 240 * <br> 241 * <table border="1"> 242 * <tr> 243 * <td>Legend: <table> 244 * <tr> 245 * <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td> 246 * <td style="width: 20; vertical-align: top;"> </td> 247 * <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td> 248 * </tr> 249 * <tr> 250 * <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td> 251 * <td style="vertical-align: top;"></td> 252 * <td style="vertical-align: top;">zero or one instance of <code>a</code><br> 253 * </td> 254 * </tr> 255 * <tr> 256 * <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td> 257 * <td style="vertical-align: top;"></td> 258 * <td style="vertical-align: top;">zero or more instances of <code>a</code><br> 259 * </td> 260 * </tr> 261 * <tr> 262 * <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td> 263 * <td style="vertical-align: top;"></td> 264 * <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br> 265 * </td> 266 * </tr> 267 * <tr> 268 * <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td> 269 * <td style="vertical-align: top;"></td> 270 * <td style="vertical-align: top;">the literal string between the quotes </td> 271 * </tr> 272 * </table> 273 * </td> 274 * </tr> 275 * </table> 276 * </blockquote> 277 * <p>To iterate over contents of UnicodeSet, the following are available: 278 * 
<ul><li>{@link #ranges()} to iterate through the ranges</li>
 * <li>{@link #strings()} to iterate through the strings</li>
 * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
 * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
 * </ul>
 * All of the above can be used in <b>for</b> loops.
 * The {@link ohos.global.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
 * <p>To replace, count elements, or delete spans, see {@link ohos.global.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
 *
 * @author Alan Liu
 * @see UnicodeSetIterator
 * @see UnicodeSetSpanner
 */
public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> {
    // Shared, unmodifiable empty set; the initial value of the strings field below.
    private static final SortedSet<String> EMPTY_STRINGS =
            Collections.unmodifiableSortedSet(new TreeSet<String>());

    /**
     * Constant for the empty set.
     */
    public static final UnicodeSet EMPTY = new UnicodeSet().freeze();
    /**
     * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.)
     */
    public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze();

    private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the function processing

    private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
    private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
    // 110000 for codepoints

    /**
     * Enough for sets with few ranges.
     * For example, White_Space has 10 ranges, list length 21.
     */
    private static final int INITIAL_CAPACITY = 25;

    /** Max list [0, 1, 2, ..., max code point, HIGH] */
    private static final int MAX_LENGTH = HIGH + 1;

    /**
     * Minimum value that can be stored in a UnicodeSet.
     */
    public static final int MIN_VALUE = LOW;

    /**
     * Maximum value that can be stored in a UnicodeSet.
     */
    public static final int MAX_VALUE = HIGH - 1;

    // Number of entries of list that are in use, including the HIGH terminator
    // (an empty set has len == 1); list may be longer to minimize reallocs.
    private int len;
    // Range boundaries (each range stored as start, end+1); MUST be terminated with HIGH.
    private int[] list;
    private int[] rangeList; // internal buffer
    private int[] buffer; // internal buffer

    // is not private so that UnicodeSetIterator can get access
    SortedSet<String> strings = EMPTY_STRINGS;

    /**
     * The pattern representation of this set.  This may not be the
     * most economical pattern.  It is the pattern supplied to
     * applyPattern(), with variables substituted and whitespace
     * removed.  For sets constructed without applyPattern(), or
     * modified using the non-pattern API, this string will be null,
     * indicating that toPattern() must generate a pattern
     * representation from the inversion list.
     */
    private String pat = null;

    // Special property set IDs
    private static final String ANY_ID   = "ANY";   // [\u0000-\U0010FFFF]
    private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
    private static final String ASSIGNED = "Assigned"; // [:^Cn:]

    // Both are null until the set is frozen:
    // the set is frozen if bmpSet or stringSpan is not null.
    private volatile BMPSet bmpSet;
    private volatile UnicodeSetStringSpan stringSpan;
    //----------------------------------------------------------------
    // Public API
    //----------------------------------------------------------------

    /**
     * Constructs an empty set.
361 */ UnicodeSet()362 public UnicodeSet() { 363 list = new int[INITIAL_CAPACITY]; 364 list[0] = HIGH; 365 len = 1; 366 } 367 368 /** 369 * Constructs a copy of an existing set. 370 */ UnicodeSet(UnicodeSet other)371 public UnicodeSet(UnicodeSet other) { 372 set(other); 373 } 374 375 /** 376 * Constructs a set containing the given range. If <code>end > 377 * start</code> then an empty set is created. 378 * 379 * @param start first character, inclusive, of range 380 * @param end last character, inclusive, of range 381 */ UnicodeSet(int start, int end)382 public UnicodeSet(int start, int end) { 383 this(); 384 add(start, end); 385 } 386 387 /** 388 * Quickly constructs a set from a set of ranges <s0, e0, s1, e1, s2, e2, ..., sn, en>. 389 * There must be an even number of integers, and they must be all greater than zero, 390 * all less than or equal to Character.MAX_CODE_POINT. 391 * In each pair (..., si, ei, ...) it must be true that si <= ei 392 * Between adjacent pairs (...ei, sj...), it must be true that ei+1 < sj 393 * @param pairs pairs of character representing ranges 394 */ UnicodeSet(int... pairs)395 public UnicodeSet(int... pairs) { 396 if ((pairs.length & 1) != 0) { 397 throw new IllegalArgumentException("Must have even number of integers"); 398 } 399 list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set. 400 len = list.length; 401 int last = -1; // used to ensure that the results are monotonically increasing. 402 int i = 0; 403 while (i < pairs.length) { 404 int start = pairs[i]; 405 if (last >= start) { 406 throw new IllegalArgumentException("Must be monotonically increasing."); 407 } 408 list[i++] = start; 409 int limit = pairs[i] + 1; 410 if (start >= limit) { 411 throw new IllegalArgumentException("Must be monotonically increasing."); 412 } 413 list[i++] = last = limit; 414 } 415 list[i] = HIGH; // terminate 416 } 417 418 /** 419 * Constructs a set from the given pattern. 
See the class description 420 * for the syntax of the pattern language. Whitespace is ignored. 421 * @param pattern a string specifying what characters are in the set 422 * @exception java.lang.IllegalArgumentException if the pattern contains 423 * a syntax error. 424 */ UnicodeSet(String pattern)425 public UnicodeSet(String pattern) { 426 this(); 427 applyPattern(pattern, null, null, IGNORE_SPACE); 428 } 429 430 /** 431 * Constructs a set from the given pattern. See the class description 432 * for the syntax of the pattern language. 433 * @param pattern a string specifying what characters are in the set 434 * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters 435 * @exception java.lang.IllegalArgumentException if the pattern contains 436 * a syntax error. 437 */ UnicodeSet(String pattern, boolean ignoreWhitespace)438 public UnicodeSet(String pattern, boolean ignoreWhitespace) { 439 this(); 440 applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 441 } 442 443 /** 444 * Constructs a set from the given pattern. See the class description 445 * for the syntax of the pattern language. 446 * @param pattern a string specifying what characters are in the set 447 * @param options a bitmask indicating which options to apply. 448 * Valid options are IGNORE_SPACE and CASE. 449 * @exception java.lang.IllegalArgumentException if the pattern contains 450 * a syntax error. 451 */ UnicodeSet(String pattern, int options)452 public UnicodeSet(String pattern, int options) { 453 this(); 454 applyPattern(pattern, null, null, options); 455 } 456 457 /** 458 * Constructs a set from the given pattern. See the class description 459 * for the syntax of the pattern language. 460 * @param pattern a string specifying what characters are in the set 461 * @param pos on input, the position in pattern at which to start parsing. 462 * On output, the position after the last character parsed. 
463 * @param symbols a symbol table mapping variables to char[] arrays 464 * and chars to UnicodeSets 465 * @exception java.lang.IllegalArgumentException if the pattern 466 * contains a syntax error. 467 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols)468 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) { 469 this(); 470 applyPattern(pattern, pos, symbols, IGNORE_SPACE); 471 } 472 473 /** 474 * Constructs a set from the given pattern. See the class description 475 * for the syntax of the pattern language. 476 * @param pattern a string specifying what characters are in the set 477 * @param pos on input, the position in pattern at which to start parsing. 478 * On output, the position after the last character parsed. 479 * @param symbols a symbol table mapping variables to char[] arrays 480 * and chars to UnicodeSets 481 * @param options a bitmask indicating which options to apply. 482 * Valid options are IGNORE_SPACE and CASE. 483 * @exception java.lang.IllegalArgumentException if the pattern 484 * contains a syntax error. 485 */ UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options)486 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) { 487 this(); 488 applyPattern(pattern, pos, symbols, options); 489 } 490 491 492 /** 493 * Return a new set that is equivalent to this one. 494 */ 495 @Override clone()496 public Object clone() { 497 if (isFrozen()) { 498 return this; 499 } 500 return new UnicodeSet(this); 501 } 502 503 /** 504 * Make this object represent the range <code>start - end</code>. 505 * If <code>end > start</code> then this object is set to an empty range. 
506 * 507 * @param start first character in the set, inclusive 508 * @param end last character in the set, inclusive 509 */ set(int start, int end)510 public UnicodeSet set(int start, int end) { 511 checkFrozen(); 512 clear(); 513 complement(start, end); 514 return this; 515 } 516 517 /** 518 * Make this object represent the same set as <code>other</code>. 519 * @param other a <code>UnicodeSet</code> whose value will be 520 * copied to this object 521 */ set(UnicodeSet other)522 public UnicodeSet set(UnicodeSet other) { 523 checkFrozen(); 524 list = Arrays.copyOf(other.list, other.len); 525 len = other.len; 526 pat = other.pat; 527 if (other.hasStrings()) { 528 strings = new TreeSet<>(other.strings); 529 } else { 530 strings = EMPTY_STRINGS; 531 } 532 return this; 533 } 534 535 /** 536 * Modifies this set to represent the set specified by the given pattern. 537 * See the class description for the syntax of the pattern language. 538 * Whitespace is ignored. 539 * @param pattern a string specifying what characters are in the set 540 * @exception java.lang.IllegalArgumentException if the pattern 541 * contains a syntax error. 542 */ applyPattern(String pattern)543 public final UnicodeSet applyPattern(String pattern) { 544 checkFrozen(); 545 return applyPattern(pattern, null, null, IGNORE_SPACE); 546 } 547 548 /** 549 * Modifies this set to represent the set specified by the given pattern, 550 * optionally ignoring whitespace. 551 * See the class description for the syntax of the pattern language. 552 * @param pattern a string specifying what characters are in the set 553 * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored 554 * @exception java.lang.IllegalArgumentException if the pattern 555 * contains a syntax error. 
556 */ applyPattern(String pattern, boolean ignoreWhitespace)557 public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) { 558 checkFrozen(); 559 return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 560 } 561 562 /** 563 * Modifies this set to represent the set specified by the given pattern, 564 * optionally ignoring whitespace. 565 * See the class description for the syntax of the pattern language. 566 * @param pattern a string specifying what characters are in the set 567 * @param options a bitmask indicating which options to apply. 568 * Valid options are IGNORE_SPACE and CASE. 569 * @exception java.lang.IllegalArgumentException if the pattern 570 * contains a syntax error. 571 */ applyPattern(String pattern, int options)572 public UnicodeSet applyPattern(String pattern, int options) { 573 checkFrozen(); 574 return applyPattern(pattern, null, null, options); 575 } 576 577 /** 578 * Return true if the given position, in the given pattern, appears 579 * to be the start of a UnicodeSet pattern. 580 * @hide unsupported on OHOS 581 */ resemblesPattern(String pattern, int pos)582 public static boolean resemblesPattern(String pattern, int pos) { 583 return ((pos+1) < pattern.length() && 584 pattern.charAt(pos) == '[') || 585 resemblesPropertyPattern(pattern, pos); 586 } 587 588 /** 589 * TODO: create Appendable version of UTF16.append(buf, c), 590 * maybe in new class Appendables? 591 * @throws IOException 592 */ appendCodePoint(Appendable app, int c)593 private static void appendCodePoint(Appendable app, int c) { 594 assert 0 <= c && c <= 0x10ffff; 595 try { 596 if (c <= 0xffff) { 597 app.append((char) c); 598 } else { 599 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c)); 600 } 601 } catch (IOException e) { 602 throw new ICUUncheckedIOException(e); 603 } 604 } 605 606 /** 607 * TODO: create class Appendables? 
608 * @throws IOException 609 */ append(Appendable app, CharSequence s)610 private static void append(Appendable app, CharSequence s) { 611 try { 612 app.append(s); 613 } catch (IOException e) { 614 throw new ICUUncheckedIOException(e); 615 } 616 } 617 618 /** 619 * Append the <code>toPattern()</code> representation of a 620 * string to the given <code>Appendable</code>. 621 */ _appendToPat(T buf, String s, boolean escapeUnprintable)622 private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) { 623 int cp; 624 for (int i = 0; i < s.length(); i += Character.charCount(cp)) { 625 cp = s.codePointAt(i); 626 _appendToPat(buf, cp, escapeUnprintable); 627 } 628 return buf; 629 } 630 631 /** 632 * Append the <code>toPattern()</code> representation of a 633 * character to the given <code>Appendable</code>. 634 */ _appendToPat(T buf, int c, boolean escapeUnprintable)635 private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) { 636 try { 637 if (escapeUnprintable && Utility.isUnprintable(c)) { 638 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything 639 // unprintable 640 if (Utility.escapeUnprintable(buf, c)) { 641 return buf; 642 } 643 } 644 // Okay to let ':' pass through 645 switch (c) { 646 case '[': // SET_OPEN: 647 case ']': // SET_CLOSE: 648 case '-': // HYPHEN: 649 case '^': // COMPLEMENT: 650 case '&': // INTERSECTION: 651 case '\\': //BACKSLASH: 652 case '{': 653 case '}': 654 case '$': 655 case ':': 656 buf.append('\\'); 657 break; 658 default: 659 // Escape whitespace 660 if (PatternProps.isWhiteSpace(c)) { 661 buf.append('\\'); 662 } 663 break; 664 } 665 appendCodePoint(buf, c); 666 return buf; 667 } catch (IOException e) { 668 throw new ICUUncheckedIOException(e); 669 } 670 } 671 672 /** 673 * Returns a string representation of this set. 
If the result of 674 * calling this function is passed to a UnicodeSet constructor, it 675 * will produce another set that is equal to this one. 676 */ 677 @Override toPattern(boolean escapeUnprintable)678 public String toPattern(boolean escapeUnprintable) { 679 if (pat != null && !escapeUnprintable) { 680 return pat; 681 } 682 StringBuilder result = new StringBuilder(); 683 return _toPattern(result, escapeUnprintable).toString(); 684 } 685 686 /** 687 * Append a string representation of this set to result. This will be 688 * a cleaned version of the string passed to applyPattern(), if there 689 * is one. Otherwise it will be generated. 690 */ _toPattern(T result, boolean escapeUnprintable)691 private <T extends Appendable> T _toPattern(T result, 692 boolean escapeUnprintable) { 693 if (pat == null) { 694 return appendNewPattern(result, escapeUnprintable, true); 695 } 696 try { 697 if (!escapeUnprintable) { 698 result.append(pat); 699 return result; 700 } 701 boolean oddNumberOfBackslashes = false; 702 for (int i=0; i<pat.length(); ) { 703 int c = pat.codePointAt(i); 704 i += Character.charCount(c); 705 if (Utility.isUnprintable(c)) { 706 // If the unprintable character is preceded by an odd 707 // number of backslashes, then it has been escaped 708 // and we omit the last backslash. 709 Utility.escapeUnprintable(result, c); 710 oddNumberOfBackslashes = false; 711 } else if (!oddNumberOfBackslashes && c == '\\') { 712 // Temporarily withhold an odd-numbered backslash. 713 oddNumberOfBackslashes = true; 714 } else { 715 if (oddNumberOfBackslashes) { 716 result.append('\\'); 717 } 718 appendCodePoint(result, c); 719 oddNumberOfBackslashes = false; 720 } 721 } 722 if (oddNumberOfBackslashes) { 723 result.append('\\'); 724 } 725 return result; 726 } catch (IOException e) { 727 throw new ICUUncheckedIOException(e); 728 } 729 } 730 731 /** 732 * Generate and append a string representation of this set to result. 
733 * This does not use this.pat, the cleaned up copy of the string 734 * passed to applyPattern(). 735 * @param result the buffer into which to generate the pattern 736 * @param escapeUnprintable escape unprintable characters if true 737 */ _generatePattern(StringBuffer result, boolean escapeUnprintable)738 public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) { 739 return _generatePattern(result, escapeUnprintable, true); 740 } 741 742 /** 743 * Generate and append a string representation of this set to result. 744 * This does not use this.pat, the cleaned up copy of the string 745 * passed to applyPattern(). 746 * @param includeStrings if false, doesn't include the strings. 747 */ _generatePattern(StringBuffer result, boolean escapeUnprintable, boolean includeStrings)748 public StringBuffer _generatePattern(StringBuffer result, 749 boolean escapeUnprintable, boolean includeStrings) { 750 return appendNewPattern(result, escapeUnprintable, includeStrings); 751 } 752 appendNewPattern( T result, boolean escapeUnprintable, boolean includeStrings)753 private <T extends Appendable> T appendNewPattern( 754 T result, boolean escapeUnprintable, boolean includeStrings) { 755 try { 756 result.append('['); 757 758 int count = getRangeCount(); 759 760 // If the set contains at least 2 intervals and includes both 761 // MIN_VALUE and MAX_VALUE, then the inverse representation will 762 // be more economical. 
763 if (count > 1 && 764 getRangeStart(0) == MIN_VALUE && 765 getRangeEnd(count-1) == MAX_VALUE) { 766 767 // Emit the inverse 768 result.append('^'); 769 770 for (int i = 1; i < count; ++i) { 771 int start = getRangeEnd(i-1)+1; 772 int end = getRangeStart(i)-1; 773 _appendToPat(result, start, escapeUnprintable); 774 if (start != end) { 775 if ((start+1) != end) { 776 result.append('-'); 777 } 778 _appendToPat(result, end, escapeUnprintable); 779 } 780 } 781 } 782 783 // Default; emit the ranges as pairs 784 else { 785 for (int i = 0; i < count; ++i) { 786 int start = getRangeStart(i); 787 int end = getRangeEnd(i); 788 _appendToPat(result, start, escapeUnprintable); 789 if (start != end) { 790 if ((start+1) != end) { 791 result.append('-'); 792 } 793 _appendToPat(result, end, escapeUnprintable); 794 } 795 } 796 } 797 798 if (includeStrings && hasStrings()) { 799 for (String s : strings) { 800 result.append('{'); 801 _appendToPat(result, s, escapeUnprintable); 802 result.append('}'); 803 } 804 } 805 result.append(']'); 806 return result; 807 } catch (IOException e) { 808 throw new ICUUncheckedIOException(e); 809 } 810 } 811 hasStrings()812 boolean hasStrings() { 813 return !strings.isEmpty(); 814 } 815 816 /** 817 * Returns the number of elements in this set (its cardinality) 818 * Note than the elements of a set may include both individual 819 * codepoints and strings. 820 * 821 * @return the number of elements in this set (its cardinality). 822 */ size()823 public int size() { 824 int n = 0; 825 int count = getRangeCount(); 826 for (int i = 0; i < count; ++i) { 827 n += getRangeEnd(i) - getRangeStart(i) + 1; 828 } 829 return n + strings.size(); 830 } 831 832 /** 833 * Returns <tt>true</tt> if this set contains no elements. 834 * 835 * @return <tt>true</tt> if this set contains no elements. 836 */ isEmpty()837 public boolean isEmpty() { 838 return len == 1 && !hasStrings(); 839 } 840 841 /** 842 * Implementation of UnicodeMatcher API. 
Returns <tt>true</tt> if 843 * this set contains any character whose low byte is the given 844 * value. This is used by <tt>RuleBasedTransliterator</tt> for 845 * indexing. 846 */ 847 @Override matchesIndexValue(int v)848 public boolean matchesIndexValue(int v) { 849 /* The index value v, in the range [0,255], is contained in this set if 850 * it is contained in any pair of this set. Pairs either have the high 851 * bytes equal, or unequal. If the high bytes are equal, then we have 852 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 853 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 854 * Then v is contained if xx <= v || v <= yy. (This is identical to the 855 * time zone month containment logic.) 856 */ 857 for (int i=0; i<getRangeCount(); ++i) { 858 int low = getRangeStart(i); 859 int high = getRangeEnd(i); 860 if ((low & ~0xFF) == (high & ~0xFF)) { 861 if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 862 return true; 863 } 864 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 865 return true; 866 } 867 } 868 if (hasStrings()) { 869 for (String s : strings) { 870 //if (s.length() == 0) { 871 // // Empty strings match everything 872 // return true; 873 //} 874 // assert(s.length() != 0); // We enforce this elsewhere 875 int c = UTF16.charAt(s, 0); 876 if ((c & 0xFF) == v) { 877 return true; 878 } 879 } 880 } 881 return false; 882 } 883 884 /** 885 * Implementation of UnicodeMatcher.matches(). Always matches the 886 * longest possible multichar string. 887 */ 888 @Override matches(Replaceable text, int[] offset, int limit, boolean incremental)889 public int matches(Replaceable text, 890 int[] offset, 891 int limit, 892 boolean incremental) { 893 894 if (offset[0] == limit) { 895 // Strings, if any, have length != 0, so we don't worry 896 // about them here. If we ever allow zero-length strings 897 // we much check for them here. 898 if (contains(UnicodeMatcher.ETHER)) { 899 return incremental ? 
U_PARTIAL_MATCH : U_MATCH; 900 } else { 901 return U_MISMATCH; 902 } 903 } else { 904 if (hasStrings()) { // try strings first 905 906 // might separate forward and backward loops later 907 // for now they are combined 908 909 // TODO Improve efficiency of this, at least in the forward 910 // direction, if not in both. In the forward direction we 911 // can assume the strings are sorted. 912 913 boolean forward = offset[0] < limit; 914 915 // firstChar is the leftmost char to match in the 916 // forward direction or the rightmost char to match in 917 // the reverse direction. 918 char firstChar = text.charAt(offset[0]); 919 920 // If there are multiple strings that can match we 921 // return the longest match. 922 int highWaterLength = 0; 923 924 for (String trial : strings) { 925 //if (trial.length() == 0) { 926 // return U_MATCH; // null-string always matches 927 //} 928 // assert(trial.length() != 0); // We ensure this elsewhere 929 930 char c = trial.charAt(forward ? 0 : trial.length() - 1); 931 932 // Strings are sorted, so we can optimize in the 933 // forward direction. 934 if (forward && c > firstChar) break; 935 if (c != firstChar) continue; 936 937 int length = matchRest(text, offset[0], limit, trial); 938 939 if (incremental) { 940 int maxLen = forward ? limit-offset[0] : offset[0]-limit; 941 if (length == maxLen) { 942 // We have successfully matched but only up to limit. 943 return U_PARTIAL_MATCH; 944 } 945 } 946 947 if (length == trial.length()) { 948 // We have successfully matched the whole string. 949 if (length > highWaterLength) { 950 highWaterLength = length; 951 } 952 // In the forward direction we know strings 953 // are sorted so we can bail early. 954 if (forward && length < highWaterLength) { 955 break; 956 } 957 continue; 958 } 959 } 960 961 // We've checked all strings without a partial match. 962 // If we have full matches, return the longest one. 963 if (highWaterLength != 0) { 964 offset[0] += forward ? 
highWaterLength : -highWaterLength; 965 return U_MATCH; 966 } 967 } 968 return super.matches(text, offset, limit, incremental); 969 } 970 } 971 972 /** 973 * Returns the longest match for s in text at the given position. 974 * If limit > start then match forward from start+1 to limit 975 * matching all characters except s.charAt(0). If limit < start, 976 * go backward starting from start-1 matching all characters 977 * except s.charAt(s.length()-1). This method assumes that the 978 * first character, text.charAt(start), matches s, so it does not 979 * check it. 980 * @param text the text to match 981 * @param start the first character to match. In the forward 982 * direction, text.charAt(start) is matched against s.charAt(0). 983 * In the reverse direction, it is matched against 984 * s.charAt(s.length()-1). 985 * @param limit the limit offset for matching, either last+1 in 986 * the forward direction, or last-1 in the reverse direction, 987 * where last is the index of the last character to match. 988 * @return If part of s matches up to the limit, return |limit - 989 * start|. If all of s matches before reaching the limit, return 990 * s.length(). If there is a mismatch between s and text, return 991 * 0 992 */ matchRest(Replaceable text, int start, int limit, String s)993 private static int matchRest (Replaceable text, int start, int limit, String s) { 994 int maxLen; 995 int slen = s.length(); 996 if (start < limit) { 997 maxLen = limit - start; 998 if (maxLen > slen) maxLen = slen; 999 for (int i = 1; i < maxLen; ++i) { 1000 if (text.charAt(start + i) != s.charAt(i)) return 0; 1001 } 1002 } else { 1003 maxLen = start - limit; 1004 if (maxLen > slen) maxLen = slen; 1005 --slen; // <=> slen = s.length() - 1; 1006 for (int i = 1; i < maxLen; ++i) { 1007 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 1008 } 1009 } 1010 return maxLen; 1011 } 1012 1013 /** 1014 * Tests whether the text matches at the offset. 
If so, returns the end of the longest substring that it matches. If not, returns -1. 1015 * @deprecated This API is ICU internal only. 1016 * @hide deprecated on icu4j-org 1017 * @hide draft / provisional / internal are hidden on OHOS 1018 */ 1019 @Deprecated matchesAt(CharSequence text, int offset)1020 public int matchesAt(CharSequence text, int offset) { 1021 int lastLen = -1; 1022 strings: 1023 if (hasStrings()) { 1024 char firstChar = text.charAt(offset); 1025 String trial = null; 1026 // find the first string starting with firstChar 1027 Iterator<String> it = strings.iterator(); 1028 while (it.hasNext()) { 1029 trial = it.next(); 1030 char firstStringChar = trial.charAt(0); 1031 if (firstStringChar < firstChar) continue; 1032 if (firstStringChar > firstChar) break strings; 1033 } 1034 1035 // now keep checking string until we get the longest one 1036 for (;;) { 1037 int tempLen = matchesAt(text, offset, trial); 1038 if (lastLen > tempLen) break strings; 1039 lastLen = tempLen; 1040 if (!it.hasNext()) break; 1041 trial = it.next(); 1042 } 1043 } 1044 1045 if (lastLen < 2) { 1046 int cp = UTF16.charAt(text, offset); 1047 if (contains(cp)) lastLen = UTF16.getCharCount(cp); 1048 } 1049 1050 return offset+lastLen; 1051 } 1052 1053 /** 1054 * Does one string contain another, starting at a specific offset? 
1055 * @param text text to match 1056 * @param offsetInText offset within that text 1057 * @param substring substring to match at offset in text 1058 * @return -1 if match fails, otherwise other.length() 1059 */ 1060 // Note: This method was moved from CollectionUtilities matchesAt(CharSequence text, int offsetInText, CharSequence substring)1061 private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) { 1062 int len = substring.length(); 1063 int textLength = text.length(); 1064 if (textLength + offsetInText > len) { 1065 return -1; 1066 } 1067 int i = 0; 1068 for (int j = offsetInText; i < len; ++i, ++j) { 1069 char pc = substring.charAt(i); 1070 char tc = text.charAt(j); 1071 if (pc != tc) return -1; 1072 } 1073 return i; 1074 } 1075 1076 /** 1077 * Implementation of UnicodeMatcher API. Union the set of all 1078 * characters that may be matched by this object into the given 1079 * set. 1080 * @param toUnionTo the set into which to union the source characters 1081 */ 1082 @Override addMatchSetTo(UnicodeSet toUnionTo)1083 public void addMatchSetTo(UnicodeSet toUnionTo) { 1084 toUnionTo.addAll(this); 1085 } 1086 1087 /** 1088 * Returns the index of the given character within this set, where 1089 * the set is ordered by ascending code point. If the character 1090 * is not in this set, return -1. The inverse of this method is 1091 * <code>charAt()</code>. 
1092 * @return an index from 0..size()-1, or -1 1093 */ indexOf(int c)1094 public int indexOf(int c) { 1095 if (c < MIN_VALUE || c > MAX_VALUE) { 1096 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1097 } 1098 int i = 0; 1099 int n = 0; 1100 for (;;) { 1101 int start = list[i++]; 1102 if (c < start) { 1103 return -1; 1104 } 1105 int limit = list[i++]; 1106 if (c < limit) { 1107 return n + c - start; 1108 } 1109 n += limit - start; 1110 } 1111 } 1112 1113 /** 1114 * Returns the character at the given index within this set, where 1115 * the set is ordered by ascending code point. If the index is 1116 * out of range, return -1. The inverse of this method is 1117 * <code>indexOf()</code>. 1118 * @param index an index from 0..size()-1 1119 * @return the character at the given index, or -1. 1120 */ charAt(int index)1121 public int charAt(int index) { 1122 if (index >= 0) { 1123 // len2 is the largest even integer <= len, that is, it is len 1124 // for even values and len-1 for odd values. With odd values 1125 // the last entry is UNICODESET_HIGH. 1126 int len2 = len & ~1; 1127 for (int i=0; i < len2;) { 1128 int start = list[i++]; 1129 int count = list[i++] - start; 1130 if (index < count) { 1131 return start + index; 1132 } 1133 index -= count; 1134 } 1135 } 1136 return -1; 1137 } 1138 1139 /** 1140 * Adds the specified range to this set if it is not already 1141 * present. If this set already contains the specified range, 1142 * the call leaves this set unchanged. If <code>end > start</code> 1143 * then an empty range is added, leaving the set unchanged. 1144 * 1145 * @param start first character, inclusive, of range to be added 1146 * to this set. 1147 * @param end last character, inclusive, of range to be added 1148 * to this set. 
     * @return this object, for chaining
     */
    public UnicodeSet add(int start, int end) {
        checkFrozen();
        return add_unchecked(start, end);
    }

    /**
     * Adds all characters in range (uses preferred naming convention).
     * Behaves exactly like {@code add(start, end)}.
     * @param start The index of where to start on adding all characters.
     * @param end The index of where to end on adding all characters.
     * @return a reference to this object
     */
    public UnicodeSet addAll(int start, int end) {
        checkFrozen();
        return add_unchecked(start, end);
    }

    // for internal use, after checkFrozen has been called
    //
    // Adds the inclusive range start..end.  Both ends are validated against
    // MIN_VALUE..MAX_VALUE.  start > end is a no-op, start == end delegates to
    // add(int), and start < end either takes the append fast path below or
    // falls back to the general list merge.
    private UnicodeSet add_unchecked(int start, int end) {
        if (start < MIN_VALUE || start > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
        }
        if (end < MIN_VALUE || end > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
        }
        if (start < end) {
            // The list stores exclusive limits, hence end + 1.
            int limit = end + 1;
            // Fast path for adding a new range after the last one.
            // Odd list length: [..., lastStart, lastLimit, HIGH]
            if ((len & 1) != 0) {
                // If the list is empty, set lastLimit low enough to not be adjacent to 0.
                int lastLimit = len == 1 ? -2 : list[len - 2];
                if (lastLimit <= start) {
                    // NOTE(review): per the comment above this method the callers
                    // have already called checkFrozen(), so this call looks
                    // redundant -- confirm before removing.
                    checkFrozen();
                    if (lastLimit == start) {
                        // Extend the last range.
                        list[len - 2] = limit;
                        if (limit == HIGH) {
                            // The extended limit is itself HIGH, so the old
                            // trailing HIGH entry is dropped.
                            --len;
                        }
                    } else {
                        // Not adjacent: the trailing HIGH slot becomes the new
                        // range start, and the limit (plus a new HIGH, unless
                        // the limit is HIGH itself) is appended after it.
                        list[len - 1] = start;
                        if (limit < HIGH) {
                            ensureCapacity(len + 2);
                            list[len++] = limit;
                            list[len++] = HIGH;
                        } else {  // limit == HIGH
                            ensureCapacity(len + 1);
                            list[len++] = HIGH;
                        }
                    }
                    // Invalidate the cached pattern.
                    pat = null;
                    return this;
                }
            }
            // This is slow. Could be much faster using findCodePoint(start)
            // and modifying the list, dealing with adjacent & overlapping ranges.
            add(range(start, end), 2, 0);
        } else if (start == end) {
            add(start);
        }
        return this;
    }

    //    /**
    //     * Format out the inversion list as a string, for debugging.  Uncomment when
    //     * needed.
    //     */
    //    public final String dump() {
    //        StringBuffer buf = new StringBuffer("[");
    //        for (int i=0; i<len; ++i) {
    //            if (i != 0) buf.append(", ");
    //            int c = list[i];
    //            //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
    //            //    buf.append((char) c);
    //            //} else {
    //            buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
    //            //}
    //        }
    //        buf.append("]");
    //        return buf.toString();
    //    }

    /**
     * Adds the specified character to this set if it is not already
     * present.  If this set already contains the specified character,
     * the call leaves this set unchanged.
     * @param c the code point to add
     * @return this object, for chaining
     */
    public final UnicodeSet add(int c) {
        checkFrozen();
        return add_unchecked(c);
    }

    // for internal use only, after checkFrozen has been called
    //
    // Inserts the single code point c into the inversion list, merging it with
    // the range that follows and/or precedes it when they are adjacent.
    private final UnicodeSet add_unchecked(int c) {
        if (c < MIN_VALUE || c > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
        }

        // find smallest i such that c < list[i]
        // if odd, then it is IN the set
        // if even, then it is OUT of the set
        int i = findCodePoint(c);

        // already in set?
        if ((i & 1) != 0) return this;

        // HIGH is 0x110000
        // assert(list[len-1] == HIGH);

        // empty = [HIGH]
        // [start_0, limit_0, start_1, limit_1, HIGH]

        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
        //                             ^
        //                             list[i]

        // i == 0 means c is before the first range
        // TODO: Is the "list[i]-1" a typo? Even if you pass MAX_VALUE into
        //      add_unchecked, the maximum value that "c" will be compared to
        //      is "MAX_VALUE-1" meaning that "if (c == MAX_VALUE)" will
        //      never be reached according to this logic.
        // NOTE(review): when list[i] is the HIGH entry (0x110000 per the
        // comment above), list[i]-1 is 0x10FFFF, so the MAX_VALUE branch below
        // appears reachable if MAX_VALUE is 0x10FFFF -- confirm against the
        // constant's definition.
        if (c == list[i]-1) {
            // c is before start of next range
            list[i] = c;
            // if we touched the HIGH mark, then add a new one
            if (c == MAX_VALUE) {
                ensureCapacity(len+1);
                list[len++] = HIGH;
            }
            if (i > 0 && c == list[i-1]) {
                // collapse adjacent ranges

                // [..., start_k-1, c, c, limit_k, ..., HIGH]
                //                     ^
                //                     list[i]
                // Shift the tail left over the two equal entries.
                System.arraycopy(list, i+1, list, i-1, len-i-1);
                len -= 2;
            }
        }

        else if (i > 0 && c == list[i-1]) {
            // c is after end of prior range
            list[i-1]++;
            // no need to check for collapse here
        }

        else {
            // At this point we know the new char is not adjacent to
            // any existing ranges, and it is not 10FFFF.


            // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // Don't use ensureCapacity() to save on copying.
            // NOTE: This has no measurable impact on performance,
            // but it might help in some usage patterns.
            if (len+2 > list.length) {
                // Grow and open the two-slot gap at i in a single pass.
                int[] temp = new int[nextCapacity(len + 2)];
                if (i != 0) System.arraycopy(list, 0, temp, 0, i);
                System.arraycopy(list, i, temp, i+2, len-i);
                list = temp;
            } else {
                // Enough room: shift the tail right by two slots in place.
                System.arraycopy(list, i, list, i+2, len-i);
            }

            list[i] = c;
            list[i+1] = c+1;
            len += 2;
        }

        // Invalidate the cached pattern.
        pat = null;
        return this;
    }

    /**
     * Adds the specified multicharacter to this set if it is not already
     * present.  If this set already contains the multicharacter,
     * the call leaves this set unchanged.
1334 * Thus "ch" => {"ch"} 1335 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1336 * @param s the source string 1337 * @return this object, for chaining 1338 */ add(CharSequence s)1339 public final UnicodeSet add(CharSequence s) { 1340 checkFrozen(); 1341 int cp = getSingleCP(s); 1342 if (cp < 0) { 1343 String str = s.toString(); 1344 if (!strings.contains(str)) { 1345 addString(str); 1346 pat = null; 1347 } 1348 } else { 1349 add_unchecked(cp, cp); 1350 } 1351 return this; 1352 } 1353 addString(CharSequence s)1354 private void addString(CharSequence s) { 1355 if (strings == EMPTY_STRINGS) { 1356 strings = new TreeSet<>(); 1357 } 1358 strings.add(s.toString()); 1359 } 1360 1361 /** 1362 * Utility for getting code point from single code point CharSequence. 1363 * See the public UTF16.getSingleCodePoint() 1364 * @return a code point IF the string consists of a single one. 1365 * otherwise returns -1. 1366 * @param s to test 1367 */ getSingleCP(CharSequence s)1368 private static int getSingleCP(CharSequence s) { 1369 if (s.length() < 1) { 1370 throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); 1371 } 1372 if (s.length() > 2) return -1; 1373 if (s.length() == 1) return s.charAt(0); 1374 1375 // at this point, len = 2 1376 int cp = UTF16.charAt(s, 0); 1377 if (cp > 0xFFFF) { // is surrogate pair 1378 return cp; 1379 } 1380 return -1; 1381 } 1382 1383 /** 1384 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1385 * If this set already any particular character, it has no effect on that character. 
1386 * @param s the source string 1387 * @return this object, for chaining 1388 */ addAll(CharSequence s)1389 public final UnicodeSet addAll(CharSequence s) { 1390 checkFrozen(); 1391 int cp; 1392 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1393 cp = UTF16.charAt(s, i); 1394 add_unchecked(cp, cp); 1395 } 1396 return this; 1397 } 1398 1399 /** 1400 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1401 * If this set already any particular character, it has no effect on that character. 1402 * @param s the source string 1403 * @return this object, for chaining 1404 */ retainAll(CharSequence s)1405 public final UnicodeSet retainAll(CharSequence s) { 1406 return retainAll(fromAll(s)); 1407 } 1408 1409 /** 1410 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1411 * If this set already any particular character, it has no effect on that character. 1412 * @param s the source string 1413 * @return this object, for chaining 1414 */ complementAll(CharSequence s)1415 public final UnicodeSet complementAll(CharSequence s) { 1416 return complementAll(fromAll(s)); 1417 } 1418 1419 /** 1420 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1421 * If this set already any particular character, it has no effect on that character. 1422 * @param s the source string 1423 * @return this object, for chaining 1424 */ removeAll(CharSequence s)1425 public final UnicodeSet removeAll(CharSequence s) { 1426 return removeAll(fromAll(s)); 1427 } 1428 1429 /** 1430 * Remove all strings from this UnicodeSet 1431 * @return this object, for chaining 1432 */ removeAllStrings()1433 public final UnicodeSet removeAllStrings() { 1434 checkFrozen(); 1435 if (hasStrings()) { 1436 strings.clear(); 1437 pat = null; 1438 } 1439 return this; 1440 } 1441 1442 /** 1443 * Makes a set from a multicharacter string. 
Thus "ch" => {"ch"} 1444 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1445 * @param s the source string 1446 * @return a newly created set containing the given string 1447 */ from(CharSequence s)1448 public static UnicodeSet from(CharSequence s) { 1449 return new UnicodeSet().add(s); 1450 } 1451 1452 1453 /** 1454 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1455 * @param s the source string 1456 * @return a newly created set containing the given characters 1457 */ fromAll(CharSequence s)1458 public static UnicodeSet fromAll(CharSequence s) { 1459 return new UnicodeSet().addAll(s); 1460 } 1461 1462 1463 /** 1464 * Retain only the elements in this set that are contained in the 1465 * specified range. If <code>end > start</code> then an empty range is 1466 * retained, leaving the set empty. 1467 * 1468 * @param start first character, inclusive, of range to be retained 1469 * to this set. 1470 * @param end last character, inclusive, of range to be retained 1471 * to this set. 1472 */ retain(int start, int end)1473 public UnicodeSet retain(int start, int end) { 1474 checkFrozen(); 1475 if (start < MIN_VALUE || start > MAX_VALUE) { 1476 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1477 } 1478 if (end < MIN_VALUE || end > MAX_VALUE) { 1479 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1480 } 1481 if (start <= end) { 1482 retain(range(start, end), 2, 0); 1483 } else { 1484 clear(); 1485 } 1486 return this; 1487 } 1488 1489 /** 1490 * Retain the specified character from this set if it is present. 1491 * Upon return this set will be empty if it did not contain c, or 1492 * will only contain c if it did contain c. 
1493 * @param c the character to be retained 1494 * @return this object, for chaining 1495 */ retain(int c)1496 public final UnicodeSet retain(int c) { 1497 return retain(c, c); 1498 } 1499 1500 /** 1501 * Retain the specified string in this set if it is present. 1502 * Upon return this set will be empty if it did not contain s, or 1503 * will only contain s if it did contain s. 1504 * @param cs the string to be retained 1505 * @return this object, for chaining 1506 */ retain(CharSequence cs)1507 public final UnicodeSet retain(CharSequence cs) { 1508 int cp = getSingleCP(cs); 1509 if (cp < 0) { 1510 checkFrozen(); 1511 String s = cs.toString(); 1512 boolean isIn = strings.contains(s); 1513 if (isIn && size() == 1) { 1514 return this; 1515 } 1516 clear(); 1517 addString(s); 1518 pat = null; 1519 } else { 1520 retain(cp, cp); 1521 } 1522 return this; 1523 } 1524 1525 /** 1526 * Removes the specified range from this set if it is present. 1527 * The set will not contain the specified range once the call 1528 * returns. If <code>end > start</code> then an empty range is 1529 * removed, leaving the set unchanged. 1530 * 1531 * @param start first character, inclusive, of range to be removed 1532 * from this set. 1533 * @param end last character, inclusive, of range to be removed 1534 * from this set. 1535 */ remove(int start, int end)1536 public UnicodeSet remove(int start, int end) { 1537 checkFrozen(); 1538 if (start < MIN_VALUE || start > MAX_VALUE) { 1539 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1540 } 1541 if (end < MIN_VALUE || end > MAX_VALUE) { 1542 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1543 } 1544 if (start <= end) { 1545 retain(range(start, end), 2, 2); 1546 } 1547 return this; 1548 } 1549 1550 /** 1551 * Removes the specified character from this set if it is present. 1552 * The set will not contain the specified character once the call 1553 * returns. 
1554 * @param c the character to be removed 1555 * @return this object, for chaining 1556 */ remove(int c)1557 public final UnicodeSet remove(int c) { 1558 return remove(c, c); 1559 } 1560 1561 /** 1562 * Removes the specified string from this set if it is present. 1563 * The set will not contain the specified string once the call 1564 * returns. 1565 * @param s the string to be removed 1566 * @return this object, for chaining 1567 */ remove(CharSequence s)1568 public final UnicodeSet remove(CharSequence s) { 1569 int cp = getSingleCP(s); 1570 if (cp < 0) { 1571 checkFrozen(); 1572 String str = s.toString(); 1573 if (strings.contains(str)) { 1574 strings.remove(str); 1575 pat = null; 1576 } 1577 } else { 1578 remove(cp, cp); 1579 } 1580 return this; 1581 } 1582 1583 /** 1584 * Complements the specified range in this set. Any character in 1585 * the range will be removed if it is in this set, or will be 1586 * added if it is not in this set. If <code>end > start</code> 1587 * then an empty range is complemented, leaving the set unchanged. 1588 * 1589 * @param start first character, inclusive, of range to be removed 1590 * from this set. 1591 * @param end last character, inclusive, of range to be removed 1592 * from this set. 1593 */ complement(int start, int end)1594 public UnicodeSet complement(int start, int end) { 1595 checkFrozen(); 1596 if (start < MIN_VALUE || start > MAX_VALUE) { 1597 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1598 } 1599 if (end < MIN_VALUE || end > MAX_VALUE) { 1600 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1601 } 1602 if (start <= end) { 1603 xor(range(start, end), 2, 0); 1604 } 1605 pat = null; 1606 return this; 1607 } 1608 1609 /** 1610 * Complements the specified character in this set. The character 1611 * will be removed if it is in this set, or will be added if it is 1612 * not in this set. 
1613 */ complement(int c)1614 public final UnicodeSet complement(int c) { 1615 return complement(c, c); 1616 } 1617 1618 /** 1619 * This is equivalent to 1620 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1621 */ complement()1622 public UnicodeSet complement() { 1623 checkFrozen(); 1624 if (list[0] == LOW) { 1625 System.arraycopy(list, 1, list, 0, len-1); 1626 --len; 1627 } else { 1628 ensureCapacity(len+1); 1629 System.arraycopy(list, 0, list, 1, len); 1630 list[0] = LOW; 1631 ++len; 1632 } 1633 pat = null; 1634 return this; 1635 } 1636 1637 /** 1638 * Complement the specified string in this set. 1639 * The set will not contain the specified string once the call 1640 * returns. 1641 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1642 * @param s the string to complement 1643 * @return this object, for chaining 1644 */ complement(CharSequence s)1645 public final UnicodeSet complement(CharSequence s) { 1646 checkFrozen(); 1647 int cp = getSingleCP(s); 1648 if (cp < 0) { 1649 String s2 = s.toString(); 1650 if (strings.contains(s2)) { 1651 strings.remove(s2); 1652 } else { 1653 addString(s2); 1654 } 1655 pat = null; 1656 } else { 1657 complement(cp, cp); 1658 } 1659 return this; 1660 } 1661 1662 /** 1663 * Returns true if this set contains the given character. 1664 * @param c character to be checked for containment 1665 * @return true if the test condition is met 1666 */ 1667 @Override contains(int c)1668 public boolean contains(int c) { 1669 if (c < MIN_VALUE || c > MAX_VALUE) { 1670 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1671 } 1672 if (bmpSet != null) { 1673 return bmpSet.contains(c); 1674 } 1675 if (stringSpan != null) { 1676 return stringSpan.contains(c); 1677 } 1678 1679 /* 1680 // Set i to the index of the start item greater than ch 1681 // We know we will terminate without length test! 
1682 int i = -1; 1683 while (true) { 1684 if (c < list[++i]) break; 1685 } 1686 */ 1687 1688 int i = findCodePoint(c); 1689 1690 return ((i & 1) != 0); // return true if odd 1691 } 1692 1693 /** 1694 * Returns the smallest value i such that c < list[i]. Caller 1695 * must ensure that c is a legal value or this method will enter 1696 * an infinite loop. This method performs a binary search. 1697 * @param c a character in the range MIN_VALUE..MAX_VALUE 1698 * inclusive 1699 * @return the smallest integer i in the range 0..len-1, 1700 * inclusive, such that c < list[i] 1701 */ findCodePoint(int c)1702 private final int findCodePoint(int c) { 1703 /* Examples: 1704 findCodePoint(c) 1705 set list[] c=0 1 3 4 7 8 1706 === ============== =========== 1707 [] [110000] 0 0 0 0 0 0 1708 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 1709 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 1710 [:all:] [0, 110000] 1 1 1 1 1 1 1711 */ 1712 1713 // Return the smallest i such that c < list[i]. Assume 1714 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 1715 if (c < list[0]) return 0; 1716 // High runner test. c is often after the last range, so an 1717 // initial check for this condition pays off. 
1718 if (len >= 2 && c >= list[len-2]) return len-1; 1719 int lo = 0; 1720 int hi = len - 1; 1721 // invariant: c >= list[lo] 1722 // invariant: c < list[hi] 1723 for (;;) { 1724 int i = (lo + hi) >>> 1; 1725 if (i == lo) return hi; 1726 if (c < list[i]) { 1727 hi = i; 1728 } else { 1729 lo = i; 1730 } 1731 } 1732 } 1733 1734 // //---------------------------------------------------------------- 1735 // // Unrolled binary search 1736 // //---------------------------------------------------------------- 1737 // 1738 // private int validLen = -1; // validated value of len 1739 // private int topOfLow; 1740 // private int topOfHigh; 1741 // private int power; 1742 // private int deltaStart; 1743 // 1744 // private void validate() { 1745 // if (len <= 1) { 1746 // throw new IllegalArgumentException("list.len==" + len + "; must be >1"); 1747 // } 1748 // 1749 // // find greatest power of 2 less than or equal to len 1750 // for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {} 1751 // 1752 // // assert(exp2[power] <= len); 1753 // 1754 // // determine the starting points 1755 // topOfLow = exp2[power] - 1; 1756 // topOfHigh = len - 1; 1757 // deltaStart = exp2[power-1]; 1758 // validLen = len; 1759 // } 1760 // 1761 // private static final int exp2[] = { 1762 // 0x1, 0x2, 0x4, 0x8, 1763 // 0x10, 0x20, 0x40, 0x80, 1764 // 0x100, 0x200, 0x400, 0x800, 1765 // 0x1000, 0x2000, 0x4000, 0x8000, 1766 // 0x10000, 0x20000, 0x40000, 0x80000, 1767 // 0x100000, 0x200000, 0x400000, 0x800000, 1768 // 0x1000000, 0x2000000, 0x4000000, 0x8000000, 1769 // 0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java 1770 // }; 1771 // 1772 // /** 1773 // * Unrolled lowest index GT. 1774 // */ 1775 // private final int leastIndexGT(int searchValue) { 1776 // 1777 // if (len != validLen) { 1778 // if (len == 1) return 0; 1779 // validate(); 1780 // } 1781 // int temp; 1782 // 1783 // // set up initial range to search. 
Each subrange is a power of two in length 1784 // int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh; 1785 // 1786 // // Completely unrolled binary search, folhighing "Programming Pearls" 1787 // // Each case deliberately falls through to the next 1788 // // Logically, list[-1] < all_search_values && list[count] > all_search_values 1789 // // although the values -1 and count are never actually touched. 1790 // 1791 // // The bounds at each point are low & high, 1792 // // where low == high - delta*2 1793 // // so high - delta is the midpoint 1794 // 1795 // // The invariant AFTER each line is that list[low] < searchValue <= list[high] 1796 // 1797 // switch (power) { 1798 // //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java 1799 // case 30: if (searchValue < list[temp = high-0x20000000]) high = temp; 1800 // case 29: if (searchValue < list[temp = high-0x10000000]) high = temp; 1801 // 1802 // case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp; 1803 // case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp; 1804 // case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp; 1805 // case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp; 1806 // 1807 // case 24: if (searchValue < list[temp = high- 0x800000]) high = temp; 1808 // case 23: if (searchValue < list[temp = high- 0x400000]) high = temp; 1809 // case 22: if (searchValue < list[temp = high- 0x200000]) high = temp; 1810 // case 21: if (searchValue < list[temp = high- 0x100000]) high = temp; 1811 // 1812 // case 20: if (searchValue < list[temp = high- 0x80000]) high = temp; 1813 // case 19: if (searchValue < list[temp = high- 0x40000]) high = temp; 1814 // case 18: if (searchValue < list[temp = high- 0x20000]) high = temp; 1815 // case 17: if (searchValue < list[temp = high- 0x10000]) high = temp; 1816 // 1817 // case 16: if (searchValue < list[temp = high- 0x8000]) high = temp; 1818 // case 15: 
if (searchValue < list[temp = high- 0x4000]) high = temp; 1819 // case 14: if (searchValue < list[temp = high- 0x2000]) high = temp; 1820 // case 13: if (searchValue < list[temp = high- 0x1000]) high = temp; 1821 // 1822 // case 12: if (searchValue < list[temp = high- 0x800]) high = temp; 1823 // case 11: if (searchValue < list[temp = high- 0x400]) high = temp; 1824 // case 10: if (searchValue < list[temp = high- 0x200]) high = temp; 1825 // case 9: if (searchValue < list[temp = high- 0x100]) high = temp; 1826 // 1827 // case 8: if (searchValue < list[temp = high- 0x80]) high = temp; 1828 // case 7: if (searchValue < list[temp = high- 0x40]) high = temp; 1829 // case 6: if (searchValue < list[temp = high- 0x20]) high = temp; 1830 // case 5: if (searchValue < list[temp = high- 0x10]) high = temp; 1831 // 1832 // case 4: if (searchValue < list[temp = high- 0x8]) high = temp; 1833 // case 3: if (searchValue < list[temp = high- 0x4]) high = temp; 1834 // case 2: if (searchValue < list[temp = high- 0x2]) high = temp; 1835 // case 1: if (searchValue < list[temp = high- 0x1]) high = temp; 1836 // } 1837 // 1838 // return high; 1839 // } 1840 // 1841 // // For debugging only 1842 // public int len() { 1843 // return len; 1844 // } 1845 // 1846 // //---------------------------------------------------------------- 1847 // //---------------------------------------------------------------- 1848 1849 /** 1850 * Returns true if this set contains every character 1851 * of the given range. 
1852 * @param start first character, inclusive, of the range 1853 * @param end last character, inclusive, of the range 1854 * @return true if the test condition is met 1855 */ contains(int start, int end)1856 public boolean contains(int start, int end) { 1857 if (start < MIN_VALUE || start > MAX_VALUE) { 1858 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1859 } 1860 if (end < MIN_VALUE || end > MAX_VALUE) { 1861 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1862 } 1863 //int i = -1; 1864 //while (true) { 1865 // if (start < list[++i]) break; 1866 //} 1867 int i = findCodePoint(start); 1868 return ((i & 1) != 0 && end < list[i]); 1869 } 1870 1871 /** 1872 * Returns <tt>true</tt> if this set contains the given 1873 * multicharacter string. 1874 * @param s string to be checked for containment 1875 * @return <tt>true</tt> if this set contains the specified string 1876 */ contains(CharSequence s)1877 public final boolean contains(CharSequence s) { 1878 1879 int cp = getSingleCP(s); 1880 if (cp < 0) { 1881 return strings.contains(s.toString()); 1882 } else { 1883 return contains(cp); 1884 } 1885 } 1886 1887 /** 1888 * Returns true if this set contains all the characters and strings 1889 * of the given set. 1890 * @param b set to be checked for containment 1891 * @return true if the test condition is met 1892 */ containsAll(UnicodeSet b)1893 public boolean containsAll(UnicodeSet b) { 1894 // The specified set is a subset if all of its pairs are contained in 1895 // this set. This implementation accesses the lists directly for speed. 1896 // TODO: this could be faster if size() were cached. But that would affect building speed 1897 // so it needs investigation. 
1898 int[] listB = b.list; 1899 boolean needA = true; 1900 boolean needB = true; 1901 int aPtr = 0; 1902 int bPtr = 0; 1903 int aLen = len - 1; 1904 int bLen = b.len - 1; 1905 int startA = 0, startB = 0, limitA = 0, limitB = 0; 1906 while (true) { 1907 // double iterations are such a pain... 1908 if (needA) { 1909 if (aPtr >= aLen) { 1910 // ran out of A. If B is also exhausted, then break; 1911 if (needB && bPtr >= bLen) { 1912 break; 1913 } 1914 return false; 1915 } 1916 startA = list[aPtr++]; 1917 limitA = list[aPtr++]; 1918 } 1919 if (needB) { 1920 if (bPtr >= bLen) { 1921 // ran out of B. Since we got this far, we have an A and we are ok so far 1922 break; 1923 } 1924 startB = listB[bPtr++]; 1925 limitB = listB[bPtr++]; 1926 } 1927 // if B doesn't overlap and is greater than A, get new A 1928 if (startB >= limitA) { 1929 needA = true; 1930 needB = false; 1931 continue; 1932 } 1933 // if B is wholy contained in A, then get a new B 1934 if (startB >= startA && limitB <= limitA) { 1935 needA = false; 1936 needB = true; 1937 continue; 1938 } 1939 // all other combinations mean we fail 1940 return false; 1941 } 1942 1943 if (!strings.containsAll(b.strings)) return false; 1944 return true; 1945 } 1946 1947 // /** 1948 // * Returns true if this set contains all the characters and strings 1949 // * of the given set. 1950 // * @param c set to be checked for containment 1951 // * @return true if the test condition is met 1952 // * @stable ICU 2.0 1953 // */ 1954 // public boolean containsAllOld(UnicodeSet c) { 1955 // // The specified set is a subset if all of its pairs are contained in 1956 // // this set. It's possible to code this more efficiently in terms of 1957 // // direct manipulation of the inversion lists if the need arises. 
1958 // int n = c.getRangeCount(); 1959 // for (int i=0; i<n; ++i) { 1960 // if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 1961 // return false; 1962 // } 1963 // } 1964 // if (!strings.containsAll(c.strings)) return false; 1965 // return true; 1966 // } 1967 1968 /** 1969 * Returns true if there is a partition of the string such that this set contains each of the partitioned strings. 1970 * For example, for the Unicode set [a{bc}{cd}]<br> 1971 * containsAll is true for each of: "a", "bc", ""cdbca"<br> 1972 * containsAll is false for each of: "acb", "bcda", "bcx"<br> 1973 * @param s string containing characters to be checked for containment 1974 * @return true if the test condition is met 1975 */ containsAll(String s)1976 public boolean containsAll(String s) { 1977 int cp; 1978 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1979 cp = UTF16.charAt(s, i); 1980 if (!contains(cp)) { 1981 if (!hasStrings()) { 1982 return false; 1983 } 1984 return containsAll(s, 0); 1985 } 1986 } 1987 return true; 1988 } 1989 1990 /** 1991 * Recursive routine called if we fail to find a match in containsAll, and there are strings 1992 * @param s source string 1993 * @param i point to match to the end on 1994 * @return true if ok 1995 */ containsAll(String s, int i)1996 private boolean containsAll(String s, int i) { 1997 if (i >= s.length()) { 1998 return true; 1999 } 2000 int cp= UTF16.charAt(s, i); 2001 if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) { 2002 return true; 2003 } 2004 for (String setStr : strings) { 2005 if (s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) { 2006 return true; 2007 } 2008 } 2009 return false; 2010 2011 } 2012 2013 /** 2014 * Get the Regex equivalent for this UnicodeSet 2015 * @return regex pattern equivalent to this UnicodeSet 2016 * @deprecated This API is ICU internal only. 
2017 * @hide deprecated on icu4j-org 2018 * @hide draft / provisional / internal are hidden on OHOS 2019 */ 2020 @Deprecated getRegexEquivalent()2021 public String getRegexEquivalent() { 2022 if (!hasStrings()) { 2023 return toString(); 2024 } 2025 StringBuilder result = new StringBuilder("(?:"); 2026 appendNewPattern(result, true, false); 2027 for (String s : strings) { 2028 result.append('|'); 2029 _appendToPat(result, s, true); 2030 } 2031 return result.append(")").toString(); 2032 } 2033 2034 /** 2035 * Returns true if this set contains none of the characters 2036 * of the given range. 2037 * @param start first character, inclusive, of the range 2038 * @param end last character, inclusive, of the range 2039 * @return true if the test condition is met 2040 */ containsNone(int start, int end)2041 public boolean containsNone(int start, int end) { 2042 if (start < MIN_VALUE || start > MAX_VALUE) { 2043 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 2044 } 2045 if (end < MIN_VALUE || end > MAX_VALUE) { 2046 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 2047 } 2048 int i = -1; 2049 while (true) { 2050 if (start < list[++i]) break; 2051 } 2052 return ((i & 1) == 0 && end < list[i]); 2053 } 2054 2055 /** 2056 * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2057 * For example, for the Unicode set [a{bc}{cd}]<br> 2058 * containsNone is true for: "xy", "cb"<br> 2059 * containsNone is false for: "a", "bc", "bcd"<br> 2060 * @param b set to be checked for containment 2061 * @return true if the test condition is met 2062 */ containsNone(UnicodeSet b)2063 public boolean containsNone(UnicodeSet b) { 2064 // The specified set is a subset if some of its pairs overlap with some of this set's pairs. 2065 // This implementation accesses the lists directly for speed. 
2066 int[] listB = b.list; 2067 boolean needA = true; 2068 boolean needB = true; 2069 int aPtr = 0; 2070 int bPtr = 0; 2071 int aLen = len - 1; 2072 int bLen = b.len - 1; 2073 int startA = 0, startB = 0, limitA = 0, limitB = 0; 2074 while (true) { 2075 // double iterations are such a pain... 2076 if (needA) { 2077 if (aPtr >= aLen) { 2078 // ran out of A: break so we test strings 2079 break; 2080 } 2081 startA = list[aPtr++]; 2082 limitA = list[aPtr++]; 2083 } 2084 if (needB) { 2085 if (bPtr >= bLen) { 2086 // ran out of B: break so we test strings 2087 break; 2088 } 2089 startB = listB[bPtr++]; 2090 limitB = listB[bPtr++]; 2091 } 2092 // if B is higher than any part of A, get new A 2093 if (startB >= limitA) { 2094 needA = true; 2095 needB = false; 2096 continue; 2097 } 2098 // if A is higher than any part of B, get new B 2099 if (startA >= limitB) { 2100 needA = false; 2101 needB = true; 2102 continue; 2103 } 2104 // all other combinations mean we fail 2105 return false; 2106 } 2107 2108 if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false; 2109 return true; 2110 } 2111 2112 // /** 2113 // * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2114 // * For example, for the Unicode set [a{bc}{cd}]<br> 2115 // * containsNone is true for: "xy", "cb"<br> 2116 // * containsNone is false for: "a", "bc", "bcd"<br> 2117 // * @param c set to be checked for containment 2118 // * @return true if the test condition is met 2119 // * @stable ICU 2.0 2120 // */ 2121 // public boolean containsNoneOld(UnicodeSet c) { 2122 // // The specified set is a subset if all of its pairs are contained in 2123 // // this set. It's possible to code this more efficiently in terms of 2124 // // direct manipulation of the inversion lists if the need arises. 
2125 // int n = c.getRangeCount(); 2126 // for (int i=0; i<n; ++i) { 2127 // if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 2128 // return false; 2129 // } 2130 // } 2131 // if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false; 2132 // return true; 2133 // } 2134 2135 /** 2136 * Returns true if this set contains none of the characters 2137 * of the given string. 2138 * @param s string containing characters to be checked for containment 2139 * @return true if the test condition is met 2140 */ containsNone(CharSequence s)2141 public boolean containsNone(CharSequence s) { 2142 return span(s, SpanCondition.NOT_CONTAINED) == s.length(); 2143 } 2144 2145 /** 2146 * Returns true if this set contains one or more of the characters 2147 * in the given range. 2148 * @param start first character, inclusive, of the range 2149 * @param end last character, inclusive, of the range 2150 * @return true if the condition is met 2151 */ containsSome(int start, int end)2152 public final boolean containsSome(int start, int end) { 2153 return !containsNone(start, end); 2154 } 2155 2156 /** 2157 * Returns true if this set contains one or more of the characters 2158 * and strings of the given set. 2159 * @param s set to be checked for containment 2160 * @return true if the condition is met 2161 */ containsSome(UnicodeSet s)2162 public final boolean containsSome(UnicodeSet s) { 2163 return !containsNone(s); 2164 } 2165 2166 /** 2167 * Returns true if this set contains one or more of the characters 2168 * of the given string. 2169 * @param s string containing characters to be checked for containment 2170 * @return true if the condition is met 2171 */ containsSome(CharSequence s)2172 public final boolean containsSome(CharSequence s) { 2173 return !containsNone(s); 2174 } 2175 2176 2177 /** 2178 * Adds all of the elements in the specified set to this set if 2179 * they're not already present. 
This operation effectively 2180 * modifies this set so that its value is the <i>union</i> of the two 2181 * sets. The behavior of this operation is unspecified if the specified 2182 * collection is modified while the operation is in progress. 2183 * 2184 * @param c set whose elements are to be added to this set. 2185 */ addAll(UnicodeSet c)2186 public UnicodeSet addAll(UnicodeSet c) { 2187 checkFrozen(); 2188 add(c.list, c.len, 0); 2189 if (c.hasStrings()) { 2190 if (strings == EMPTY_STRINGS) { 2191 strings = new TreeSet<>(c.strings); 2192 } else { 2193 strings.addAll(c.strings); 2194 } 2195 } 2196 return this; 2197 } 2198 2199 /** 2200 * Retains only the elements in this set that are contained in the 2201 * specified set. In other words, removes from this set all of 2202 * its elements that are not contained in the specified set. This 2203 * operation effectively modifies this set so that its value is 2204 * the <i>intersection</i> of the two sets. 2205 * 2206 * @param c set that defines which elements this set will retain. 2207 */ retainAll(UnicodeSet c)2208 public UnicodeSet retainAll(UnicodeSet c) { 2209 checkFrozen(); 2210 retain(c.list, c.len, 0); 2211 if (hasStrings()) { 2212 if (!c.hasStrings()) { 2213 strings.clear(); 2214 } else { 2215 strings.retainAll(c.strings); 2216 } 2217 } 2218 return this; 2219 } 2220 2221 /** 2222 * Removes from this set all of its elements that are contained in the 2223 * specified set. This operation effectively modifies this 2224 * set so that its value is the <i>asymmetric set difference</i> of 2225 * the two sets. 2226 * 2227 * @param c set that defines which elements will be removed from 2228 * this set. 
2229 */ removeAll(UnicodeSet c)2230 public UnicodeSet removeAll(UnicodeSet c) { 2231 checkFrozen(); 2232 retain(c.list, c.len, 2); 2233 if (hasStrings() && c.hasStrings()) { 2234 strings.removeAll(c.strings); 2235 } 2236 return this; 2237 } 2238 2239 /** 2240 * Complements in this set all elements contained in the specified 2241 * set. Any character in the other set will be removed if it is 2242 * in this set, or will be added if it is not in this set. 2243 * 2244 * @param c set that defines which elements will be complemented from 2245 * this set. 2246 */ complementAll(UnicodeSet c)2247 public UnicodeSet complementAll(UnicodeSet c) { 2248 checkFrozen(); 2249 xor(c.list, c.len, 0); 2250 if (c.hasStrings()) { 2251 if (strings == EMPTY_STRINGS) { 2252 strings = new TreeSet<>(c.strings); 2253 } else { 2254 SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings); 2255 } 2256 } 2257 return this; 2258 } 2259 2260 /** 2261 * Removes all of the elements from this set. This set will be 2262 * empty after this call returns. 2263 */ clear()2264 public UnicodeSet clear() { 2265 checkFrozen(); 2266 list[0] = HIGH; 2267 len = 1; 2268 pat = null; 2269 if (hasStrings()) { 2270 strings.clear(); 2271 } 2272 return this; 2273 } 2274 2275 /** 2276 * Iteration method that returns the number of ranges contained in 2277 * this set. 2278 * @see #getRangeStart 2279 * @see #getRangeEnd 2280 */ getRangeCount()2281 public int getRangeCount() { 2282 return len/2; 2283 } 2284 2285 /** 2286 * Iteration method that returns the first character in the 2287 * specified range of this set. 
2288 * @exception ArrayIndexOutOfBoundsException if index is outside 2289 * the range <code>0..getRangeCount()-1</code> 2290 * @see #getRangeCount 2291 * @see #getRangeEnd 2292 */ getRangeStart(int index)2293 public int getRangeStart(int index) { 2294 return list[index*2]; 2295 } 2296 2297 /** 2298 * Iteration method that returns the last character in the 2299 * specified range of this set. 2300 * @exception ArrayIndexOutOfBoundsException if index is outside 2301 * the range <code>0..getRangeCount()-1</code> 2302 * @see #getRangeStart 2303 * @see #getRangeEnd 2304 */ getRangeEnd(int index)2305 public int getRangeEnd(int index) { 2306 return (list[index*2 + 1] - 1); 2307 } 2308 2309 /** 2310 * Reallocate this objects internal structures to take up the least 2311 * possible space, without changing this object's value. 2312 */ compact()2313 public UnicodeSet compact() { 2314 checkFrozen(); 2315 if ((len + 7) < list.length) { 2316 // If we have more than a little unused capacity, shrink it to len. 2317 list = Arrays.copyOf(list, len); 2318 } 2319 rangeList = null; 2320 buffer = null; 2321 if (strings != EMPTY_STRINGS && strings.isEmpty()) { 2322 strings = EMPTY_STRINGS; 2323 } 2324 return this; 2325 } 2326 2327 /** 2328 * Compares the specified object with this set for equality. Returns 2329 * <tt>true</tt> if the specified object is also a set, the two sets 2330 * have the same size, and every member of the specified set is 2331 * contained in this set (or equivalently, every member of this set is 2332 * contained in the specified set). 2333 * 2334 * @param o Object to be compared for equality with this set. 2335 * @return <tt>true</tt> if the specified Object is equal to this set. 
2336 */ 2337 @Override equals(Object o)2338 public boolean equals(Object o) { 2339 if (o == null) { 2340 return false; 2341 } 2342 if (this == o) { 2343 return true; 2344 } 2345 try { 2346 UnicodeSet that = (UnicodeSet) o; 2347 if (len != that.len) return false; 2348 for (int i = 0; i < len; ++i) { 2349 if (list[i] != that.list[i]) return false; 2350 } 2351 if (!strings.equals(that.strings)) return false; 2352 } catch (Exception e) { 2353 return false; 2354 } 2355 return true; 2356 } 2357 2358 /** 2359 * Returns the hash code value for this set. 2360 * 2361 * @return the hash code value for this set. 2362 * @see java.lang.Object#hashCode() 2363 */ 2364 @Override hashCode()2365 public int hashCode() { 2366 int result = len; 2367 for (int i = 0; i < len; ++i) { 2368 result *= 1000003; 2369 result += list[i]; 2370 } 2371 return result; 2372 } 2373 2374 /** 2375 * Return a programmer-readable string representation of this object. 2376 */ 2377 @Override toString()2378 public String toString() { 2379 return toPattern(true); 2380 } 2381 2382 //---------------------------------------------------------------- 2383 // Implementation: Pattern parsing 2384 //---------------------------------------------------------------- 2385 2386 /** 2387 * Parses the given pattern, starting at the given position. The character 2388 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. 2389 * Parsing continues until the corresponding closing ']'. If a syntax error 2390 * is encountered between the opening and closing brace, the parse fails. 2391 * Upon return from a successful parse, the ParsePosition is updated to 2392 * point to the character following the closing ']', and an inversion 2393 * list for the parsed pattern is returned. This method 2394 * calls itself recursively to parse embedded subpatterns. 2395 * 2396 * @param pattern the string containing the pattern to be parsed. 
The 2397 * portion of the string from pos.getIndex(), which must be a '[', to the 2398 * corresponding closing ']', is parsed. 2399 * @param pos upon entry, the position at which to being parsing. The 2400 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return 2401 * from a successful parse, pos.getIndex() is either the character after the 2402 * closing ']' of the parsed pattern, or pattern.length() if the closing ']' 2403 * is the last character of the pattern string. 2404 * @return an inversion list for the parsed substring 2405 * of <code>pattern</code> 2406 * @exception java.lang.IllegalArgumentException if the parse fails. 2407 * @deprecated This API is ICU internal only. 2408 * @hide deprecated on icu4j-org 2409 * @hide draft / provisional / internal are hidden on OHOS 2410 */ 2411 @Deprecated applyPattern(String pattern, ParsePosition pos, SymbolTable symbols, int options)2412 public UnicodeSet applyPattern(String pattern, 2413 ParsePosition pos, 2414 SymbolTable symbols, 2415 int options) { 2416 2417 // Need to build the pattern in a temporary string because 2418 // _applyPattern calls add() etc., which set pat to empty. 
2419 boolean parsePositionWasNull = pos == null; 2420 if (parsePositionWasNull) { 2421 pos = new ParsePosition(0); 2422 } 2423 2424 StringBuilder rebuiltPat = new StringBuilder(); 2425 RuleCharacterIterator chars = 2426 new RuleCharacterIterator(pattern, symbols, pos); 2427 applyPattern(chars, symbols, rebuiltPat, options, 0); 2428 if (chars.inVariable()) { 2429 syntaxError(chars, "Extra chars in variable value"); 2430 } 2431 pat = rebuiltPat.toString(); 2432 if (parsePositionWasNull) { 2433 int i = pos.getIndex(); 2434 2435 // Skip over trailing whitespace 2436 if ((options & IGNORE_SPACE) != 0) { 2437 i = PatternProps.skipWhiteSpace(pattern, i); 2438 } 2439 2440 if (i != pattern.length()) { 2441 throw new IllegalArgumentException("Parse of \"" + pattern + 2442 "\" failed at " + i); 2443 } 2444 } 2445 return this; 2446 } 2447 2448 // Add constants to make the applyPattern() code easier to follow. 2449 2450 private static final int LAST0_START = 0, 2451 LAST1_RANGE = 1, 2452 LAST2_SET = 2; 2453 2454 private static final int MODE0_NONE = 0, 2455 MODE1_INBRACKET = 1, 2456 MODE2_OUTBRACKET = 2; 2457 2458 private static final int SETMODE0_NONE = 0, 2459 SETMODE1_UNICODESET = 1, 2460 SETMODE2_PROPERTYPAT = 2, 2461 SETMODE3_PREPARSED = 3; 2462 2463 private static final int MAX_DEPTH = 100; 2464 2465 /** 2466 * Parse the pattern from the given RuleCharacterIterator. The 2467 * iterator is advanced over the parsed pattern. 2468 * @param chars iterator over the pattern characters. Upon return 2469 * it will be advanced to the first character after the parsed 2470 * pattern, or the end of the iteration if all characters are 2471 * parsed. 2472 * @param symbols symbol table to use to parse and dereference 2473 * variables, or null if none. 2474 * @param rebuiltPat the pattern that was parsed, rebuilt or 2475 * copied from the input pattern, as appropriate. 2476 * @param options a bit mask of zero or more of the following: 2477 * IGNORE_SPACE, CASE. 
2478 */ applyPattern(RuleCharacterIterator chars, SymbolTable symbols, Appendable rebuiltPat, int options, int depth)2479 private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, 2480 Appendable rebuiltPat, int options, int depth) { 2481 if (depth > MAX_DEPTH) { 2482 syntaxError(chars, "Pattern nested too deeply"); 2483 } 2484 2485 // Syntax characters: [ ] ^ - & { } 2486 2487 // Recognized special forms for chars, sets: c-c s-s s&s 2488 2489 int opts = RuleCharacterIterator.PARSE_VARIABLES | 2490 RuleCharacterIterator.PARSE_ESCAPES; 2491 if ((options & IGNORE_SPACE) != 0) { 2492 opts |= RuleCharacterIterator.SKIP_WHITESPACE; 2493 } 2494 2495 StringBuilder patBuf = new StringBuilder(), buf = null; 2496 boolean usePat = false; 2497 UnicodeSet scratch = null; 2498 Object backup = null; 2499 2500 // mode: 0=before [, 1=between [...], 2=after ] 2501 // lastItem: 0=none, 1=char, 2=set 2502 int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE; 2503 char op = 0; 2504 2505 boolean invert = false; 2506 2507 clear(); 2508 String lastString = null; 2509 2510 while (mode != MODE2_OUTBRACKET && !chars.atEnd()) { 2511 //Eclipse stated the following is "dead code" 2512 /* 2513 if (false) { 2514 // Debugging assertion 2515 if (!((lastItem == 0 && op == 0) || 2516 (lastItem == 1 && (op == 0 || op == '-')) || 2517 (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { 2518 throw new IllegalArgumentException(); 2519 } 2520 }*/ 2521 2522 int c = 0; 2523 boolean literal = false; 2524 UnicodeSet nested = null; 2525 2526 // -------- Check for property pattern 2527 2528 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 2529 int setMode = SETMODE0_NONE; 2530 if (resemblesPropertyPattern(chars, opts)) { 2531 setMode = SETMODE2_PROPERTYPAT; 2532 } 2533 2534 // -------- Parse '[' of opening delimiter OR nested set. 2535 // If there is a nested set, use `setMode' to define how 2536 // the set should be parsed. 
If the '[' is part of the 2537 // opening delimiter for this pattern, parse special 2538 // strings "[", "[^", "[-", and "[^-". Check for stand-in 2539 // characters representing a nested set in the symbol 2540 // table. 2541 2542 else { 2543 // Prepare to backup if necessary 2544 backup = chars.getPos(backup); 2545 c = chars.next(opts); 2546 literal = chars.isEscaped(); 2547 2548 if (c == '[' && !literal) { 2549 if (mode == MODE1_INBRACKET) { 2550 chars.setPos(backup); // backup 2551 setMode = SETMODE1_UNICODESET; 2552 } else { 2553 // Handle opening '[' delimiter 2554 mode = MODE1_INBRACKET; 2555 patBuf.append('['); 2556 backup = chars.getPos(backup); // prepare to backup 2557 c = chars.next(opts); 2558 literal = chars.isEscaped(); 2559 if (c == '^' && !literal) { 2560 invert = true; 2561 patBuf.append('^'); 2562 backup = chars.getPos(backup); // prepare to backup 2563 c = chars.next(opts); 2564 literal = chars.isEscaped(); 2565 } 2566 // Fall through to handle special leading '-'; 2567 // otherwise restart loop for nested [], \p{}, etc. 2568 if (c == '-') { 2569 literal = true; 2570 // Fall through to handle literal '-' below 2571 } else { 2572 chars.setPos(backup); // backup 2573 continue; 2574 } 2575 } 2576 } else if (symbols != null) { 2577 UnicodeMatcher m = symbols.lookupMatcher(c); // may be null 2578 if (m != null) { 2579 try { 2580 nested = (UnicodeSet) m; 2581 setMode = SETMODE3_PREPARSED; 2582 } catch (ClassCastException e) { 2583 syntaxError(chars, "Syntax error"); 2584 } 2585 } 2586 } 2587 } 2588 2589 // -------- Handle a nested set. This either is inline in 2590 // the pattern or represented by a stand-in that has 2591 // previously been parsed and was looked up in the symbol 2592 // table. 
2593 2594 if (setMode != SETMODE0_NONE) { 2595 if (lastItem == LAST1_RANGE) { 2596 if (op != 0) { 2597 syntaxError(chars, "Char expected after operator"); 2598 } 2599 add_unchecked(lastChar, lastChar); 2600 _appendToPat(patBuf, lastChar, false); 2601 lastItem = LAST0_START; 2602 op = 0; 2603 } 2604 2605 if (op == '-' || op == '&') { 2606 patBuf.append(op); 2607 } 2608 2609 if (nested == null) { 2610 if (scratch == null) scratch = new UnicodeSet(); 2611 nested = scratch; 2612 } 2613 switch (setMode) { 2614 case SETMODE1_UNICODESET: 2615 nested.applyPattern(chars, symbols, patBuf, options, depth + 1); 2616 break; 2617 case SETMODE2_PROPERTYPAT: 2618 chars.skipIgnored(opts); 2619 nested.applyPropertyPattern(chars, patBuf, symbols); 2620 break; 2621 case SETMODE3_PREPARSED: // `nested' already parsed 2622 nested._toPattern(patBuf, false); 2623 break; 2624 } 2625 2626 usePat = true; 2627 2628 if (mode == MODE0_NONE) { 2629 // Entire pattern is a category; leave parse loop 2630 set(nested); 2631 mode = MODE2_OUTBRACKET; 2632 break; 2633 } 2634 2635 switch (op) { 2636 case '-': 2637 removeAll(nested); 2638 break; 2639 case '&': 2640 retainAll(nested); 2641 break; 2642 case 0: 2643 addAll(nested); 2644 break; 2645 } 2646 2647 op = 0; 2648 lastItem = LAST2_SET; 2649 2650 continue; 2651 } 2652 2653 if (mode == MODE0_NONE) { 2654 syntaxError(chars, "Missing '['"); 2655 } 2656 2657 // -------- Parse special (syntax) characters. If the 2658 // current character is not special, or if it is escaped, 2659 // then fall through and handle it below. 
2660 2661 if (!literal) { 2662 switch (c) { 2663 case ']': 2664 if (lastItem == LAST1_RANGE) { 2665 add_unchecked(lastChar, lastChar); 2666 _appendToPat(patBuf, lastChar, false); 2667 } 2668 // Treat final trailing '-' as a literal 2669 if (op == '-') { 2670 add_unchecked(op, op); 2671 patBuf.append(op); 2672 } else if (op == '&') { 2673 syntaxError(chars, "Trailing '&'"); 2674 } 2675 patBuf.append(']'); 2676 mode = MODE2_OUTBRACKET; 2677 continue; 2678 case '-': 2679 if (op == 0) { 2680 if (lastItem != LAST0_START) { 2681 op = (char) c; 2682 continue; 2683 } else if (lastString != null) { 2684 op = (char) c; 2685 continue; 2686 } else { 2687 // Treat final trailing '-' as a literal 2688 add_unchecked(c, c); 2689 c = chars.next(opts); 2690 literal = chars.isEscaped(); 2691 if (c == ']' && !literal) { 2692 patBuf.append("-]"); 2693 mode = MODE2_OUTBRACKET; 2694 continue; 2695 } 2696 } 2697 } 2698 syntaxError(chars, "'-' not after char, string, or set"); 2699 break; 2700 case '&': 2701 if (lastItem == LAST2_SET && op == 0) { 2702 op = (char) c; 2703 continue; 2704 } 2705 syntaxError(chars, "'&' not after set"); 2706 break; 2707 case '^': 2708 syntaxError(chars, "'^' not after '['"); 2709 break; 2710 case '{': 2711 if (op != 0 && op != '-') { 2712 syntaxError(chars, "Missing operand after operator"); 2713 } 2714 if (lastItem == LAST1_RANGE) { 2715 add_unchecked(lastChar, lastChar); 2716 _appendToPat(patBuf, lastChar, false); 2717 } 2718 lastItem = LAST0_START; 2719 if (buf == null) { 2720 buf = new StringBuilder(); 2721 } else { 2722 buf.setLength(0); 2723 } 2724 boolean ok = false; 2725 while (!chars.atEnd()) { 2726 c = chars.next(opts); 2727 literal = chars.isEscaped(); 2728 if (c == '}' && !literal) { 2729 ok = true; 2730 break; 2731 } 2732 appendCodePoint(buf, c); 2733 } 2734 if (buf.length() < 1 || !ok) { 2735 syntaxError(chars, "Invalid multicharacter string"); 2736 } 2737 // We have new string. 
Add it to set and continue; 2738 // we don't need to drop through to the further 2739 // processing 2740 String curString = buf.toString(); 2741 if (op == '-') { 2742 int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString); 2743 int curSingle = CharSequences.getSingleCodePoint(curString); 2744 if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) { 2745 add(lastSingle,curSingle); 2746 } else { 2747 if (strings == EMPTY_STRINGS) { 2748 strings = new TreeSet<>(); 2749 } 2750 try { 2751 StringRange.expand(lastString, curString, true, strings); 2752 } catch (Exception e) { 2753 syntaxError(chars, e.getMessage()); 2754 } 2755 } 2756 lastString = null; 2757 op = 0; 2758 } else { 2759 add(curString); 2760 lastString = curString; 2761 } 2762 patBuf.append('{'); 2763 _appendToPat(patBuf, curString, false); 2764 patBuf.append('}'); 2765 continue; 2766 case SymbolTable.SYMBOL_REF: 2767 // symbols nosymbols 2768 // [a-$] error error (ambiguous) 2769 // [a$] anchor anchor 2770 // [a-$x] var "x"* literal '$' 2771 // [a-$.] error literal '$' 2772 // *We won't get here in the case of var "x" 2773 backup = chars.getPos(backup); 2774 c = chars.next(opts); 2775 literal = chars.isEscaped(); 2776 boolean anchor = (c == ']' && !literal); 2777 if (symbols == null && !anchor) { 2778 c = SymbolTable.SYMBOL_REF; 2779 chars.setPos(backup); 2780 break; // literal '$' 2781 } 2782 if (anchor && op == 0) { 2783 if (lastItem == LAST1_RANGE) { 2784 add_unchecked(lastChar, lastChar); 2785 _appendToPat(patBuf, lastChar, false); 2786 } 2787 add_unchecked(UnicodeMatcher.ETHER); 2788 usePat = true; 2789 patBuf.append(SymbolTable.SYMBOL_REF).append(']'); 2790 mode = MODE2_OUTBRACKET; 2791 continue; 2792 } 2793 syntaxError(chars, "Unquoted '$'"); 2794 break; 2795 default: 2796 break; 2797 } 2798 } 2799 2800 // -------- Parse literal characters. This includes both 2801 // escaped chars ("\u4E01") and non-syntax characters 2802 // ("a"). 
2803 2804 switch (lastItem) { 2805 case LAST0_START: 2806 if (op == '-' && lastString != null) { 2807 syntaxError(chars, "Invalid range"); 2808 } 2809 lastItem = LAST1_RANGE; 2810 lastChar = c; 2811 lastString = null; 2812 break; 2813 case LAST1_RANGE: 2814 if (op == '-') { 2815 if (lastString != null) { 2816 syntaxError(chars, "Invalid range"); 2817 } 2818 if (lastChar >= c) { 2819 // Don't allow redundant (a-a) or empty (b-a) ranges; 2820 // these are most likely typos. 2821 syntaxError(chars, "Invalid range"); 2822 } 2823 add_unchecked(lastChar, c); 2824 _appendToPat(patBuf, lastChar, false); 2825 patBuf.append(op); 2826 _appendToPat(patBuf, c, false); 2827 lastItem = LAST0_START; 2828 op = 0; 2829 } else { 2830 add_unchecked(lastChar, lastChar); 2831 _appendToPat(patBuf, lastChar, false); 2832 lastChar = c; 2833 } 2834 break; 2835 case LAST2_SET: 2836 if (op != 0) { 2837 syntaxError(chars, "Set expected after operator"); 2838 } 2839 lastChar = c; 2840 lastItem = LAST1_RANGE; 2841 break; 2842 } 2843 } 2844 2845 if (mode != MODE2_OUTBRACKET) { 2846 syntaxError(chars, "Missing ']'"); 2847 } 2848 2849 chars.skipIgnored(opts); 2850 2851 /** 2852 * Handle global flags (invert, case insensitivity). If this 2853 * pattern should be compiled case-insensitive, then we need 2854 * to close over case BEFORE COMPLEMENTING. This makes 2855 * patterns like /[^abc]/i work. 2856 */ 2857 if ((options & CASE) != 0) { 2858 closeOver(CASE); 2859 } 2860 if (invert) { 2861 complement(); 2862 } 2863 2864 // Use the rebuilt pattern (pat) only if necessary. Prefer the 2865 // generated pattern. 
if (usePat) {
            append(rebuiltPat, patBuf.toString());
        } else {
            appendNewPattern(rebuiltPat, false, true);
        }
    }

    /**
     * Reports a pattern syntax error. This method never returns normally.
     *
     * @param chars iterator whose {@code toString()} value is escaped with
     * {@code Utility.escape} and quoted in the message
     * @param msg description of the problem
     * @throws IllegalArgumentException always, with the message
     * {@code Error: <msg> at "<escaped chars>"}
     */
    private static void syntaxError(RuleCharacterIterator chars, String msg) {
        throw new IllegalArgumentException("Error: " + msg + " at \"" +
                Utility.escape(chars.toString()) +
                '"');
    }

    /**
     * Add the contents of the UnicodeSet (as strings) into a collection.
     * Delegates to the static {@code addAllTo(this, target)} helper.
     * @param <T> the concrete collection type
     * @param target collection to add into
     * @return the value returned by the static helper (presumably {@code target}
     * itself -- confirm against that helper)
     */
    public <T extends Collection<String>> T addAllTo(T target) {
        return addAllTo(this, target);
    }


    /**
     * Add the contents of the UnicodeSet (as strings) into an array.
     * Delegates to the static {@code addAllTo(this, target)} helper.
     * @param target array to add into
     * @return the value returned by the static helper (presumably {@code target}
     * itself -- confirm against that helper)
     * @hide unsupported on OHOS
     */
    public String[] addAllTo(String[] target) {
        return addAllTo(this, target);
    }

    /**
     * Add the contents of the UnicodeSet (as strings) into an array.
     * A new array of length {@code set.size()} is allocated and handed to the
     * static {@code addAllTo(set, array)} helper.
     * @param set the set whose contents are copied
     * @return the value returned by the static helper for the new array
     * @hide unsupported on OHOS
     */
    public static String[] toArray(UnicodeSet set) {
        return addAllTo(set, new String[set.size()]);
    }

    /**
     * Add the contents of the collection (as strings) into this UnicodeSet.
     * The collection must not contain null.
     * Simply forwards to {@link #addAll(Iterable)}.
     * @param source the collection to add
     * @return a reference to this object
     */
    public UnicodeSet add(Iterable<?> source) {
        return addAll(source);
    }

    /**
     * Add a collection (as strings) into this UnicodeSet.
     * Uses standard naming convention.
2918 * @param source collection to add into 2919 * @return a reference to this object 2920 */ addAll(Iterable<?> source)2921 public UnicodeSet addAll(Iterable<?> source) { 2922 checkFrozen(); 2923 for (Object o : source) { 2924 add(o.toString()); 2925 } 2926 return this; 2927 } 2928 2929 //---------------------------------------------------------------- 2930 // Implementation: Utility methods 2931 //---------------------------------------------------------------- 2932 nextCapacity(int minCapacity)2933 private int nextCapacity(int minCapacity) { 2934 // Grow exponentially to reduce the frequency of allocations. 2935 if (minCapacity < INITIAL_CAPACITY) { 2936 return minCapacity + INITIAL_CAPACITY; 2937 } else if (minCapacity <= 2500) { 2938 return 5 * minCapacity; 2939 } else { 2940 int newCapacity = 2 * minCapacity; 2941 if (newCapacity > MAX_LENGTH) { 2942 newCapacity = MAX_LENGTH; 2943 } 2944 return newCapacity; 2945 } 2946 } 2947 ensureCapacity(int newLen)2948 private void ensureCapacity(int newLen) { 2949 if (newLen > MAX_LENGTH) { 2950 newLen = MAX_LENGTH; 2951 } 2952 if (newLen <= list.length) return; 2953 int newCapacity = nextCapacity(newLen); 2954 int[] temp = new int[newCapacity]; 2955 // Copy only the actual contents. 2956 System.arraycopy(list, 0, temp, 0, len); 2957 list = temp; 2958 } 2959 ensureBufferCapacity(int newLen)2960 private void ensureBufferCapacity(int newLen) { 2961 if (newLen > MAX_LENGTH) { 2962 newLen = MAX_LENGTH; 2963 } 2964 if (buffer != null && newLen <= buffer.length) return; 2965 int newCapacity = nextCapacity(newLen); 2966 buffer = new int[newCapacity]; 2967 // The buffer has no contents to be copied. 2968 // It is always filled from scratch after this call. 2969 } 2970 2971 /** 2972 * Assumes start <= end. 
* <p>Stores {@code start} and {@code end+1} in the reusable {@code rangeList}
     * scratch array (allocated on first use as {@code {start, end+1, HIGH}}) and
     * returns that same array, so every call overwrites the previous result.
     */
    private int[] range(int start, int end) {
        if (rangeList == null) {
            rangeList = new int[] { start, end+1, HIGH };
        } else {
            rangeList[0] = start;
            rangeList[1] = end+1;
        }
        return rangeList;
    }

    //----------------------------------------------------------------
    // Implementation: Fundamental operations
    //----------------------------------------------------------------

    // polarity = 0, 3 is normal: x xor y
    // polarity = 1, 2: x xor ~y == x === y
    //
    // Merges the sorted boundary values of list[] and other[] into buffer[],
    // keeping every value that occurs in only one of the two inputs and
    // discarding values that occur in both. Both inputs are read up to their
    // HIGH terminator. Afterwards buffer and list are swapped, len is set to
    // the merged length and the cached pattern (pat) is cleared.
    // otherLen is only used to size the buffer.

    private UnicodeSet xor(int[] other, int otherLen, int polarity) {
        ensureBufferCapacity(len + otherLen);
        int i = 0, j = 0, k = 0;   // read cursors for list/other, write cursor for buffer
        int a = list[i++];
        int b;
        // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used
        //      so the following if statement will not be called.
        ///CLOVER:OFF
        if (polarity == 1 || polarity == 2) {
            b = LOW;
            if (other[j] == LOW) { // skip base if already LOW
                ++j;
                b = other[j];
            }
            ///CLOVER:ON
        } else {
            b = other[j++];
        }
        // simplest of all the routines
        // sort the values, discarding identicals!
        while (true) {
            if (a < b) {
                buffer[k++] = a;
                a = list[i++];
            } else if (b < a) {
                buffer[k++] = b;
                b = other[j++];
            } else if (a != HIGH) { // at this point, a == b
                // discard both values!
                a = list[i++];
                b = other[j++];
            } else { // DONE!
                buffer[k++] = HIGH;
                len = k;
                break;
            }
        }
        // swap list and buffer
        int[] temp = list;
        list = buffer;
        buffer = temp;
        pat = null;
        return this;
    }

    // polarity = 0 is normal: x union y
    // polarity = 2: x union ~y
    // polarity = 1: ~x union y
    // polarity = 3: ~x union ~y
    //
    // Merges list[] with other[] into buffer[]. The polarity value doubles as
    // the merge state: bit 1 is toggled each time a value is consumed from
    // list[], bit 2 each time one is consumed from other[]. The result is
    // terminated with HIGH, then buffer and list are swapped, len is updated
    // and the cached pattern (pat) is cleared.
    // otherLen is only used to size the buffer.

    private UnicodeSet add(int[] other, int otherLen, int polarity) {
        ensureBufferCapacity(len + otherLen);
        int i = 0, j = 0, k = 0;   // read cursors for list/other, write cursor for buffer
        int a = list[i++];
        int b = other[j++];
        // change from xor is that we have to check overlapping pairs
        // polarity bit 1 means a is second, bit 2 means b is.
        main:
            while (true) {
                switch (polarity) {
                case 0: // both first; take lower if unequal
                    if (a < b) { // take a
                        // Back up over overlapping ranges in buffer[]
                        if (k > 0 && a <= buffer[k-1]) {
                            // Pick latter end value in buffer[] vs. list[]
                            a = max(list[i], buffer[--k]);
                        } else {
                            // No overlap
                            buffer[k++] = a;
                            a = list[i];
                        }
                        i++; // Common if/else code factored out
                        polarity ^= 1;
                    } else if (b < a) { // take b
                        if (k > 0 && b <= buffer[k-1]) {
                            b = max(other[j], buffer[--k]);
                        } else {
                            buffer[k++] = b;
                            b = other[j];
                        }
                        j++;
                        polarity ^= 2;
                    } else { // a == b, take a, drop b
                        if (a == HIGH) break main;
                        // This is symmetrical; it doesn't matter if
                        // we backtrack with a or b. - liu
                        if (k > 0 && a <= buffer[k-1]) {
                            a = max(list[i], buffer[--k]);
                        } else {
                            // No overlap
                            buffer[k++] = a;
                            a = list[i];
                        }
                        i++;
                        polarity ^= 1;
                        b = other[j++]; polarity ^= 2;
                    }
                    break;
                case 3: // both second; take higher if unequal, and drop other
                    if (b <= a) { // take a
                        if (a == HIGH) break main;
                        buffer[k++] = a;
                    } else { // take b
                        if (b == HIGH) break main;
                        buffer[k++] = b;
                    }
                    a = list[i++]; polarity ^= 1;   // factored common code
                    b = other[j++]; polarity ^= 2;
                    break;
                case 1: // a second, b first; if b < a, overlap
                    if (a < b) { // no overlap, take a
                        buffer[k++] = a; a = list[i++]; polarity ^= 1;
                    } else if (b < a) { // OVERLAP, drop b
                        b = other[j++]; polarity ^= 2;
                    } else { // a == b, drop both!
                        if (a == HIGH) break main;
                        a = list[i++]; polarity ^= 1;
                        b = other[j++]; polarity ^= 2;
                    }
                    break;
                case 2: // a first, b second; if a < b, overlap
                    if (b < a) { // no overlap, take b
                        buffer[k++] = b; b = other[j++]; polarity ^= 2;
                    } else  if (a < b) { // OVERLAP, drop a
                        a = list[i++]; polarity ^= 1;
                    } else { // a == b, drop both!
                        if (a == HIGH) break main;
                        a = list[i++]; polarity ^= 1;
                        b = other[j++]; polarity ^= 2;
                    }
                    break;
                }
            }
        buffer[k++] = HIGH;    // terminate
        len = k;
        // swap list and buffer
        int[] temp = list;
        list = buffer;
        buffer = temp;
        pat = null;
        return this;
    }

    // polarity = 0 is normal: x intersect y
    // polarity = 2: x intersect ~y == set-minus
    // polarity = 1: ~x intersect y
    // polarity = 3: ~x intersect ~y
    //
    // Same merge skeleton as add(int[], int, int): polarity is the running
    // state (bit 1 toggles per value consumed from list[], bit 2 per value
    // consumed from other[]), the output is terminated with HIGH, buffer and
    // list are swapped, len is updated and the cached pattern (pat) is cleared.
    // otherLen is only used to size the buffer.

    private UnicodeSet retain(int[] other, int otherLen, int polarity) {
        ensureBufferCapacity(len + otherLen);
        int i = 0, j = 0, k = 0;   // read cursors for list/other, write cursor for buffer
        int a = list[i++];
        int b = other[j++];
        // change from xor is that we have to check overlapping pairs
        // polarity bit 1 means a is second, bit 2 means b is.
        main:
            while (true) {
                switch (polarity) {
                case 0: // both first; drop the smaller
                    if (a < b) { // drop a
                        a = list[i++]; polarity ^= 1;
                    } else if (b < a) { // drop b
                        b = other[j++]; polarity ^= 2;
                    } else { // a == b, take one, drop other
                        if (a == HIGH) break main;
                        buffer[k++] = a; a = list[i++]; polarity ^= 1;
                        b = other[j++]; polarity ^= 2;
                    }
                    break;
                case 3: // both second; take lower if unequal
                    if (a < b) { // take a
                        buffer[k++] = a; a = list[i++]; polarity ^= 1;
                    } else if (b < a) { // take b
                        buffer[k++] = b; b = other[j++]; polarity ^= 2;
                    } else { // a == b, take one, drop other
                        if (a == HIGH) break main;
                        buffer[k++] = a; a = list[i++]; polarity ^= 1;
                        b = other[j++]; polarity ^= 2;
                    }
                    break;
                case 1: // a second, b first;
                    if (a < b) { // NO OVERLAP, drop a
                        a = list[i++]; polarity ^= 1;
                    } else if (b < a) { // OVERLAP, take b
                        buffer[k++] = b; b = other[j++]; polarity ^= 2;
                    } else { // a == b, drop both!
                        if (a == HIGH) break main;
                        a = list[i++]; polarity ^= 1;
                        b = other[j++]; polarity ^= 2;
                    }
                    break;
                case 2: // a first, b second; if a < b, overlap
                    if (b < a) { // no overlap, drop b
                        b = other[j++]; polarity ^= 2;
                    } else  if (a < b) { // OVERLAP, take a
                        buffer[k++] = a; a = list[i++]; polarity ^= 1;
                    } else { // a == b, drop both!
                        if (a == HIGH) break main;
                        a = list[i++]; polarity ^= 1;
                        b = other[j++]; polarity ^= 2;
                    }
                    break;
                }
            }
        buffer[k++] = HIGH;    // terminate
        len = k;
        // swap list and buffer
        int[] temp = list;
        list = buffer;
        buffer = temp;
        pat = null;
        return this;
    }

    // Returns the larger of the two ints.
    private static final int max(int a, int b) {
        return (a > b) ? a : b;
    }

    //----------------------------------------------------------------
    // Generic filter-based scanning code
    //----------------------------------------------------------------

    // Predicate over code points, consumed by applyFilter().
    private static interface Filter {
        boolean contains(int codePoint);
    }

    // Accepts code points whose UCharacter.getUnicodeNumericValue() compares
    // equal (double ==) to the configured value.
    private static final class NumericValueFilter implements Filter {
        double value;
        NumericValueFilter(double value) { this.value = value; }
        @Override
        public boolean contains(int ch) {
            return UCharacter.getUnicodeNumericValue(ch) == value;
        }
    }

    // Accepts code points whose type bit, (1 << UCharacter.getType(ch)),
    // is set in the configured mask.
    private static final class GeneralCategoryMaskFilter implements Filter {
        int mask;
        GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
        @Override
        public boolean contains(int ch) {
            return ((1 << UCharacter.getType(ch)) & mask) != 0;
        }
    }

    // Accepts code points for which UCharacter.getIntPropertyValue(ch, prop)
    // equals the configured value.
    private static final class IntPropertyFilter implements Filter {
        int prop;
        int value;
        IntPropertyFilter(int prop, int value) {
            this.prop = prop;
            this.value = value;
        }
        @Override
        public boolean contains(int ch) {
            return UCharacter.getIntPropertyValue(ch, prop) == value;
        }
    }

    // Accepts code points for which UScript.hasScript(c, script) is true.
    private static final class ScriptExtensionsFilter implements Filter {
        int script;
        ScriptExtensionsFilter(int script) { this.script = script; }
        @Override
        public boolean contains(int c) {
            return UScript.hasScript(c, script);
        }
    }

    // VersionInfo for unassigned characters
    private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);

    // Accepts code points whose UCharacter.getAge() is not NO_VERSION and
    // compares less than or equal to the configured version.
    private static final class VersionFilter implements Filter {
        VersionInfo version;
        VersionFilter(VersionInfo version) { this.version = version; }
        @Override
        public boolean contains(int ch) {
            VersionInfo v = UCharacter.getAge(ch);
            // Reference comparison ok; VersionInfo caches and reuses
            // unique objects.
            return !Utility.sameObjects(v, NO_VERSION) &&
                    v.compareTo(version) <= 0;
        }
    }

    /**
     * Generic filter-based scanning code for UCD property UnicodeSets.
     * Clears this set, then adds every run of code points accepted by the
     * filter. Only code points inside the ranges of {@code inclusions} are
     * tested; a run that is still open after the last inclusion range is
     * extended to U+10FFFF.
     *
     * @param filter predicate deciding membership
     * @param inclusions set whose ranges are scanned
     */
    private void applyFilter(Filter filter, UnicodeSet inclusions) {
        // Logically, walk through all Unicode characters, noting the start
        // and end of each range for which filter.contains(c) is
        // true.  Add each range to a set.
        //
        // To improve performance, use an inclusions set which
        // encodes information about character ranges that are known
        // to have identical properties.
        // inclusions contains the first characters of
        // same-value ranges for the given property.

        clear();

        // Start of the run currently being collected, or -1 when no run is open.
        int startHasProperty = -1;
        int limitRange = inclusions.getRangeCount();

        for (int j=0; j<limitRange; ++j) {
            // get current range
            int start = inclusions.getRangeStart(j);
            int end = inclusions.getRangeEnd(j);

            // for all the code points in the range, process
            for (int ch = start; ch <= end; ++ch) {
                // only add to the unicodeset on inflection points --
                // where the hasProperty value changes to false
                if (filter.contains(ch)) {
                    if (startHasProperty < 0) {
                        startHasProperty = ch;
                    }
                } else if (startHasProperty >= 0) {
                    add_unchecked(startHasProperty, ch-1);
                    startHasProperty = -1;
                }
            }
        }
        // Close a run that was still open when the scan ended.
        if (startHasProperty >= 0) {
            add_unchecked(startHasProperty, 0x10FFFF);
        }
    }

    /**
     * Remove leading and trailing Pattern_White_Space and compress
     * internal Pattern_White_Space to a single space character.
     *
     * @param source the name to normalize
     * @return the trimmed {@code source} itself when it contains no internal
     * white space, otherwise a new normalized string
     */
    private static String mungeCharName(String source) {
        source = PatternProps.trimWhiteSpace(source);
        // Allocated lazily, on the first internal white space character.
        StringBuilder buf = null;
        for (int i=0; i<source.length(); ++i) {
            char ch = source.charAt(i);
            if (PatternProps.isWhiteSpace(ch)) {
                if (buf == null) {
                    buf = new StringBuilder().append(source, 0, i);
                } else if (buf.charAt(buf.length() - 1) == ' ') {
                    // Previous output char is already a space: drop this one.
                    continue;
                }
                ch = ' '; // convert to ' '
            }
            if (buf != null) {
                buf.append(ch);
            }
        }
        return buf == null ? source : buf.toString();
    }

    //----------------------------------------------------------------
    // Property set API
    //----------------------------------------------------------------

    /**
     * Modifies this set to contain those code points which have the
     * given value for the given binary or enumerated property, as
     * returned by UCharacter.getIntPropertyValue.  Prior contents of
     * this set are lost.
3349 * 3350 * @param prop a property in the range 3351 * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or 3352 * UProperty.INT_START..UProperty.INT_LIMIT-1 or. 3353 * UProperty.MASK_START..UProperty.MASK_LIMIT-1. 3354 * 3355 * @param value a value in the range 3356 * UCharacter.getIntPropertyMinValue(prop).. 3357 * UCharacter.getIntPropertyMaxValue(prop), with one exception. 3358 * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be 3359 * a UCharacter.getType() result, but rather a mask value produced 3360 * by logically ORing (1 << UCharacter.getType()) values together. 3361 * This allows grouped categories such as [:L:] to be represented. 3362 * 3363 * @return a reference to this set 3364 */ applyIntPropertyValue(int prop, int value)3365 public UnicodeSet applyIntPropertyValue(int prop, int value) { 3366 // All of the following include checkFrozen() before modifying this set. 3367 if (prop == UProperty.GENERAL_CATEGORY_MASK) { 3368 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3369 applyFilter(new GeneralCategoryMaskFilter(value), inclusions); 3370 } else if (prop == UProperty.SCRIPT_EXTENSIONS) { 3371 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3372 applyFilter(new ScriptExtensionsFilter(value), inclusions); 3373 } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) { 3374 if (value == 0 || value == 1) { 3375 set(CharacterProperties.getBinaryPropertySet(prop)); 3376 if (value == 0) { 3377 complement(); 3378 } 3379 } else { 3380 clear(); 3381 } 3382 } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) { 3383 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3384 applyFilter(new IntPropertyFilter(prop, value), inclusions); 3385 } else { 3386 throw new IllegalArgumentException("unsupported property " + prop); 3387 } 3388 return this; 3389 } 3390 3391 3392 3393 /** 3394 * Modifies this set to contain those code points which have the 
* given value for the given property.  Prior contents of this
     * set are lost.
     *
     * <p>Equivalent to calling the three-argument overload with a
     * {@code null} symbol table.
     *
     * @param propertyAlias a property alias, either short or long.
     * The name is matched loosely.  See PropertyAliases.txt for names
     * and a description of loose matching.  If the value string is
     * empty, then this string is interpreted as either a
     * General_Category value alias, a Script value alias, a binary
     * property alias, or a special ID.  Special IDs are matched
     * loosely and correspond to the following sets:
     *
     * "ANY" = [\\u0000-\\U0010FFFF],
     * "ASCII" = [\\u0000-\\u007F].
     *
     * @param valueAlias a value alias, either short or long.  The
     * name is matched loosely.  See PropertyValueAliases.txt for
     * names and a description of loose matching.  In addition to
     * aliases listed, numeric values and canonical combining classes
     * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc",
     * "220").  The value string may also be empty, but must not be null.
     *
     * @return a reference to this set
     * @throws IllegalArgumentException if the alias pair cannot be resolved
     * to a supported property and value
     */
    public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) {
        return applyPropertyAlias(propertyAlias, valueAlias, null);
    }

    /**
     * Modifies this set to contain those code points which have the
     * given value for the given property.  Prior contents of this
     * set are lost.
     * @param propertyAlias A string of the property alias.
     * @param valueAlias A string of the value alias.
     * @param symbols if not null, then symbols are first called to see if a property
     * is available.  If true, then everything else is skipped.
3430 * @return this set 3431 */ applyPropertyAlias(String propertyAlias, String valueAlias, SymbolTable symbols)3432 public UnicodeSet applyPropertyAlias(String propertyAlias, 3433 String valueAlias, SymbolTable symbols) { 3434 checkFrozen(); 3435 int p; 3436 int v; 3437 boolean invert = false; 3438 3439 if (symbols != null 3440 && (symbols instanceof XSymbolTable) 3441 && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) { 3442 return this; 3443 } 3444 3445 if (XSYMBOL_TABLE != null) { 3446 if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) { 3447 return this; 3448 } 3449 } 3450 3451 if (valueAlias.length() > 0) { 3452 p = UCharacter.getPropertyEnum(propertyAlias); 3453 3454 // Treat gc as gcm 3455 if (p == UProperty.GENERAL_CATEGORY) { 3456 p = UProperty.GENERAL_CATEGORY_MASK; 3457 } 3458 3459 if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) || 3460 (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) || 3461 (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) { 3462 try { 3463 v = UCharacter.getPropertyValueEnum(p, valueAlias); 3464 } catch (IllegalArgumentException e) { 3465 // Handle numeric CCC 3466 if (p == UProperty.CANONICAL_COMBINING_CLASS || 3467 p == UProperty.LEAD_CANONICAL_COMBINING_CLASS || 3468 p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) { 3469 v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias)); 3470 // Anything between 0 and 255 is valid even if unused. 3471 if (v < 0 || v > 255) throw e; 3472 } else { 3473 throw e; 3474 } 3475 } 3476 } 3477 3478 else { 3479 switch (p) { 3480 case UProperty.NUMERIC_VALUE: 3481 { 3482 double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias)); 3483 applyFilter(new NumericValueFilter(value), 3484 CharacterPropertiesImpl.getInclusionsForProperty(p)); 3485 return this; 3486 } 3487 case UProperty.NAME: 3488 { 3489 // Must munge name, since 3490 // UCharacter.charFromName() does not do 3491 // 'loose' matching. 
3492 String buf = mungeCharName(valueAlias); 3493 int ch = UCharacter.getCharFromExtendedName(buf); 3494 if (ch == -1) { 3495 throw new IllegalArgumentException("Invalid character name"); 3496 } 3497 clear(); 3498 add_unchecked(ch); 3499 return this; 3500 } 3501 case UProperty.UNICODE_1_NAME: 3502 // ICU 49 deprecates the Unicode_1_Name property APIs. 3503 throw new IllegalArgumentException("Unicode_1_Name (na1) not supported"); 3504 case UProperty.AGE: 3505 { 3506 // Must munge name, since 3507 // VersionInfo.getInstance() does not do 3508 // 'loose' matching. 3509 VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias)); 3510 applyFilter(new VersionFilter(version), 3511 CharacterPropertiesImpl.getInclusionsForProperty(p)); 3512 return this; 3513 } 3514 case UProperty.SCRIPT_EXTENSIONS: 3515 v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias); 3516 // fall through to calling applyIntPropertyValue() 3517 break; 3518 default: 3519 // p is a non-binary, non-enumerated property that we 3520 // don't support (yet). 3521 throw new IllegalArgumentException("Unsupported property"); 3522 } 3523 } 3524 } 3525 3526 else { 3527 // valueAlias is empty. Interpret as General Category, Script, 3528 // Binary property, or ANY or ASCII. Upon success, p and v will 3529 // be set. 
3530 UPropertyAliases pnames = UPropertyAliases.INSTANCE; 3531 p = UProperty.GENERAL_CATEGORY_MASK; 3532 v = pnames.getPropertyValueEnum(p, propertyAlias); 3533 if (v == UProperty.UNDEFINED) { 3534 p = UProperty.SCRIPT; 3535 v = pnames.getPropertyValueEnum(p, propertyAlias); 3536 if (v == UProperty.UNDEFINED) { 3537 p = pnames.getPropertyEnum(propertyAlias); 3538 if (p == UProperty.UNDEFINED) { 3539 p = -1; 3540 } 3541 if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) { 3542 v = 1; 3543 } else if (p == -1) { 3544 if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) { 3545 set(MIN_VALUE, MAX_VALUE); 3546 return this; 3547 } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) { 3548 set(0, 0x7F); 3549 return this; 3550 } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) { 3551 // [:Assigned:]=[:^Cn:] 3552 p = UProperty.GENERAL_CATEGORY_MASK; 3553 v = (1<<UCharacter.UNASSIGNED); 3554 invert = true; 3555 } else { 3556 // Property name was never matched. 3557 throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias); 3558 } 3559 } else { 3560 // Valid propery name, but it isn't binary, so the value 3561 // must be supplied. 3562 throw new IllegalArgumentException("Missing property value"); 3563 } 3564 } 3565 } 3566 } 3567 3568 applyIntPropertyValue(p, v); 3569 if(invert) { 3570 complement(); 3571 } 3572 3573 return this; 3574 } 3575 3576 //---------------------------------------------------------------- 3577 // Property set patterns 3578 //---------------------------------------------------------------- 3579 3580 /** 3581 * Return true if the given position, in the given pattern, appears 3582 * to be the start of a property set pattern. 
3583 */ resemblesPropertyPattern(String pattern, int pos)3584 private static boolean resemblesPropertyPattern(String pattern, int pos) { 3585 // Patterns are at least 5 characters long 3586 if ((pos+5) > pattern.length()) { 3587 return false; 3588 } 3589 3590 // Look for an opening [:, [:^, \p, or \P 3591 return pattern.regionMatches(pos, "[:", 0, 2) || 3592 pattern.regionMatches(true, pos, "\\p", 0, 2) || 3593 pattern.regionMatches(pos, "\\N", 0, 2); 3594 } 3595 3596 /** 3597 * Return true if the given iterator appears to point at a 3598 * property pattern. Regardless of the result, return with the 3599 * iterator unchanged. 3600 * @param chars iterator over the pattern characters. Upon return 3601 * it will be unchanged. 3602 * @param iterOpts RuleCharacterIterator options 3603 */ resemblesPropertyPattern(RuleCharacterIterator chars, int iterOpts)3604 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, 3605 int iterOpts) { 3606 boolean result = false; 3607 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; 3608 Object pos = chars.getPos(null); 3609 int c = chars.next(iterOpts); 3610 if (c == '[' || c == '\\') { 3611 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); 3612 result = (c == '[') ? (d == ':') : 3613 (d == 'N' || d == 'p' || d == 'P'); 3614 } 3615 chars.setPos(pos); 3616 return result; 3617 } 3618 3619 /** 3620 * Parse the given property pattern at the given parse position. 3621 * @param symbols TODO 3622 */ applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols)3623 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { 3624 int pos = ppos.getIndex(); 3625 3626 // On entry, ppos should point to one of the following locations: 3627 3628 // Minimum length is 5 characters, e.g. 
\p{L} 3629 if ((pos+5) > pattern.length()) { 3630 return null; 3631 } 3632 3633 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 3634 boolean isName = false; // true for \N{pat}, o/w false 3635 boolean invert = false; 3636 3637 // Look for an opening [:, [:^, \p, or \P 3638 if (pattern.regionMatches(pos, "[:", 0, 2)) { 3639 posix = true; 3640 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3641 if (pos < pattern.length() && pattern.charAt(pos) == '^') { 3642 ++pos; 3643 invert = true; 3644 } 3645 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || 3646 pattern.regionMatches(pos, "\\N", 0, 2)) { 3647 char c = pattern.charAt(pos+1); 3648 invert = (c == 'P'); 3649 isName = (c == 'N'); 3650 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3651 if (pos == pattern.length() || pattern.charAt(pos++) != '{') { 3652 // Syntax error; "\p" or "\P" not followed by "{" 3653 return null; 3654 } 3655 } else { 3656 // Open delimiter not seen 3657 return null; 3658 } 3659 3660 // Look for the matching close delimiter, either :] or } 3661 int close = pattern.indexOf(posix ? ":]" : "}", pos); 3662 if (close < 0) { 3663 // Syntax error; close delimiter missing 3664 return null; 3665 } 3666 3667 // Look for an '=' sign. If this is present, we will parse a 3668 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 3669 // pattern. 
3670 int equals = pattern.indexOf('=', pos); 3671 String propName, valueName; 3672 if (equals >= 0 && equals < close && !isName) { 3673 // Equals seen; parse medium/long pattern 3674 propName = pattern.substring(pos, equals); 3675 valueName = pattern.substring(equals+1, close); 3676 } 3677 3678 else { 3679 // Handle case where no '=' is seen, and \N{} 3680 propName = pattern.substring(pos, close); 3681 valueName = ""; 3682 3683 // Handle \N{name} 3684 if (isName) { 3685 // This is a little inefficient since it means we have to 3686 // parse "na" back to UProperty.NAME even though we already 3687 // know it's UProperty.NAME. If we refactor the API to 3688 // support args of (int, String) then we can remove 3689 // "na" and make this a little more efficient. 3690 valueName = propName; 3691 propName = "na"; 3692 } 3693 } 3694 3695 applyPropertyAlias(propName, valueName, symbols); 3696 3697 if (invert) { 3698 complement(); 3699 } 3700 3701 // Move to the limit position after the close delimiter 3702 ppos.setIndex(close + (posix ? 2 : 1)); 3703 3704 return this; 3705 } 3706 3707 /** 3708 * Parse a property pattern. 3709 * @param chars iterator over the pattern characters. Upon return 3710 * it will be advanced to the first character after the parsed 3711 * pattern, or the end of the iteration if all characters are 3712 * parsed. 3713 * @param rebuiltPat the pattern that was parsed, rebuilt or 3714 * copied from the input pattern, as appropriate. 
3715 * @param symbols TODO 3716 */ applyPropertyPattern(RuleCharacterIterator chars, Appendable rebuiltPat, SymbolTable symbols)3717 private void applyPropertyPattern(RuleCharacterIterator chars, 3718 Appendable rebuiltPat, SymbolTable symbols) { 3719 String patStr = chars.lookahead(); 3720 ParsePosition pos = new ParsePosition(0); 3721 applyPropertyPattern(patStr, pos, symbols); 3722 if (pos.getIndex() == 0) { 3723 syntaxError(chars, "Invalid property pattern"); 3724 } 3725 chars.jumpahead(pos.getIndex()); 3726 append(rebuiltPat, patStr.substring(0, pos.getIndex())); 3727 } 3728 3729 //---------------------------------------------------------------- 3730 // Case folding API 3731 //---------------------------------------------------------------- 3732 3733 /** 3734 * Bitmask for constructor and applyPattern() indicating that 3735 * white space should be ignored. If set, ignore Unicode Pattern_White_Space characters, 3736 * unless they are quoted or escaped. This may be ORed together 3737 * with other selectors. 3738 */ 3739 public static final int IGNORE_SPACE = 1; 3740 3741 /** 3742 * Bitmask for constructor, applyPattern(), and closeOver() 3743 * indicating letter case. This may be ORed together with other 3744 * selectors. 3745 * 3746 * Enable case insensitive matching. E.g., "[ab]" with this flag 3747 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3748 * match all except 'a', 'A', 'b', and 'B'. This performs a full 3749 * closure over case mappings, e.g. U+017F for s. 3750 * 3751 * The resulting set is a superset of the input for the code points but 3752 * not for the strings. 3753 * It performs a case mapping closure of the code points and adds 3754 * full case folding strings for the code points, and reduces strings of 3755 * the original set to their full case folding equivalents. 3756 * 3757 * This is designed for case-insensitive matches, for example 3758 * in regular expressions. 
* The full code point case closure allows checking of
     * an input character directly against the closure set.
     * Strings are matched by comparing the case-folded form from the closure
     * set with an incremental case folding of the string in question.
     *
     * The closure set will also contain single code points if the original
     * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
     * This is not necessary (that is, redundant) for the above matching method
     * but results in the same closure sets regardless of whether the original
     * set contained the code point or a string.
     */
    public static final int CASE = 2;

    /**
     * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
     * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
     * Has the same numeric value (2) as CASE, so the two constants are
     * interchangeable in option bitmasks.
     * @see #CASE
     */
    public static final int CASE_INSENSITIVE = 2;

    /**
     * Bitmask for constructor, applyPattern(), and closeOver()
     * indicating letter case.  This may be ORed together with other
     * selectors.
     *
     * Enable case insensitive matching.  E.g., "[ab]" with this flag
     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
     * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
     * title-, and uppercase mappings as well as the case folding
     * of each existing element in the set.
3788 */ 3789 public static final int ADD_CASE_MAPPINGS = 4; 3790 3791 // add the result of a full case mapping to the set 3792 // use str as a temporary string to avoid constructing one addCaseMapping(UnicodeSet set, int result, StringBuilder full)3793 private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) { 3794 if(result >= 0) { 3795 if(result > UCaseProps.MAX_STRING_LENGTH) { 3796 // add a single-code point case mapping 3797 set.add(result); 3798 } else { 3799 // add a string case mapping from full with length result 3800 set.add(full.toString()); 3801 full.setLength(0); 3802 } 3803 } 3804 // result < 0: the code point mapped to itself, no need to add it 3805 // see UCaseProps 3806 } 3807 3808 /** 3809 * Close this set over the given attribute. For the attribute 3810 * CASE, the result is to modify this set so that: 3811 * 3812 * 1. For each character or string 'a' in this set, all strings 3813 * 'b' such that foldCase(a) == foldCase(b) are added to this set. 3814 * (For most 'a' that are single characters, 'b' will have 3815 * b.length() == 1.) 3816 * 3817 * 2. For each string 'e' in the resulting set, if e != 3818 * foldCase(e), 'e' will be removed. 3819 * 3820 * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] 3821 * 3822 * (Here foldCase(x) refers to the operation 3823 * UCharacter.foldCase(x, true), and a == b actually denotes 3824 * a.equals(b), not pointer comparison.) 3825 * 3826 * @param attribute bitmask for attributes to close over. 3827 * Currently only the CASE bit is supported. Any undefined bits 3828 * are ignored. 3829 * @return a reference to this set. 
3830 */ closeOver(int attribute)3831 public UnicodeSet closeOver(int attribute) { 3832 checkFrozen(); 3833 if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) { 3834 UCaseProps csp = UCaseProps.INSTANCE; 3835 UnicodeSet foldSet = new UnicodeSet(this); 3836 ULocale root = ULocale.ROOT; 3837 3838 // start with input set to guarantee inclusion 3839 // CASE: remove strings because the strings will actually be reduced (folded); 3840 // therefore, start with no strings and add only those needed 3841 if((attribute & CASE) != 0 && foldSet.hasStrings()) { 3842 foldSet.strings.clear(); 3843 } 3844 3845 int n = getRangeCount(); 3846 int result; 3847 StringBuilder full = new StringBuilder(); 3848 3849 for (int i=0; i<n; ++i) { 3850 int start = getRangeStart(i); 3851 int end = getRangeEnd(i); 3852 3853 if((attribute & CASE) != 0) { 3854 // full case closure 3855 for (int cp=start; cp<=end; ++cp) { 3856 csp.addCaseClosure(cp, foldSet); 3857 } 3858 } else { 3859 // add case mappings 3860 // (does not add long s for regular s, or Kelvin for k, for example) 3861 for (int cp=start; cp<=end; ++cp) { 3862 result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); 3863 addCaseMapping(foldSet, result, full); 3864 3865 result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); 3866 addCaseMapping(foldSet, result, full); 3867 3868 result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); 3869 addCaseMapping(foldSet, result, full); 3870 3871 result = csp.toFullFolding(cp, full, 0); 3872 addCaseMapping(foldSet, result, full); 3873 } 3874 } 3875 } 3876 if (hasStrings()) { 3877 if ((attribute & CASE) != 0) { 3878 for (String s : strings) { 3879 String str = UCharacter.foldCase(s, 0); 3880 if(!csp.addStringCaseClosure(str, foldSet)) { 3881 foldSet.add(str); // does not map to code points: add the folded string itself 3882 } 3883 } 3884 } else { 3885 BreakIterator bi = BreakIterator.getWordInstance(root); 3886 for (String str : strings) { 3887 // TODO: call lower-level functions 
3888 foldSet.add(UCharacter.toLowerCase(root, str)); 3889 foldSet.add(UCharacter.toTitleCase(root, str, bi)); 3890 foldSet.add(UCharacter.toUpperCase(root, str)); 3891 foldSet.add(UCharacter.foldCase(str, 0)); 3892 } 3893 } 3894 } 3895 set(foldSet); 3896 } 3897 return this; 3898 } 3899 3900 /** 3901 * Internal class for customizing UnicodeSet parsing of properties. 3902 * TODO: extend to allow customizing of codepoint ranges 3903 * @author medavis 3904 * @hide exposed on OHOS 3905 * @hide draft / provisional / internal are hidden on OHOS 3906 */ 3907 abstract public static class XSymbolTable implements SymbolTable { 3908 /** 3909 * Default constructor 3910 * @hide draft / provisional / internal are hidden on OHOS 3911 */ XSymbolTable()3912 public XSymbolTable(){} 3913 /** 3914 * Supplies default implementation for SymbolTable (no action). 3915 * @hide draft / provisional / internal are hidden on OHOS 3916 */ 3917 @Override lookupMatcher(int i)3918 public UnicodeMatcher lookupMatcher(int i) { 3919 return null; 3920 } 3921 3922 /** 3923 * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style 3924 * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be. 3925 * <p> 3926 * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a 3927 * propertyName+propertyValue combination. 3928 * 3929 * @param propertyName 3930 * the name of the property 3931 * @param propertyValue 3932 * the name of the property value 3933 * @param result UnicodeSet value to change 3934 * a set to which the characters having the propertyName+propertyValue are to be added. 3935 * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters 3936 * with that property have been added to the UnicodeSet, and returns false if the 3937 * propertyName+propertyValue combination is not recognized (in which case result is unaltered). 
3938 * @hide draft / provisional / internal are hidden on OHOS 3939 */ applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result)3940 public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { 3941 return false; 3942 } 3943 /** 3944 * Supplies default implementation for SymbolTable (no action). 3945 * @hide draft / provisional / internal are hidden on OHOS 3946 */ 3947 @Override lookup(String s)3948 public char[] lookup(String s) { 3949 return null; 3950 } 3951 /** 3952 * Supplies default implementation for SymbolTable (no action). 3953 * @hide draft / provisional / internal are hidden on OHOS 3954 */ 3955 @Override parseReference(String text, ParsePosition pos, int limit)3956 public String parseReference(String text, ParsePosition pos, int limit) { 3957 return null; 3958 } 3959 } 3960 3961 /** 3962 * Is this frozen, according to the Freezable interface? 3963 * 3964 * @return value 3965 */ 3966 @Override isFrozen()3967 public boolean isFrozen() { 3968 return (bmpSet != null || stringSpan != null); 3969 } 3970 3971 /** 3972 * Freeze this class, according to the Freezable interface. 3973 * 3974 * @return this 3975 */ 3976 @Override freeze()3977 public UnicodeSet freeze() { 3978 if (!isFrozen()) { 3979 compact(); 3980 3981 // Optimize contains() and span() and similar functions. 3982 if (hasStrings()) { 3983 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), UnicodeSetStringSpan.ALL); 3984 } 3985 if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { 3986 // Optimize for code point spans. 3987 // There are no strings, or 3988 // all strings are irrelevant for span() etc. because 3989 // all of each string's code points are contained in this set. 3990 // However, fully contained strings are relevant for spanAndCount(), 3991 // so we create both objects. 
3992 bmpSet = new BMPSet(list, len); 3993 } 3994 } 3995 return this; 3996 } 3997 3998 /** 3999 * Span a string using this UnicodeSet. 4000 * <p>To replace, count elements, or delete spans, see {@link ohos.global.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4001 * @param s The string to be spanned 4002 * @param spanCondition The span condition 4003 * @return the length of the span 4004 */ span(CharSequence s, SpanCondition spanCondition)4005 public int span(CharSequence s, SpanCondition spanCondition) { 4006 return span(s, 0, spanCondition); 4007 } 4008 4009 /** 4010 * Span a string using this UnicodeSet. 4011 * If the start index is less than 0, span will start from 0. 4012 * If the start index is greater than the string length, span returns the string length. 4013 * <p>To replace, count elements, or delete spans, see {@link ohos.global.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4014 * @param s The string to be spanned 4015 * @param start The start index that the span begins 4016 * @param spanCondition The span condition 4017 * @return the string index which ends the span (i.e. exclusive) 4018 */ span(CharSequence s, int start, SpanCondition spanCondition)4019 public int span(CharSequence s, int start, SpanCondition spanCondition) { 4020 int end = s.length(); 4021 if (start < 0) { 4022 start = 0; 4023 } else if (start >= end) { 4024 return end; 4025 } 4026 if (bmpSet != null) { 4027 // Frozen set without strings, or no string is relevant for span(). 4028 return bmpSet.span(s, start, spanCondition, null); 4029 } 4030 if (stringSpan != null) { 4031 return stringSpan.span(s, start, spanCondition); 4032 } else if (hasStrings()) { 4033 int which = spanCondition == SpanCondition.NOT_CONTAINED ? 
UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4034 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4035 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4036 if (strSpan.needsStringSpanUTF16()) { 4037 return strSpan.span(s, start, spanCondition); 4038 } 4039 } 4040 4041 return spanCodePointsAndCount(s, start, spanCondition, null); 4042 } 4043 4044 /** 4045 * Same as span() but also counts the smallest number of set elements on any path across the span. 4046 * <p>To replace, count elements, or delete spans, see {@link ohos.global.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4047 * @param outCount An output-only object (must not be null) for returning the count. 4048 * @return the limit (exclusive end) of the span 4049 * @deprecated This API is ICU internal only. 4050 * @hide deprecated on icu4j-org 4051 * @hide draft / provisional / internal are hidden on OHOS 4052 */ 4053 @Deprecated spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4054 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { 4055 if (outCount == null) { 4056 throw new IllegalArgumentException("outCount must not be null"); 4057 } 4058 int end = s.length(); 4059 if (start < 0) { 4060 start = 0; 4061 } else if (start >= end) { 4062 return end; 4063 } 4064 if (stringSpan != null) { 4065 // We might also have bmpSet != null, 4066 // but fully-contained strings are relevant for counting elements. 4067 return stringSpan.spanAndCount(s, start, spanCondition, outCount); 4068 } else if (bmpSet != null) { 4069 return bmpSet.span(s, start, spanCondition, outCount); 4070 } else if (hasStrings()) { 4071 int which = spanCondition == SpanCondition.NOT_CONTAINED ? 
UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4072 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4073 which |= UnicodeSetStringSpan.WITH_COUNT; 4074 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4075 return strSpan.spanAndCount(s, start, spanCondition, outCount); 4076 } 4077 4078 return spanCodePointsAndCount(s, start, spanCondition, outCount); 4079 } 4080 spanCodePointsAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)4081 private int spanCodePointsAndCount(CharSequence s, int start, 4082 SpanCondition spanCondition, OutputInt outCount) { 4083 // Pin to 0/1 values. 4084 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4085 4086 int c; 4087 int next = start; 4088 int length = s.length(); 4089 int count = 0; 4090 do { 4091 c = Character.codePointAt(s, next); 4092 if (spanContained != contains(c)) { 4093 break; 4094 } 4095 ++count; 4096 next += Character.charCount(c); 4097 } while (next < length); 4098 if (outCount != null) { outCount.value = count; } 4099 return next; 4100 } 4101 4102 /** 4103 * Span a string backwards (from the end) using this UnicodeSet. 4104 * <p>To replace, count elements, or delete spans, see {@link ohos.global.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4105 * @param s The string to be spanned 4106 * @param spanCondition The span condition 4107 * @return The string index which starts the span (i.e. inclusive). 4108 */ spanBack(CharSequence s, SpanCondition spanCondition)4109 public int spanBack(CharSequence s, SpanCondition spanCondition) { 4110 return spanBack(s, s.length(), spanCondition); 4111 } 4112 4113 /** 4114 * Span a string backwards (from the fromIndex) using this UnicodeSet. 4115 * If the fromIndex is less than 0, spanBack will return 0. 4116 * If fromIndex is greater than the string length, spanBack will start from the string length. 
4117 * <p>To replace, count elements, or delete spans, see {@link ohos.global.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4118 * @param s The string to be spanned 4119 * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards 4120 * @param spanCondition The span condition 4121 * @return The string index which starts the span (i.e. inclusive). 4122 */ spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition)4123 public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { 4124 if (fromIndex <= 0) { 4125 return 0; 4126 } 4127 if (fromIndex > s.length()) { 4128 fromIndex = s.length(); 4129 } 4130 if (bmpSet != null) { 4131 // Frozen set without strings, or no string is relevant for spanBack(). 4132 return bmpSet.spanBack(s, fromIndex, spanCondition); 4133 } 4134 if (stringSpan != null) { 4135 return stringSpan.spanBack(s, fromIndex, spanCondition); 4136 } else if (hasStrings()) { 4137 int which = (spanCondition == SpanCondition.NOT_CONTAINED) 4138 ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED 4139 : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; 4140 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4141 if (strSpan.needsStringSpanUTF16()) { 4142 return strSpan.spanBack(s, fromIndex, spanCondition); 4143 } 4144 } 4145 4146 // Pin to 0/1 values. 4147 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4148 4149 int c; 4150 int prev = fromIndex; 4151 do { 4152 c = Character.codePointBefore(s, prev); 4153 if (spanContained != contains(c)) { 4154 break; 4155 } 4156 prev -= Character.charCount(c); 4157 } while (prev > 0); 4158 return prev; 4159 } 4160 4161 /** 4162 * Clone a thawed version of this class, according to the Freezable interface. 
4163 * @return the clone, not frozen 4164 */ 4165 @Override cloneAsThawed()4166 public UnicodeSet cloneAsThawed() { 4167 UnicodeSet result = new UnicodeSet(this); 4168 assert !result.isFrozen(); 4169 return result; 4170 } 4171 4172 // internal function checkFrozen()4173 private void checkFrozen() { 4174 if (isFrozen()) { 4175 throw new UnsupportedOperationException("Attempt to modify frozen object"); 4176 } 4177 } 4178 4179 // ************************ 4180 // Additional methods for integration with Generics and Collections 4181 // ************************ 4182 4183 /** 4184 * A struct-like class used for iteration through ranges, for faster iteration than by String. 4185 * Read about the restrictions on usage in {@link UnicodeSet#ranges()}. 4186 */ 4187 public static class EntryRange { 4188 /** 4189 * The starting code point of the range. 4190 */ 4191 public int codepoint; 4192 /** 4193 * The ending code point of the range 4194 */ 4195 public int codepointEnd; 4196 EntryRange()4197 EntryRange() { 4198 } 4199 4200 /** 4201 * {@inheritDoc} 4202 */ 4203 @Override toString()4204 public String toString() { 4205 StringBuilder b = new StringBuilder(); 4206 return ( 4207 codepoint == codepointEnd ? _appendToPat(b, codepoint, false) 4208 : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false)) 4209 .toString(); 4210 } 4211 } 4212 4213 /** 4214 * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points. 4215 * The UnicodeSet must not be altered during the iteration. 4216 * The EntryRange instance is the same each time; the contents are just reset. 4217 * 4218 * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings. 4219 * 4220 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4221 * Do not alter the UnicodeSet while iterating. 
4222 * 4223 * <pre> 4224 * // Sample code 4225 * for (EntryRange range : us1.ranges()) { 4226 * // do something with code points between range.codepoint and range.codepointEnd; 4227 * } 4228 * for (String s : us1.strings()) { 4229 * // do something with each string; 4230 * } 4231 * </pre> 4232 */ ranges()4233 public Iterable<EntryRange> ranges() { 4234 return new EntryRangeIterable(); 4235 } 4236 4237 private class EntryRangeIterable implements Iterable<EntryRange> { 4238 @Override iterator()4239 public Iterator<EntryRange> iterator() { 4240 return new EntryRangeIterator(); 4241 } 4242 } 4243 4244 private class EntryRangeIterator implements Iterator<EntryRange> { 4245 int pos; 4246 EntryRange result = new EntryRange(); 4247 4248 @Override hasNext()4249 public boolean hasNext() { 4250 return pos < len-1; 4251 } 4252 @Override next()4253 public EntryRange next() { 4254 if (pos < len-1) { 4255 result.codepoint = list[pos++]; 4256 result.codepointEnd = list[pos++]-1; 4257 } else { 4258 throw new NoSuchElementException(); 4259 } 4260 return result; 4261 } 4262 @Override remove()4263 public void remove() { 4264 throw new UnsupportedOperationException(); 4265 } 4266 } 4267 4268 4269 /** 4270 * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}. 4271 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4272 * Do not alter the UnicodeSet while iterating. 4273 * @see java.util.Set#iterator() 4274 */ 4275 @Override iterator()4276 public Iterator<String> iterator() { 4277 return new UnicodeSetIterator2(this); 4278 } 4279 4280 // Cover for string iteration. 
4281 private static class UnicodeSetIterator2 implements Iterator<String> { 4282 // Invariants: 4283 // sourceList != null then sourceList[item] is a valid character 4284 // sourceList == null then delegates to stringIterator 4285 private int[] sourceList; 4286 private int len; 4287 private int item; 4288 private int current; 4289 private int limit; 4290 private SortedSet<String> sourceStrings; 4291 private Iterator<String> stringIterator; 4292 private char[] buffer; 4293 UnicodeSetIterator2(UnicodeSet source)4294 UnicodeSetIterator2(UnicodeSet source) { 4295 // set according to invariants 4296 len = source.len - 1; 4297 if (len > 0) { 4298 sourceStrings = source.strings; 4299 sourceList = source.list; 4300 current = sourceList[item++]; 4301 limit = sourceList[item++]; 4302 } else { 4303 stringIterator = source.strings.iterator(); 4304 sourceList = null; 4305 } 4306 } 4307 4308 /* (non-Javadoc) 4309 * @see java.util.Iterator#hasNext() 4310 */ 4311 @Override hasNext()4312 public boolean hasNext() { 4313 return sourceList != null || stringIterator.hasNext(); 4314 } 4315 4316 /* (non-Javadoc) 4317 * @see java.util.Iterator#next() 4318 */ 4319 @Override next()4320 public String next() { 4321 if (sourceList == null) { 4322 return stringIterator.next(); 4323 } 4324 int codepoint = current++; 4325 // we have the codepoint we need, but we may need to adjust the state 4326 if (current >= limit) { 4327 if (item >= len) { 4328 stringIterator = sourceStrings.iterator(); 4329 sourceList = null; 4330 } else { 4331 current = sourceList[item++]; 4332 limit = sourceList[item++]; 4333 } 4334 } 4335 // Now return. Single code point is easy 4336 if (codepoint <= 0xFFFF) { 4337 return String.valueOf((char)codepoint); 4338 } 4339 // But Java lacks a valueOfCodePoint, so we handle ourselves for speed 4340 // allocate a buffer the first time, to make conversion faster. 
4341 if (buffer == null) { 4342 buffer = new char[2]; 4343 } 4344 // compute ourselves, to save tests and calls 4345 int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT; 4346 buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE); 4347 buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE); 4348 return String.valueOf(buffer); 4349 } 4350 4351 /* (non-Javadoc) 4352 * @see java.util.Iterator#remove() 4353 */ 4354 @Override remove()4355 public void remove() { 4356 throw new UnsupportedOperationException(); 4357 } 4358 } 4359 4360 /** 4361 * @see #containsAll(ohos.global.icu.text.UnicodeSet) 4362 */ containsAll(Iterable<T> collection)4363 public <T extends CharSequence> boolean containsAll(Iterable<T> collection) { 4364 for (T o : collection) { 4365 if (!contains(o)) { 4366 return false; 4367 } 4368 } 4369 return true; 4370 } 4371 4372 /** 4373 * @see #containsNone(ohos.global.icu.text.UnicodeSet) 4374 */ containsNone(Iterable<T> collection)4375 public <T extends CharSequence> boolean containsNone(Iterable<T> collection) { 4376 for (T o : collection) { 4377 if (contains(o)) { 4378 return false; 4379 } 4380 } 4381 return true; 4382 } 4383 4384 /** 4385 * @see #containsAll(ohos.global.icu.text.UnicodeSet) 4386 */ containsSome(Iterable<T> collection)4387 public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) { 4388 return !containsNone(collection); 4389 } 4390 4391 /** 4392 * @see #addAll(ohos.global.icu.text.UnicodeSet) 4393 */ 4394 @SuppressWarnings("unchecked") // See ticket #11395, this is safe. addAll(T... collection)4395 public <T extends CharSequence> UnicodeSet addAll(T... 
collection) { 4396 checkFrozen(); 4397 for (T str : collection) { 4398 add(str); 4399 } 4400 return this; 4401 } 4402 4403 4404 /** 4405 * @see #removeAll(ohos.global.icu.text.UnicodeSet) 4406 */ removeAll(Iterable<T> collection)4407 public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) { 4408 checkFrozen(); 4409 for (T o : collection) { 4410 remove(o); 4411 } 4412 return this; 4413 } 4414 4415 /** 4416 * @see #retainAll(ohos.global.icu.text.UnicodeSet) 4417 */ retainAll(Iterable<T> collection)4418 public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) { 4419 checkFrozen(); 4420 // TODO optimize 4421 UnicodeSet toRetain = new UnicodeSet(); 4422 toRetain.addAll(collection); 4423 retainAll(toRetain); 4424 return this; 4425 } 4426 4427 /** 4428 * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}. 4429 */ 4430 public enum ComparisonStyle { 4431 /** 4432 */ 4433 SHORTER_FIRST, 4434 /** 4435 */ 4436 LEXICOGRAPHIC, 4437 /** 4438 */ 4439 LONGER_FIRST 4440 } 4441 4442 /** 4443 * Compares UnicodeSets, where shorter come first, and otherwise lexigraphically 4444 * (according to the comparison of the first characters that differ). 4445 * @see java.lang.Comparable#compareTo(java.lang.Object) 4446 */ 4447 @Override compareTo(UnicodeSet o)4448 public int compareTo(UnicodeSet o) { 4449 return compareTo(o, ComparisonStyle.SHORTER_FIRST); 4450 } 4451 /** 4452 * Compares UnicodeSets, in three different ways. 4453 * @see java.lang.Comparable#compareTo(java.lang.Object) 4454 */ compareTo(UnicodeSet o, ComparisonStyle style)4455 public int compareTo(UnicodeSet o, ComparisonStyle style) { 4456 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4457 int diff = size() - o.size(); 4458 if (diff != 0) { 4459 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? 
-1 : 1; 4460 } 4461 } 4462 int result; 4463 for (int i = 0; ; ++i) { 4464 if (0 != (result = list[i] - o.list[i])) { 4465 // if either list ran out, compare to the last string 4466 if (list[i] == HIGH) { 4467 if (!hasStrings()) return 1; 4468 String item = strings.first(); 4469 return compare(item, o.list[i]); 4470 } 4471 if (o.list[i] == HIGH) { 4472 if (!o.hasStrings()) return -1; 4473 String item = o.strings.first(); 4474 int compareResult = compare(item, list[i]); 4475 return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order. 4476 } 4477 // otherwise return the result if even index, or the reversal if not 4478 return (i & 1) == 0 ? result : -result; 4479 } 4480 if (list[i] == HIGH) { 4481 break; 4482 } 4483 } 4484 return compare(strings, o.strings); 4485 } 4486 4487 /** 4488 */ compareTo(Iterable<String> other)4489 public int compareTo(Iterable<String> other) { 4490 return compare(this, other); 4491 } 4492 4493 /** 4494 * Utility to compare a string to a code point. 4495 * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString()) 4496 * and comparing, but much faster (no object creation). 4497 * Actually, there is one difference; a null compares as less. 4498 * Note that this (=String) order is UTF-16 order -- *not* code point order. 4499 * @hide unsupported on OHOS 4500 */ 4501 compare(CharSequence string, int codePoint)4502 public static int compare(CharSequence string, int codePoint) { 4503 return CharSequences.compare(string, codePoint); 4504 } 4505 4506 /** 4507 * Utility to compare a string to a code point. 4508 * Same results as turning the code point into a string and comparing, but much faster (no object creation). 4509 * Actually, there is one difference; a null compares as less. 4510 * Note that this (=String) order is UTF-16 order -- *not* code point order. 
4511 * @hide unsupported on OHOS 4512 */ compare(int codePoint, CharSequence string)4513 public static int compare(int codePoint, CharSequence string) { 4514 return -CharSequences.compare(string, codePoint); 4515 } 4516 4517 4518 /** 4519 * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered, 4520 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4521 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4522 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4523 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4524 * @hide unsupported on OHOS 4525 */ compare(Iterable<T> collection1, Iterable<T> collection2)4526 public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) { 4527 return compare(collection1.iterator(), collection2.iterator()); 4528 } 4529 4530 /** 4531 * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered, 4532 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4533 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4534 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4535 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4536 * @deprecated This API is ICU internal only. 
4537 * @hide deprecated on icu4j-org 4538 * @hide draft / provisional / internal are hidden on OHOS 4539 */ 4540 @Deprecated compare(Iterator<T> first, Iterator<T> other)4541 public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) { 4542 while (true) { 4543 if (!first.hasNext()) { 4544 return other.hasNext() ? -1 : 0; 4545 } else if (!other.hasNext()) { 4546 return 1; 4547 } 4548 T item1 = first.next(); 4549 T item2 = other.next(); 4550 int result = item1.compareTo(item2); 4551 if (result != 0) { 4552 return result; 4553 } 4554 } 4555 } 4556 4557 4558 /** 4559 * Utility to compare two collections, optionally by size, and then lexicographically. 4560 * @hide unsupported on OHOS 4561 */ compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style)4562 public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) { 4563 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4564 int diff = collection1.size() - collection2.size(); 4565 if (diff != 0) { 4566 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1; 4567 } 4568 } 4569 return compare(collection1, collection2); 4570 } 4571 4572 /** 4573 * Utility for adding the contents of an iterable to a collection. 4574 * @hide unsupported on OHOS 4575 */ addAllTo(Iterable<T> source, U target)4576 public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) { 4577 for (T item : source) { 4578 target.add(item); 4579 } 4580 return target; 4581 } 4582 4583 /** 4584 * Utility for adding the contents of an iterable to a collection. 4585 * @hide unsupported on OHOS 4586 */ addAllTo(Iterable<T> source, T[] target)4587 public static <T> T[] addAllTo(Iterable<T> source, T[] target) { 4588 int i = 0; 4589 for (T item : source) { 4590 target[i++] = item; 4591 } 4592 return target; 4593 } 4594 4595 /** 4596 * For iterating through the strings in the set. 
Example: 4597 * <pre> 4598 * for (String key : myUnicodeSet.strings()) { 4599 * doSomethingWith(key); 4600 * } 4601 * </pre> 4602 */ strings()4603 public Collection<String> strings() { 4604 if (hasStrings()) { 4605 return Collections.unmodifiableSortedSet(strings); 4606 } else { 4607 return EMPTY_STRINGS; 4608 } 4609 } 4610 4611 /** 4612 * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE. 4613 * @deprecated This API is ICU internal only. 4614 * @hide deprecated on icu4j-org 4615 * @hide draft / provisional / internal are hidden on OHOS 4616 */ 4617 @Deprecated getSingleCodePoint(CharSequence s)4618 public static int getSingleCodePoint(CharSequence s) { 4619 return CharSequences.getSingleCodePoint(s); 4620 } 4621 4622 /** 4623 * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set. 4624 * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E 4625 * if the dontCare set includes unassigned characters (for a particular version of Unicode). 4626 * @param dontCare Set with the don't-care characters for spanning 4627 * @return the input set, modified 4628 * @deprecated This API is ICU internal only. 
4629 * @hide deprecated on icu4j-org 4630 * @hide draft / provisional / internal are hidden on OHOS 4631 */ 4632 @Deprecated addBridges(UnicodeSet dontCare)4633 public UnicodeSet addBridges(UnicodeSet dontCare) { 4634 UnicodeSet notInInput = new UnicodeSet(this).complement(); 4635 for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) { 4636 if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) { 4637 add(it.codepoint,it.codepointEnd); 4638 } 4639 } 4640 return this; 4641 } 4642 4643 /** 4644 * Find the first index at or after fromIndex where the UnicodeSet matches at that index. 4645 * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match. 4646 * If there is no match, length is returned. 4647 * @deprecated This API is ICU internal only. Use span instead. 4648 * @hide deprecated on icu4j-org 4649 * @hide draft / provisional / internal are hidden on OHOS 4650 */ 4651 @Deprecated findIn(CharSequence value, int fromIndex, boolean findNot)4652 public int findIn(CharSequence value, int fromIndex, boolean findNot) { 4653 //TODO add strings, optimize, using ICU4C algorithms 4654 int cp; 4655 for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) { 4656 cp = UTF16.charAt(value, fromIndex); 4657 if (contains(cp) != findNot) { 4658 break; 4659 } 4660 } 4661 return fromIndex; 4662 } 4663 4664 /** 4665 * Find the last index before fromIndex where the UnicodeSet matches at that index. 4666 * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match. 4667 * If there is no match, -1 is returned. 4668 * BEFORE index is not in the UnicodeSet. 4669 * @deprecated This API is ICU internal only. Use spanBack instead. 
4670 * @hide deprecated on icu4j-org 4671 * @hide draft / provisional / internal are hidden on OHOS 4672 */ 4673 @Deprecated findLastIn(CharSequence value, int fromIndex, boolean findNot)4674 public int findLastIn(CharSequence value, int fromIndex, boolean findNot) { 4675 //TODO add strings, optimize, using ICU4C algorithms 4676 int cp; 4677 fromIndex -= 1; 4678 for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) { 4679 cp = UTF16.charAt(value, fromIndex); 4680 if (contains(cp) != findNot) { 4681 break; 4682 } 4683 } 4684 return fromIndex < 0 ? -1 : fromIndex; 4685 } 4686 4687 /** 4688 * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match. 4689 * @param source The source of the CharSequence to strip from. 4690 * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object. 4691 * @return The string after it has been stripped. 4692 * @deprecated This API is ICU internal only. Use replaceFrom. 4693 * @hide deprecated on icu4j-org 4694 * @hide draft / provisional / internal are hidden on OHOS 4695 */ 4696 @Deprecated stripFrom(CharSequence source, boolean matches)4697 public String stripFrom(CharSequence source, boolean matches) { 4698 StringBuilder result = new StringBuilder(); 4699 for (int pos = 0; pos < source.length();) { 4700 int inside = findIn(source, pos, !matches); 4701 result.append(source.subSequence(pos, inside)); 4702 pos = findIn(source, inside, matches); // get next start 4703 } 4704 return result.toString(); 4705 } 4706 4707 /** 4708 * Argument values for whether span() and similar functions continue while the current character is contained vs. 4709 * not contained in the set. 4710 * <p> 4711 * The functionality is straightforward for sets with only single code points, without strings (which is the common 4712 * case): 4713 * <ul> 4714 * <li>CONTAINED and SIMPLE work the same. 
* <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
     * <li>span() and spanBack() partition any string the
     * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
     * <li>Using a
     * complemented (inverted) set and the opposite span conditions yields the same results.
     * </ul>
     * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
     * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
     * strings:
     * <ul>
     * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
     * Therefore, complementing both the set and the span conditions may yield different results.
     * <li>When starting spans
     * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
     * because a set string may start before the later position.
     * <li>span(SIMPLE) may be shorter than
     * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
     * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
     * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
     * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
     * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
     * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
     * </ul>
     * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
     * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
     * be used.
     * <p>
     * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
     * boundaries, never in the middle of a surrogate pair.
     */
    public enum SpanCondition {
        /**
         * Continues a span() while there is no set element at the current position.
         * Increments by one code point at a time.
         * Stops before the first set element (character or string).
         * (For code points only, this is like while contains(current)==false).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of
         * characters that are not in the set, and none of its strings overlap with the span.
         */
        NOT_CONTAINED,

        /**
         * Spans the longest substring that is a concatenation of set elements (characters or strings).
         * (For characters only, this is like while contains(current)==true).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of set
         * elements (characters or strings) that are in the set.
         * <p>
         * If a set contains strings, then the span will be the longest substring for which there
         * exists at least one non-overlapping concatenation of set elements (characters or strings).
         * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
         * (Java/ICU/Perl regex stops at the first match of an OR.)
         */
        CONTAINED,

        /**
         * Continues a span() while there is a set element at the current position.
         * Increments by the longest matching element at each position.
         * (For characters only, this is like while contains(current)==true).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of set
         * elements (characters or strings) that are in the set.
         * <p>
         * If a set only contains single characters, then this is the same as CONTAINED.
         * <p>
         * If a set contains strings, then the span will be the longest substring with a match at each position with the
         * longest single set element (character or string).
         * <p>
         * Use this span condition together with other longest-match algorithms, such as ICU converters
         * (ucnv_getUnicodeSet()).
         */
        SIMPLE,

        /**
         * One more than the last span condition.
         * Declared after NOT_CONTAINED, CONTAINED and SIMPLE, so it is the final constant of this enum.
         */
        CONDITION_COUNT
    }

    /**
     * Get the default symbol table. Null means ordinary processing. For internal use only.
     * Returns the current value of the static {@code XSYMBOL_TABLE} field.
     * @return the symbol table
     * @deprecated This API is ICU internal only.
     * @hide deprecated on icu4j-org
     * @hide draft / provisional / internal are hidden on OHOS
     */
    @Deprecated
    public static XSymbolTable getDefaultXSymbolTable() {
        return XSYMBOL_TABLE;
    }

    /**
     * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing
     * of UnicodeSets.
     * <p>
     * WARNING: If this function is used with a UnicodeProperty, and the
     * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
     * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
     * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
     *
     * @param xSymbolTable the new default symbol table.
     * @deprecated This API is ICU internal only.
* @hide deprecated on icu4j-org
     * @hide draft / provisional / internal are hidden on OHOS
     */
    @Deprecated
    public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
        // If the properties override inclusions, these have to be regenerated.
        // TODO: Check if the Unicode Tools or Unicode Utilities really need this.
        CharacterPropertiesImpl.clear();
        // Install the new default table only after the clear() call above; the argument is
        // stored as given, with no null check.
        XSYMBOL_TABLE = xSymbolTable;
    }
}
//eof