1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2015, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.dev.test.lang; 11 12 import java.text.NumberFormat; 13 import java.text.ParsePosition; 14 import java.util.ArrayList; 15 import java.util.Arrays; 16 import java.util.Collection; 17 import java.util.Collections; 18 import java.util.Comparator; 19 import java.util.HashMap; 20 import java.util.HashSet; 21 import java.util.Iterator; 22 import java.util.LinkedHashSet; 23 import java.util.List; 24 import java.util.Set; 25 import java.util.SortedSet; 26 import java.util.TreeSet; 27 28 import org.junit.Test; 29 import org.junit.runner.RunWith; 30 import org.junit.runners.JUnit4; 31 32 import ohos.global.icu.dev.test.TestFmwk; 33 import ohos.global.icu.dev.util.CollectionUtilities; 34 import ohos.global.icu.impl.SortedSetRelation; 35 import ohos.global.icu.impl.Utility; 36 import ohos.global.icu.lang.UCharacter; 37 import ohos.global.icu.lang.UCharacterEnums.ECharacterCategory; 38 import ohos.global.icu.lang.UProperty; 39 import ohos.global.icu.lang.UScript; 40 import ohos.global.icu.text.SymbolTable; 41 import ohos.global.icu.text.UTF16; 42 import ohos.global.icu.text.UnicodeMatcher; 43 import ohos.global.icu.text.UnicodeSet; 44 import ohos.global.icu.text.UnicodeSet.ComparisonStyle; 45 import ohos.global.icu.text.UnicodeSet.EntryRange; 46 import ohos.global.icu.text.UnicodeSet.SpanCondition; 47 import ohos.global.icu.text.UnicodeSetIterator; 48 import ohos.global.icu.text.UnicodeSetSpanner; 49 import ohos.global.icu.text.UnicodeSetSpanner.CountMethod; 50 import ohos.global.icu.text.UnicodeSetSpanner.TrimOption; 51 import ohos.global.icu.util.OutputInt; 52 53 54 /** 55 * @test 56 * @summary General test of UnicodeSet 57 */ 58 59 @RunWith(JUnit4.class) 60 public class UnicodeSetTest extends TestFmwk { 61 62 static final String NOT = "%%%%"; 63 isCccValue(int ccc)64 private static final boolean isCccValue(int ccc) { 65 switch (ccc) { 66 case 0: 67 case 1: 68 case 7: 69 case 8: 70 case 9: 71 case 200: 72 case 202: 73 case 216: 74 case 218: 75 case 220: 76 case 222: 77 case 224: 78 case 226: 79 case 228: 80 case 230: 81 case 232: 82 case 233: 83 case 234: 84 case 240: 85 return true; 86 default: 87 return false; 88 } 89 } 90 91 @Test TestPropertyAccess()92 public void TestPropertyAccess() { 93 int count = 0; 94 // test to see that all of the names work 95 for (int propNum = UProperty.BINARY_START; propNum < UProperty.INT_LIMIT; ++propNum) { 96 count++; 97 //Skipping tests in the non-exhaustive mode to shorten the test time ticket#6475 98 if(TestFmwk.getExhaustiveness()<=5 && count%5!=0){ 99 continue; 100 } 101 if (propNum >= UProperty.BINARY_LIMIT && propNum < UProperty.INT_START) { // skip the gap 102 propNum = UProperty.INT_START; 103 } 104 for (int nameChoice = UProperty.NameChoice.SHORT; nameChoice <= UProperty.NameChoice.LONG; ++nameChoice) { 105 String propName; 106 try { 107 propName = UCharacter.getPropertyName(propNum, nameChoice); 108 if (propName == null) { 109 if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names 110 throw new NullPointerException(); 111 } 112 } catch (RuntimeException e1) { 113 errln("Can't get property name for: " 114 + "Property (" + propNum + ")" 115 + ", NameChoice: " + nameChoice + ", " 116 + e1.getClass().getName()); 117 continue; 118 } 119 logln("Property (" + propNum + "): " + propName); 120 for (int valueNum = UCharacter.getIntPropertyMinValue(propNum); valueNum <= UCharacter.getIntPropertyMaxValue(propNum); ++valueNum) { 121 String valueName; 122 try { 123 valueName = UCharacter.getPropertyValueName(propNum, valueNum, nameChoice); 124 if (valueName == null) { 125 if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names 126 if ((propNum == UProperty.CANONICAL_COMBINING_CLASS || 127 propNum == UProperty.LEAD_CANONICAL_COMBINING_CLASS || 128 propNum == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) && 129 !isCccValue(valueNum)) { 130 // Only a few of the canonical combining classes have names. 131 // Otherwise they are just integer values. 132 continue; 133 } else { 134 throw new NullPointerException(); 135 } 136 } 137 } catch (RuntimeException e1) { 138 errln("Can't get property value name for: " 139 + "Property (" + propNum + "): " + propName + ", " 140 + "Value (" + valueNum + ") " 141 + ", NameChoice: " + nameChoice + ", " 142 + e1.getClass().getName()); 143 continue; 144 } 145 logln("Value (" + valueNum + "): " + valueName); 146 UnicodeSet testSet; 147 try { 148 testSet = new UnicodeSet("[:" + propName + "=" + valueName + ":]"); 149 } catch (RuntimeException e) { 150 errln("Can't create UnicodeSet for: " 151 + "Property (" + propNum + "): " + propName + ", " 152 + "Value (" + valueNum + "): " + valueName + ", " 153 + e.getClass().getName()); 154 continue; 155 } 156 UnicodeSet collectedErrors = new UnicodeSet(); 157 for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) { 158 int value = UCharacter.getIntPropertyValue(it.codepoint, propNum); 159 if (value != valueNum) { 160 collectedErrors.add(it.codepoint); 161 } 162 } 163 if (collectedErrors.size() != 0) { 164 errln("Property Value Differs: " 165 + "Property (" + propNum + "): " + propName + ", " 166 + "Value (" + valueNum + "): " + valueName + ", " 167 + "Differing values: " + collectedErrors.toPattern(true)); 168 } 169 } 170 } 171 } 172 } 173 174 175 /** 176 * Test toPattern(). 177 */ 178 @Test TestToPattern()179 public void TestToPattern() throws Exception { 180 // Test that toPattern() round trips with syntax characters 181 // and whitespace. 182 for (int i = 0; i < OTHER_TOPATTERN_TESTS.length; ++i) { 183 checkPat(OTHER_TOPATTERN_TESTS[i], new UnicodeSet(OTHER_TOPATTERN_TESTS[i])); 184 } 185 for (int i = 0; i <= 0x10FFFF; ++i) { 186 if ((i <= 0xFF && !UCharacter.isLetter(i)) || UCharacter.isWhitespace(i)) { 187 // check various combinations to make sure they all work. 188 if (i != 0 && !toPatternAux(i, i)) continue; 189 if (!toPatternAux(0, i)) continue; 190 if (!toPatternAux(i, 0xFFFF)) continue; 191 } 192 } 193 194 // Test pattern behavior of multicharacter strings. 195 UnicodeSet s = new UnicodeSet("[a-z {aa} {ab}]"); 196 expectToPattern(s, "[a-z{aa}{ab}]", 197 new String[] {"aa", "ab", NOT, "ac"}); 198 s.add("ac"); 199 expectToPattern(s, "[a-z{aa}{ab}{ac}]", 200 new String[] {"aa", "ab", "ac", NOT, "xy"}); 201 202 s.applyPattern("[a-z {\\{l} {r\\}}]"); 203 expectToPattern(s, "[a-z{r\\}}{\\{l}]", 204 new String[] {"{l", "r}", NOT, "xy"}); 205 s.add("[]"); 206 expectToPattern(s, "[a-z{\\[\\]}{r\\}}{\\{l}]", 207 new String[] {"{l", "r}", "[]", NOT, "xy"}); 208 209 s.applyPattern("[a-z {\u4E01\u4E02}{\\n\\r}]"); 210 expectToPattern(s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", 211 new String[] {"\u4E01\u4E02", "\n\r"}); 212 213 s.clear(); 214 s.add("abc"); 215 s.add("abc"); 216 expectToPattern(s, "[{abc}]", 217 new String[] {"abc", NOT, "ab"}); 218 219 // JB#3400: For 2 character ranges prefer [ab] to [a-b] 220 s.clear(); 221 s.add('a', 'b'); 222 expectToPattern(s, "[ab]", null); 223 224 // Cover applyPattern, applyPropertyAlias 225 s.clear(); 226 s.applyPattern("[ab ]", true); 227 expectToPattern(s, "[ab]", new String[] {"a", NOT, "ab", " "}); 228 s.clear(); 229 s.applyPattern("[ab ]", false); 230 expectToPattern(s, "[\\ ab]", new String[] {"a", "\u0020", NOT, "ab"}); 231 232 s.clear(); 233 s.applyPropertyAlias("nv", "0.5"); 234 s.retainAll(new UnicodeSet("[:age=6.0:]")); // stabilize this test 235 expectToPattern(s, "[\\u00BD\\u0B73\\u0D74\\u0F2A\\u2CFD\\uA831\\U00010141\\U00010175\\U00010176\\U00010E7B]", null); 236 // Unicode 5.1 adds Malayalam 1/2 (\u0D74) 237 // Unicode 5.2 adds U+A831 NORTH INDIC FRACTION ONE HALF and U+10E7B RUMI FRACTION ONE HALF 238 // Unicode 6.0 adds U+0B73 ORIYA FRACTION ONE HALF 239 240 s.clear(); 241 s.applyPropertyAlias("gc", "Lu"); 242 // TODO expectToPattern(s, what?) 243 244 // RemoveAllStrings() 245 s.clear(); 246 s.applyPattern("[a-z{abc}{def}]"); 247 expectToPattern(s, "[a-z{abc}{def}]", null); 248 s.removeAllStrings(); 249 expectToPattern(s, "[a-z]", null); 250 } 251 252 static String[] OTHER_TOPATTERN_TESTS = { 253 "[[:latin:]&[:greek:]]", 254 "[[:latin:]-[:greek:]]", 255 "[:nonspacing mark:]" 256 }; 257 258 toPatternAux(int start, int end)259 public boolean toPatternAux(int start, int end) { 260 // use Integer.toString because Utility.hex doesn't handle ints 261 String source = "0x" + Integer.toString(start,16).toUpperCase(); 262 if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); 263 UnicodeSet testSet = new UnicodeSet(); 264 testSet.add(start, end); 265 return checkPat(source, testSet); 266 } 267 checkPat(String source, UnicodeSet testSet)268 boolean checkPat (String source, UnicodeSet testSet) { 269 String pat = ""; 270 try { 271 // What we want to make sure of is that a pattern generated 272 // by toPattern(), with or without escaped unprintables, can 273 // be passed back into the UnicodeSet constructor. 274 String pat0 = testSet.toPattern(true); 275 if (!checkPat(source + " (escaped)", testSet, pat0)) return false; 276 277 //String pat1 = unescapeLeniently(pat0); 278 //if (!checkPat(source + " (in code)", testSet, pat1)) return false; 279 280 String pat2 = testSet.toPattern(false); 281 if (!checkPat(source, testSet, pat2)) return false; 282 283 //String pat3 = unescapeLeniently(pat2); 284 //if (!checkPat(source + " (in code)", testSet, pat3)) return false; 285 286 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); 287 logln(source + " => " + pat0 + ", " + pat2); 288 } catch (Exception e) { 289 errln("EXCEPTION in toPattern: " + source + " => " + pat); 290 return false; 291 } 292 return true; 293 } 294 checkPat(String source, UnicodeSet testSet, String pat)295 boolean checkPat (String source, UnicodeSet testSet, String pat) { 296 UnicodeSet testSet2 = new UnicodeSet(pat); 297 if (!testSet2.equals(testSet)) { 298 errln("Fail toPattern: " + source + "; " + pat + " => " + 299 testSet2.toPattern(false) + ", expected " + 300 testSet.toPattern(false)); 301 return false; 302 } 303 return true; 304 } 305 306 // NOTE: copied the following from Utility. There ought to be a version in there with a flag 307 // that does the Java stuff 308 unescapeAt(String s, int[] offset16)309 public static int unescapeAt(String s, int[] offset16) { 310 int c; 311 int result = 0; 312 int n = 0; 313 int minDig = 0; 314 int maxDig = 0; 315 int bitsPerDigit = 4; 316 int dig; 317 int i; 318 319 /* Check that offset is in range */ 320 int offset = offset16[0]; 321 int length = s.length(); 322 if (offset < 0 || offset >= length) { 323 return -1; 324 } 325 326 /* Fetch first UChar after '\\' */ 327 c = UTF16.charAt(s, offset); 328 offset += UTF16.getCharCount(c); 329 330 /* Convert hexadecimal and octal escapes */ 331 switch (c) { 332 case 'u': 333 minDig = maxDig = 4; 334 break; 335 /* 336 case 'U': 337 minDig = maxDig = 8; 338 break; 339 case 'x': 340 minDig = 1; 341 maxDig = 2; 342 break; 343 */ 344 default: 345 dig = UCharacter.digit(c, 8); 346 if (dig >= 0) { 347 minDig = 1; 348 maxDig = 3; 349 n = 1; /* Already have first octal digit */ 350 bitsPerDigit = 3; 351 result = dig; 352 } 353 break; 354 } 355 if (minDig != 0) { 356 while (offset < length && n < maxDig) { 357 // TEMPORARY 358 // TODO: Restore the char32-based code when UCharacter.digit 359 // is working (Bug 66). 360 361 //c = UTF16.charAt(s, offset); 362 //dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); 363 c = s.charAt(offset); 364 dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16); 365 if (dig < 0) { 366 break; 367 } 368 result = (result << bitsPerDigit) | dig; 369 //offset += UTF16.getCharCount(c); 370 ++offset; 371 ++n; 372 } 373 if (n < minDig) { 374 return -1; 375 } 376 offset16[0] = offset; 377 return result; 378 } 379 380 /* Convert C-style escapes in table */ 381 for (i=0; i<UNESCAPE_MAP.length; i+=2) { 382 if (c == UNESCAPE_MAP[i]) { 383 offset16[0] = offset; 384 return UNESCAPE_MAP[i+1]; 385 } else if (c < UNESCAPE_MAP[i]) { 386 break; 387 } 388 } 389 390 /* If no special forms are recognized, then consider 391 * the backslash to generically escape the next character. */ 392 offset16[0] = offset; 393 return c; 394 } 395 396 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 397 static private final char[] UNESCAPE_MAP = { 398 /*" 0x22, 0x22 */ 399 /*' 0x27, 0x27 */ 400 /*? 0x3F, 0x3F */ 401 /*\ 0x5C, 0x5C */ 402 /*a*/ 0x61, 0x07, 403 /*b*/ 0x62, 0x08, 404 /*f*/ 0x66, 0x0c, 405 /*n*/ 0x6E, 0x0a, 406 /*r*/ 0x72, 0x0d, 407 /*t*/ 0x74, 0x09, 408 /*v*/ 0x76, 0x0b 409 }; 410 411 /** 412 * Convert all escapes in a given string using unescapeAt(). 413 * Leave invalid escape sequences unchanged. 414 */ unescapeLeniently(String s)415 public static String unescapeLeniently(String s) { 416 StringBuffer buf = new StringBuffer(); 417 int[] pos = new int[1]; 418 for (int i=0; i<s.length(); ) { 419 char c = s.charAt(i++); 420 if (c == '\\') { 421 pos[0] = i; 422 int e = unescapeAt(s, pos); 423 if (e < 0) { 424 buf.append(c); 425 } else { 426 UTF16.append(buf, e); 427 i = pos[0]; 428 } 429 } else { 430 buf.append(c); 431 } 432 } 433 return buf.toString(); 434 } 435 436 @Test TestPatterns()437 public void TestPatterns() { 438 UnicodeSet set = new UnicodeSet(); 439 expectPattern(set, "[[a-m]&[d-z]&[k-y]]", "km"); 440 expectPattern(set, "[[a-z]-[m-y]-[d-r]]", "aczz"); 441 expectPattern(set, "[a\\-z]", "--aazz"); 442 expectPattern(set, "[-az]", "--aazz"); 443 expectPattern(set, "[az-]", "--aazz"); 444 expectPattern(set, "[[[a-z]-[aeiou]i]]", "bdfnptvz"); 445 446 // Throw in a test of complement 447 set.complement(); 448 String exp = '\u0000' + "aeeoouu" + (char)('z'+1) + '\uFFFF'; 449 expectPairs(set, exp); 450 } 451 452 @Test TestCategories()453 public void TestCategories() { 454 int failures = 0; 455 UnicodeSet set = new UnicodeSet("[:Lu:]"); 456 expectContainment(set, "ABC", "abc"); 457 458 // Make sure generation of L doesn't pollute cached Lu set 459 // First generate L, then Lu 460 // not used int TOP = 0x200; // Don't need to go over the whole range: 461 set = new UnicodeSet("[:L:]"); 462 for (int i=0; i<0x200; ++i) { 463 boolean l = UCharacter.isLetter(i); 464 if (l != set.contains((char)i)) { 465 errln("FAIL: L contains " + (char)i + " = " + 466 set.contains((char)i)); 467 if (++failures == 10) break; 468 } 469 } 470 471 set = new UnicodeSet("[:Lu:]"); 472 for (int i=0; i<0x200; ++i) { 473 boolean lu = (UCharacter.getType(i) == ECharacterCategory.UPPERCASE_LETTER); 474 if (lu != set.contains((char)i)) { 475 errln("FAIL: Lu contains " + (char)i + " = " + 476 set.contains((char)i)); 477 if (++failures == 20) break; 478 } 479 } 480 } 481 482 @Test TestAddRemove()483 public void TestAddRemove() { 484 UnicodeSet set = new UnicodeSet(); 485 set.add('a', 'z'); 486 expectPairs(set, "az"); 487 set.remove('m', 'p'); 488 expectPairs(set, "alqz"); 489 set.remove('e', 'g'); 490 expectPairs(set, "adhlqz"); 491 set.remove('d', 'i'); 492 expectPairs(set, "acjlqz"); 493 set.remove('c', 'r'); 494 expectPairs(set, "absz"); 495 set.add('f', 'q'); 496 expectPairs(set, "abfqsz"); 497 set.remove('a', 'g'); 498 expectPairs(set, "hqsz"); 499 set.remove('a', 'z'); 500 expectPairs(set, ""); 501 502 // Try removing an entire set from another set 503 expectPattern(set, "[c-x]", "cx"); 504 UnicodeSet set2 = new UnicodeSet(); 505 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); 506 set.removeAll(set2); 507 expectPairs(set, "deluxx"); 508 509 // Try adding an entire set to another set 510 expectPattern(set, "[jackiemclean]", "aacceein"); 511 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); 512 set.addAll(set2); 513 expectPairs(set, "aacehort"); 514 515 // Test commutativity 516 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); 517 expectPattern(set2, "[jackiemclean]", "aacceein"); 518 set.addAll(set2); 519 expectPairs(set, "aacehort"); 520 } 521 522 /** 523 * Make sure minimal representation is maintained. 524 */ 525 @Test TestMinimalRep()526 public void TestMinimalRep() { 527 // This is pretty thoroughly tested by checkCanonicalRep() 528 // run against the exhaustive operation results. Use the code 529 // here for debugging specific spot problems. 530 531 // 1 overlap against 2 532 UnicodeSet set = new UnicodeSet("[h-km-q]"); 533 UnicodeSet set2 = new UnicodeSet("[i-o]"); 534 set.addAll(set2); 535 expectPairs(set, "hq"); 536 // right 537 set.applyPattern("[a-m]"); 538 set2.applyPattern("[e-o]"); 539 set.addAll(set2); 540 expectPairs(set, "ao"); 541 // left 542 set.applyPattern("[e-o]"); 543 set2.applyPattern("[a-m]"); 544 set.addAll(set2); 545 expectPairs(set, "ao"); 546 // 1 overlap against 3 547 set.applyPattern("[a-eg-mo-w]"); 548 set2.applyPattern("[d-q]"); 549 set.addAll(set2); 550 expectPairs(set, "aw"); 551 } 552 553 @Test TestAPI()554 public void TestAPI() { 555 // default ct 556 UnicodeSet set = new UnicodeSet(); 557 if (!set.isEmpty() || set.getRangeCount() != 0) { 558 errln("FAIL, set should be empty but isn't: " + 559 set); 560 } 561 562 // clear(), isEmpty() 563 set.add('a'); 564 if (set.isEmpty()) { 565 errln("FAIL, set shouldn't be empty but is: " + 566 set); 567 } 568 set.clear(); 569 if (!set.isEmpty()) { 570 errln("FAIL, set should be empty but isn't: " + 571 set); 572 } 573 574 // size() 575 set.clear(); 576 if (set.size() != 0) { 577 errln("FAIL, size should be 0, but is " + set.size() + 578 ": " + set); 579 } 580 set.add('a'); 581 if (set.size() != 1) { 582 errln("FAIL, size should be 1, but is " + set.size() + 583 ": " + set); 584 } 585 set.add('1', '9'); 586 if (set.size() != 10) { 587 errln("FAIL, size should be 10, but is " + set.size() + 588 ": " + set); 589 } 590 set.clear(); 591 set.complement(); 592 if (set.size() != 0x110000) { 593 errln("FAIL, size should be 0x110000, but is" + set.size()); 594 } 595 596 // contains(first, last) 597 set.clear(); 598 set.applyPattern("[A-Y 1-8 b-d l-y]"); 599 for (int i = 0; i<set.getRangeCount(); ++i) { 600 int a = set.getRangeStart(i); 601 int b = set.getRangeEnd(i); 602 if (!set.contains(a, b)) { 603 errln("FAIL, should contain " + (char)a + '-' + (char)b + 604 " but doesn't: " + set); 605 } 606 if (set.contains((char)(a-1), b)) { 607 errln("FAIL, shouldn't contain " + 608 (char)(a-1) + '-' + (char)b + 609 " but does: " + set); 610 } 611 if (set.contains(a, (char)(b+1))) { 612 errln("FAIL, shouldn't contain " + 613 (char)a + '-' + (char)(b+1) + 614 " but does: " + set); 615 } 616 } 617 618 // Ported InversionList test. 619 UnicodeSet a = new UnicodeSet((char)3,(char)10); 620 UnicodeSet b = new UnicodeSet((char)7,(char)15); 621 UnicodeSet c = new UnicodeSet(); 622 623 logln("a [3-10]: " + a); 624 logln("b [7-15]: " + b); 625 c.set(a); c.addAll(b); 626 UnicodeSet exp = new UnicodeSet((char)3,(char)15); 627 if (c.equals(exp)) { 628 logln("c.set(a).add(b): " + c); 629 } else { 630 errln("FAIL: c.set(a).add(b) = " + c + ", expect " + exp); 631 } 632 c.complement(); 633 exp.set((char)0, (char)2); 634 exp.add((char)16, UnicodeSet.MAX_VALUE); 635 if (c.equals(exp)) { 636 logln("c.complement(): " + c); 637 } else { 638 errln(Utility.escape("FAIL: c.complement() = " + c + ", expect " + exp)); 639 } 640 c.complement(); 641 exp.set((char)3, (char)15); 642 if (c.equals(exp)) { 643 logln("c.complement(): " + c); 644 } else { 645 errln("FAIL: c.complement() = " + c + ", expect " + exp); 646 } 647 c.set(a); c.complementAll(b); 648 exp.set((char)3,(char)6); 649 exp.add((char)11,(char) 15); 650 if (c.equals(exp)) { 651 logln("c.set(a).complement(b): " + c); 652 } else { 653 errln("FAIL: c.set(a).complement(b) = " + c + ", expect " + exp); 654 } 655 656 exp.set(c); 657 c = bitsToSet(setToBits(c)); 658 if (c.equals(exp)) { 659 logln("bitsToSet(setToBits(c)): " + c); 660 } else { 661 errln("FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp); 662 } 663 664 // Additional tests for coverage JB#2118 665 //UnicodeSet::complement(class UnicodeString const &) 666 //UnicodeSet::complementAll(class UnicodeString const &) 667 //UnicodeSet::containsNone(class UnicodeSet const &) 668 //UnicodeSet::containsNone(long,long) 669 //UnicodeSet::containsSome(class UnicodeSet const &) 670 //UnicodeSet::containsSome(long,long) 671 //UnicodeSet::removeAll(class UnicodeString const &) 672 //UnicodeSet::retain(long) 673 //UnicodeSet::retainAll(class UnicodeString const &) 674 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &) 675 //UnicodeSetIterator::getString(void) 676 set.clear(); 677 set.complement("ab"); 678 exp.applyPattern("[{ab}]"); 679 if (!set.equals(exp)) { errln("FAIL: complement(\"ab\")"); return; } 680 681 UnicodeSetIterator iset = new UnicodeSetIterator(set); 682 if (!iset.next() || iset.codepoint != UnicodeSetIterator.IS_STRING) { 683 errln("FAIL: UnicodeSetIterator.next/IS_STRING"); 684 } else if (!iset.string.equals("ab")) { 685 errln("FAIL: UnicodeSetIterator.string"); 686 } 687 688 set.add((char)0x61, (char)0x7A); 689 set.complementAll("alan"); 690 exp.applyPattern("[{ab}b-kmo-z]"); 691 if (!set.equals(exp)) { errln("FAIL: complementAll(\"alan\")"); return; } 692 693 exp.applyPattern("[a-z]"); 694 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 695 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 696 exp.applyPattern("[aln]"); 697 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); } 698 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); } 699 700 if (set.containsNone((char)0x61, (char)0x7A)) { 701 errln("FAIL: containsNone(char, char)"); 702 } 703 if (!set.containsSome((char)0x61, (char)0x7A)) { 704 errln("FAIL: containsSome(char, char)"); 705 } 706 if (!set.containsNone((char)0x41, (char)0x5A)) { 707 errln("FAIL: containsNone(char, char)"); 708 } 709 if (set.containsSome((char)0x41, (char)0x5A)) { 710 errln("FAIL: containsSome(char, char)"); 711 } 712 713 set.removeAll("liu"); 714 exp.applyPattern("[{ab}b-hj-kmo-tv-z]"); 715 if (!set.equals(exp)) { errln("FAIL: removeAll(\"liu\")"); return; } 716 717 set.retainAll("star"); 718 exp.applyPattern("[rst]"); 719 if (!set.equals(exp)) { errln("FAIL: retainAll(\"star\")"); return; } 720 721 set.retain((char)0x73); 722 exp.applyPattern("[s]"); 723 if (!set.equals(exp)) { errln("FAIL: retain('s')"); return; } 724 725 // ICU 2.6 coverage tests 726 // public final UnicodeSet retain(String s); 727 // public final UnicodeSet remove(int c); 728 // public final UnicodeSet remove(String s); 729 // public int hashCode(); 730 set.applyPattern("[a-z{ab}{cd}]"); 731 set.retain("cd"); 732 exp.applyPattern("[{cd}]"); 733 if (!set.equals(exp)) { errln("FAIL: retain(\"cd\")"); return; } 734 735 set.applyPattern("[a-z{ab}{cd}]"); 736 set.remove((char)0x63); 737 exp.applyPattern("[abd-z{ab}{cd}]"); 738 if (!set.equals(exp)) { errln("FAIL: remove('c')"); return; } 739 740 set.remove("cd"); 741 exp.applyPattern("[abd-z{ab}]"); 742 if (!set.equals(exp)) { errln("FAIL: remove(\"cd\")"); return; } 743 744 if (set.hashCode() != exp.hashCode()) { 745 errln("FAIL: hashCode() unequal"); 746 } 747 exp.clear(); 748 if (set.hashCode() == exp.hashCode()) { 749 errln("FAIL: hashCode() equal"); 750 } 751 752 { 753 //Cover addAll(Collection) and addAllTo(Collection) 754 // Seems that there is a bug in addAll(Collection) operation 755 // Ram also add a similar test to UtilityTest.java 756 logln("Testing addAll(Collection) ... "); 757 String[] array = {"a", "b", "c", "de"}; 758 List list = Arrays.asList(array); 759 Set aset = new HashSet(list); 760 logln(" *** The source set's size is: " + aset.size()); 761 762 set.clear(); 763 set.addAll(aset); 764 if (set.size() != aset.size()) { 765 errln("FAIL: After addAll, the UnicodeSet size expected " + aset.size() + 766 ", " + set.size() + " seen instead!"); 767 } else { 768 logln("OK: After addAll, the UnicodeSet size got " + set.size()); 769 } 770 771 List list2 = new ArrayList(); 772 set.addAllTo(list2); 773 774 //verify the result 775 log(" *** The elements are: "); 776 String s = set.toPattern(true); 777 logln(s); 778 Iterator myiter = list2.iterator(); 779 while(myiter.hasNext()) { 780 log(myiter.next().toString() + " "); 781 } 782 logln(""); // a new line 783 } 784 785 } 786 787 @Test TestStrings()788 public void TestStrings() { 789 // Object[][] testList = { 790 // {I_EQUALS, UnicodeSet.fromAll("abc"), 791 // new UnicodeSet("[a-c]")}, 792 // 793 // {I_EQUALS, UnicodeSet.from("ch").add('a','z').add("ll"), 794 // new UnicodeSet("[{ll}{ch}a-z]")}, 795 // 796 // {I_EQUALS, UnicodeSet.from("ab}c"), 797 // new UnicodeSet("[{ab\\}c}]")}, 798 // 799 // {I_EQUALS, new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'), 800 // new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")}, 801 // }; 802 // 803 // for (int i = 0; i < testList.length; ++i) { 804 // expectRelation(testList[i][0], testList[i][1], testList[i][2], "(" + i + ")"); 805 // } 806 807 UnicodeSet[][] testList = { 808 {UnicodeSet.fromAll("abc"), 809 new UnicodeSet("[a-c]")}, 810 811 {UnicodeSet.from("ch").add('a','z').add("ll"), 812 new UnicodeSet("[{ll}{ch}a-z]")}, 813 814 {UnicodeSet.from("ab}c"), 815 new UnicodeSet("[{ab\\}c}]")}, 816 817 {new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'), 818 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")}, 819 }; 820 821 for (int i = 0; i < testList.length; ++i) { 822 if (!testList[i][0].equals(testList[i][1])) { 823 errln("FAIL: sets unequal; see source code (" + i + ")"); 824 } 825 } 826 } 827 828 static final Integer 829 I_ANY = new Integer(SortedSetRelation.ANY), 830 I_CONTAINS = new Integer(SortedSetRelation.CONTAINS), 831 I_DISJOINT = new Integer(SortedSetRelation.DISJOINT), 832 I_NO_B = new Integer(SortedSetRelation.NO_B), 833 I_ISCONTAINED = new Integer(SortedSetRelation.ISCONTAINED), 834 I_EQUALS = new Integer(SortedSetRelation.EQUALS), 835 I_NO_A = new Integer(SortedSetRelation.NO_A), 836 I_NONE = new Integer(SortedSetRelation.NONE); 837 838 @Test TestSetRelation()839 public void TestSetRelation() { 840 841 String[] choices = {"a", "b", "cd", "ef"}; 842 int limit = 1 << choices.length; 843 844 SortedSet iset = new TreeSet(); 845 SortedSet jset = new TreeSet(); 846 847 for (int i = 0; i < limit; ++i) { 848 pick(i, choices, iset); 849 for (int j = 0; j < limit; ++j) { 850 pick(j, choices, jset); 851 checkSetRelation(iset, jset, "(" + i + ")"); 852 } 853 } 854 } 855 856 @Test TestSetSpeed()857 public void TestSetSpeed() { 858 // skip unless verbose 859 if (!isVerbose()) return; 860 861 SetSpeed2(100); 862 SetSpeed2(1000); 863 } 864 SetSpeed2(int size)865 public void SetSpeed2(int size) { 866 867 SortedSet iset = new TreeSet(); 868 SortedSet jset = new TreeSet(); 869 870 for (int i = 0; i < size*2; i += 2) { // only even values 871 iset.add(new Integer(i)); 872 jset.add(new Integer(i)); 873 } 874 875 int iterations = 1000000 / size; 876 877 logln("Timing comparison of Java vs Utility"); 878 logln("For about " + size + " objects that are almost all the same."); 879 880 CheckSpeed(iset, jset, "when a = b", iterations); 881 882 iset.add(new Integer(size + 1)); // add odd value in middle 883 884 CheckSpeed(iset, jset, "when a contains b", iterations); 885 CheckSpeed(jset, iset, "when b contains a", iterations); 886 887 jset.add(new Integer(size - 1)); // add different odd value in middle 888 889 CheckSpeed(jset, iset, "when a, b are disjoint", iterations); 890 } 891 CheckSpeed(SortedSet iset, SortedSet jset, String message, int iterations)892 void CheckSpeed(SortedSet iset, SortedSet jset, String message, int iterations) { 893 CheckSpeed2(iset, jset, message, iterations); 894 CheckSpeed3(iset, jset, message, iterations); 895 } 896 CheckSpeed2(SortedSet iset, SortedSet jset, String message, int iterations)897 void CheckSpeed2(SortedSet iset, SortedSet jset, String message, int iterations) { 898 boolean x; 899 boolean y; 900 901 // make sure code is loaded: 902 x = iset.containsAll(jset); 903 y = SortedSetRelation.hasRelation(iset, SortedSetRelation.CONTAINS, jset); 904 if (x != y) errln("FAIL contains comparison"); 905 906 double start = System.currentTimeMillis(); 907 for (int i = 0; i < iterations; ++i) { 908 x |= iset.containsAll(jset); 909 } 910 double middle = System.currentTimeMillis(); 911 for (int i = 0; i < iterations; ++i) { 912 y |= SortedSetRelation.hasRelation(iset, SortedSetRelation.CONTAINS, jset); 913 } 914 double end = System.currentTimeMillis(); 915 916 double jtime = (middle - start)/iterations; 917 double utime = (end - middle)/iterations; 918 919 NumberFormat nf = NumberFormat.getPercentInstance(); 920 logln("Test contains: " + message + ": Java: " + jtime 921 + ", Utility: " + utime + ", u:j: " + nf.format(utime/jtime)); 922 } 923 CheckSpeed3(SortedSet iset, SortedSet jset, String message, int iterations)924 void CheckSpeed3(SortedSet iset, SortedSet jset, String message, int iterations) { 925 boolean x; 926 boolean y; 927 928 // make sure code is loaded: 929 x = iset.equals(jset); 930 y = SortedSetRelation.hasRelation(iset, SortedSetRelation.EQUALS, jset); 931 if (x != y) errln("FAIL equality comparison"); 932 933 934 double start = System.currentTimeMillis(); 935 for (int i = 0; i < iterations; ++i) { 936 x |= iset.equals(jset); 937 } 938 double middle = System.currentTimeMillis(); 939 for (int i = 0; i < iterations; ++i) { 940 y |= SortedSetRelation.hasRelation(iset, SortedSetRelation.EQUALS, jset); 941 } 942 double end = System.currentTimeMillis(); 943 944 double jtime = (middle - start)/iterations; 945 double utime = (end - middle)/iterations; 946 947 NumberFormat nf = NumberFormat.getPercentInstance(); 948 logln("Test equals: " + message + ": Java: " + jtime 949 + ", Utility: " + utime + ", u:j: " + nf.format(utime/jtime)); 950 } 951 pick(int bits, Object[] examples, SortedSet output)952 void pick(int bits, Object[] examples, SortedSet output) { 953 output.clear(); 954 for (int k = 0; k < 32; ++k) { 955 if (((1<<k) & bits) != 0) output.add(examples[k]); 956 } 957 } 958 959 public static final String[] RELATION_NAME = { 960 "both-are-null", 961 "a-is-null", 962 "equals", 963 "is-contained-in", 964 "b-is-null", 965 "is-disjoint_with", 966 "contains", 967 "any", }; 968 dumbHasRelation(Collection A, int filter, Collection B)969 boolean dumbHasRelation(Collection A, int filter, Collection B) { 970 Collection ab = new TreeSet(A); 971 ab.retainAll(B); 972 if (ab.size() > 0 && (filter & SortedSetRelation.A_AND_B) == 0) return false; 973 974 // A - B size == A.size - A&B.size 975 if (A.size() > ab.size() && (filter & SortedSetRelation.A_NOT_B) == 0) return false; 976 977 // B - A size == B.size - A&B.size 978 if (B.size() > ab.size() && (filter & SortedSetRelation.B_NOT_A) == 0) return false; 979 980 981 return true; 982 } 983 checkSetRelation(SortedSet a, SortedSet b, String message)984 void checkSetRelation(SortedSet a, SortedSet b, String message) { 985 for (int i = 0; i < 8; ++i) { 986 987 boolean hasRelation = SortedSetRelation.hasRelation(a, i, b); 988 boolean dumbHasRelation = dumbHasRelation(a, i, b); 989 990 logln(message + " " + hasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); 991 992 if (hasRelation != dumbHasRelation) { 993 errln("FAIL: " + 994 message + " " + dumbHasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); 995 } 996 } 997 logln(""); 998 } 999 1000 /** 1001 * Test the [:Latin:] syntax. 1002 */ 1003 @Test TestScriptSet()1004 public void TestScriptSet() { 1005 1006 expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1")); 1007 1008 expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA"); 1009 1010 /* Jitterbug 1423 */ 1011 expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); 1012 1013 } 1014 1015 /** 1016 * Test the [:Latin:] syntax. 1017 */ 1018 @Test TestPropertySet()1019 public void TestPropertySet() { 1020 String[] DATA = { 1021 // Pattern, Chars IN, Chars NOT in 1022 1023 "[:Latin:]", 1024 "aA", 1025 "\u0391\u03B1", 1026 1027 "[\\p{Greek}]", 1028 "\u0391\u03B1", 1029 "aA", 1030 1031 "\\P{ GENERAL Category = upper case letter }", 1032 "abc", 1033 "ABC", 1034 1035 // Combining class: @since ICU 2.2 1036 // Check both symbolic and numeric 1037 "\\p{ccc=Nukta}", 1038 "\u0ABC", 1039 "abc", 1040 1041 "\\p{Canonical Combining Class = 11}", 1042 "\u05B1", 1043 "\u05B2", 1044 1045 "[:c c c = iota subscript :]", 1046 "\u0345", 1047 "xyz", 1048 1049 // Bidi class: @since ICU 2.2 1050 "\\p{bidiclass=lefttoright}", 1051 "abc", 1052 "\u0671\u0672", 1053 1054 // Binary properties: @since ICU 2.2 1055 "\\p{ideographic}", 1056 "\u4E0A", 1057 "x", 1058 1059 "[:math=false:]", 1060 "q)*(", // )(and * were removed from math in Unicode 4.0.1 1061 "+<>^", 1062 1063 // JB#1767 \N{}, \p{ASCII} 1064 "[:Ascii:]", 1065 "abc\u0000\u007F", 1066 "\u0080\u4E00", 1067 1068 "[\\N{ latin small letter a }[:name= latin small letter z:]]", 1069 "az", 1070 "qrs", 1071 1072 // JB#2015 1073 "[:any:]", 1074 "a\\U0010FFFF", 1075 "", 1076 1077 "[:nv=0.5:]", 1078 "\u00BD\u0F2A", 1079 "\u00BC", 1080 1081 // JB#2653: Age 1082 "[:Age=1.1:]", 1083 "\u03D6", // 1.1 1084 "\u03D8\u03D9", // 3.2 1085 1086 "[:Age=3.1:]", 1087 "\\u1800\\u3400\\U0002f800", 1088 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", 1089 1090 // JB#2350: Case_Sensitive 1091 "[:Case Sensitive:]", 1092 "A\u1FFC\\U00010410", 1093 ";\u00B4\\U00010500", 1094 1095 1096 // Regex compatibility test 1097 "[-b]", // leading '-' is literal 1098 "-b", 1099 "ac", 1100 1101 "[^-b]", // leading '-' is literal 1102 "ac", 1103 "-b", 1104 1105 "[b-]", // trailing '-' is literal 1106 "-b", 1107 "ac", 1108 1109 "[^b-]", // trailing '-' is literal 1110 "ac", 1111 "-b", 1112 1113 "[a-b-]", // trailing '-' is literal 1114 "ab-", 1115 "c=", 1116 1117 "[[a-q]&[p-z]-]", // trailing '-' is literal 1118 "pq-", 1119 "or=", 1120 1121 "[\\s|\\)|:|$|\\>]", // from regex tests 1122 "s|):$>", 1123 "\\abc", 1124 1125 "[\uDC00cd]", // JB#2906: isolated trail at start 1126 "cd\uDC00", 1127 "ab\uD800\\U00010000", 1128 1129 "[ab\uD800]", // JB#2906: isolated trail at start 1130 "ab\uD800", 1131 "cd\uDC00\\U00010000", 1132 1133 "[ab\uD800cd]", // JB#2906: isolated lead in middle 1134 "abcd\uD800", 1135 "ef\uDC00\\U00010000", 1136 1137 "[ab\uDC00cd]", // JB#2906: isolated trail in middle 1138 "abcd\uDC00", 1139 "ef\uD800\\U00010000", 1140 1141 "[:^lccc=0:]", // Lead canonical class 1142 "\u0300\u0301", 1143 "abcd\u00c0\u00c5", 1144 1145 "[:^tccc=0:]", // Trail canonical class 1146 "\u0300\u0301\u00c0\u00c5", 1147 "abcd", 1148 1149 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class 1150 "\u0300\u0301\u00c0\u00c5", 1151 "abcd", 1152 1153 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) 1154 "", 1155 "abcd\u0300\u0301\u00c0\u00c5", 1156 1157 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not 1158 "\u0F73\u0F75\u0F81", 1159 "abcd\u0300\u0301\u00c0\u00c5", 1160 1161 "[:Assigned:]", 1162 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD", 1163 "\\u0888\\uFDD3\\uFFFE\\U00050005", 1164 1165 // Script_Extensions, new in Unicode 6.0 1166 "[:scx=Arab:]", 1167 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3", 1168 "\\u061D\\uFDEF\\uFDFE", 1169 1170 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions, 1171 // so scx-sc is missing U+FDF2. 1172 "[[:Script_Extensions=Arabic:]-[:Arab:]]", 1173 "\\u0640\\u064B\\u0650\\u0655", 1174 "\\uFDF2" 1175 }; 1176 1177 for (int i=0; i<DATA.length; i+=3) { 1178 expectContainment(DATA[i], DATA[i+1], DATA[i+2]); 1179 } 1180 } 1181 1182 @Test TestUnicodeSetStrings()1183 public void TestUnicodeSetStrings() { 1184 UnicodeSet uset = new UnicodeSet("[a{bc}{cd}pqr\u0000]"); 1185 logln(uset + " ~ " + uset.getRegexEquivalent()); 1186 String[][] testStrings = {{"x", "none"}, 1187 {"bc", "all"}, 1188 {"cdbca", "all"}, 1189 {"a", "all"}, 1190 {"bcx", "some"}, 1191 {"ab", "some"}, 1192 {"acb", "some"}, 1193 {"bcda", "some"}, 1194 {"dccbx", "none"}, 1195 }; 1196 for (int i = 0; i < testStrings.length; ++i) { 1197 check(uset, testStrings[i][0], testStrings[i][1]); 1198 } 1199 } 1200 1201 check(UnicodeSet uset, String string, String desiredStatus)1202 private void check(UnicodeSet uset, String string, String desiredStatus) { 1203 boolean shouldContainAll = desiredStatus.equals("all"); 1204 boolean shouldContainNone = desiredStatus.equals("none"); 1205 if (uset.containsAll(string) != shouldContainAll) { 1206 errln("containsAll " + string + " should be " + shouldContainAll); 1207 } else { 1208 logln("containsAll " + string + " = " + shouldContainAll); 1209 } 1210 if (uset.containsNone(string) != shouldContainNone) { 1211 errln("containsNone " + string + " should be " + shouldContainNone); 1212 } else { 1213 logln("containsNone " + string + " = " + shouldContainNone); 1214 } 1215 } 1216 1217 /** 1218 * Test cloning of UnicodeSet 1219 */ 1220 @Test TestClone()1221 public void TestClone() { 1222 UnicodeSet s = new UnicodeSet("[abcxyz]"); 1223 UnicodeSet t = (UnicodeSet) s.clone(); 1224 expectContainment(t, "abc", "def"); 1225 } 1226 1227 /** 1228 * Test the indexOf() and charAt() methods. 1229 */ 1230 @Test TestIndexOf()1231 public void TestIndexOf() { 1232 UnicodeSet set = new UnicodeSet("[a-cx-y3578]"); 1233 for (int i=0; i<set.size(); ++i) { 1234 int c = set.charAt(i); 1235 if (set.indexOf(c) != i) { 1236 errln("FAIL: charAt(" + i + ") = " + c + 1237 " => indexOf() => " + set.indexOf(c)); 1238 } 1239 } 1240 int c = set.charAt(set.size()); 1241 if (c != -1) { 1242 errln("FAIL: charAt(<out of range>) = " + 1243 Utility.escape(String.valueOf(c))); 1244 } 1245 int j = set.indexOf('q'); 1246 if (j != -1) { 1247 errln("FAIL: indexOf('q') = " + j); 1248 } 1249 } 1250 1251 @Test TestContainsString()1252 public void TestContainsString() { 1253 UnicodeSet x = new UnicodeSet("[a{bc}]"); 1254 if (x.contains("abc")) errln("FAIL"); 1255 } 1256 1257 @Test TestExhaustive()1258 public void TestExhaustive() { 1259 // exhaustive tests. Simulate UnicodeSets with integers. 1260 // That gives us very solid tests (except for large memory tests). 1261 1262 char limit = (char)128; 1263 1264 for (char i = 0; i < limit; ++i) { 1265 logln("Testing " + i + ", " + bitsToSet(i)); 1266 _testComplement(i); 1267 1268 // AS LONG AS WE ARE HERE, check roundtrip 1269 checkRoundTrip(bitsToSet(i)); 1270 1271 for (char j = 0; j < limit; ++j) { 1272 _testAdd(i,j); 1273 _testXor(i,j); 1274 _testRetain(i,j); 1275 _testRemove(i,j); 1276 } 1277 } 1278 } 1279 1280 /** 1281 * Make sure each script name and abbreviated name can be used 1282 * to construct a UnicodeSet. 1283 */ 1284 @Test TestScriptNames()1285 public void TestScriptNames() { 1286 for (int i=0; i<UScript.CODE_LIMIT; ++i) { 1287 for (int j=0; j<2; ++j) { 1288 String pat = ""; 1289 try { 1290 String name = 1291 (j==0) ? UScript.getName(i) : UScript.getShortName(i); 1292 pat = "[:" + name + ":]"; 1293 UnicodeSet set = new UnicodeSet(pat); 1294 logln("Ok: " + pat + " -> " + set.toPattern(false)); 1295 } catch (IllegalArgumentException e) { 1296 if (pat.length() == 0) { 1297 errln("FAIL (in UScript): No name for script " + i); 1298 } else { 1299 errln("FAIL: Couldn't create " + pat); 1300 } 1301 } 1302 } 1303 } 1304 } 1305 1306 /** 1307 * Test closure API. 1308 */ 1309 @Test TestCloseOver()1310 public void TestCloseOver() { 1311 String CASE = String.valueOf(UnicodeSet.CASE); 1312 String[] DATA = { 1313 // selector, input, output 1314 CASE, 1315 "[aq\u00DF{Bc}{bC}{Fi}]", 1316 "[aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 1317 1318 CASE, 1319 "[\u01F1]", // 'DZ' 1320 "[\u01F1\u01F2\u01F3]", 1321 1322 CASE, 1323 "[\u1FB4]", 1324 "[\u1FB4{\u03AC\u03B9}]", 1325 1326 CASE, 1327 "[{F\uFB01}]", 1328 "[\uFB03{ffi}]", 1329 1330 CASE, 1331 "[a-z]","[A-Za-z\u017F\u212A]", 1332 CASE, 1333 "[abc]","[A-Ca-c]", 1334 CASE, 1335 "[ABC]","[A-Ca-c]", 1336 }; 1337 1338 UnicodeSet s = new UnicodeSet(); 1339 UnicodeSet t = new UnicodeSet(); 1340 for (int i=0; i<DATA.length; i+=3) { 1341 int selector = Integer.parseInt(DATA[i]); 1342 String pat = DATA[i+1]; 1343 String exp = DATA[i+2]; 1344 s.applyPattern(pat); 1345 s.closeOver(selector); 1346 t.applyPattern(exp); 1347 if (s.equals(t)) { 1348 logln("Ok: " + pat + ".closeOver(" + selector + ") => " + exp); 1349 } else { 1350 errln("FAIL: " + pat + ".closeOver(" + selector + ") => " + 1351 s.toPattern(true) + ", expected " + exp); 1352 } 1353 } 1354 1355 // Test the pattern API 1356 s.applyPattern("[abc]", UnicodeSet.CASE); 1357 expectContainment(s, "abcABC", "defDEF"); 1358 s = new UnicodeSet("[^abc]", UnicodeSet.CASE); 1359 expectContainment(s, "defDEF", "abcABC"); 1360 } 1361 1362 @Test TestEscapePattern()1363 public void TestEscapePattern() { 1364 // The following pattern must contain at least one range "c-d" 1365 // where c or d is a Pattern_White_Space. 1366 String pattern = 1367 "[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; 1368 String exp = 1369 "[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; 1370 // We test this with two passes; in the second pass we 1371 // pre-unescape the pattern. Since U+200E is Pattern_White_Space, 1372 // this fails -- which is what we expect. 1373 for (int pass=1; pass<=2; ++pass) { 1374 String pat = pattern; 1375 if (pass==2) { 1376 pat = Utility.unescape(pat); 1377 } 1378 // Pattern is only good for pass 1 1379 boolean isPatternValid = (pass==1); 1380 1381 UnicodeSet set = null; 1382 try { 1383 set = new UnicodeSet(pat); 1384 } catch (IllegalArgumentException e) { 1385 set = null; 1386 } 1387 if ((set != null) != isPatternValid){ 1388 errln("FAIL: applyPattern(" + 1389 Utility.escape(pat) + ") => " + set); 1390 continue; 1391 } 1392 if (set == null) { 1393 continue; 1394 } 1395 if (set.contains((char)0x0644)){ 1396 errln("FAIL: " + Utility.escape(pat) + " contains(U+0664)"); 1397 } 1398 1399 String newpat = set.toPattern(true); 1400 if (newpat.equals(exp)) { 1401 logln(Utility.escape(pat) + " => " + newpat); 1402 } else { 1403 errln("FAIL: " + Utility.escape(pat) + " => " + newpat); 1404 } 1405 1406 for (int i=0; i<set.getRangeCount(); ++i) { 1407 StringBuffer str = new StringBuffer("Range "); 1408 str.append((char)(0x30 + i)) 1409 .append(": "); 1410 UTF16.append(str, set.getRangeStart(i)); 1411 str.append(" - "); 1412 UTF16.append(str, set.getRangeEnd(i)); 1413 String s = Utility.escape(str.toString() + " (" + set.getRangeStart(i) + " - " + 1414 set.getRangeEnd(i) + ")"); 1415 if (set.getRangeStart(i) < 0) { 1416 errln("FAIL: " + s); 1417 } else { 1418 logln(s); 1419 } 1420 } 1421 } 1422 } 1423 1424 @Test TestSymbolTable()1425 public void TestSymbolTable() { 1426 // Multiple test cases can be set up here. Each test case 1427 // is terminated by null: 1428 // var, value, var, value,..., input pat., exp. output pat., null 1429 String DATA[] = { 1430 "us", "a-z", "[0-1$us]", "[0-1a-z]", null, 1431 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", null, 1432 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", null 1433 }; 1434 1435 for (int i=0; i<DATA.length; ++i) { 1436 TokenSymbolTable sym = new TokenSymbolTable(); 1437 1438 // Set up variables 1439 while (DATA[i+2] != null) { 1440 sym.add(DATA[i], DATA[i+1]); 1441 i += 2; 1442 } 1443 1444 // Input pattern and expected output pattern 1445 String inpat = DATA[i], exppat = DATA[i+1]; 1446 i += 2; 1447 1448 ParsePosition pos = new ParsePosition(0); 1449 UnicodeSet us = new UnicodeSet(inpat, pos, sym); 1450 1451 // results 1452 if (pos.getIndex() != inpat.length()) { 1453 errln("Failed to read to end of string \"" 1454 + inpat + "\": read to " 1455 + pos.getIndex() + ", length is " 1456 + inpat.length()); 1457 } 1458 1459 UnicodeSet us2 = new UnicodeSet(exppat); 1460 if (!us.equals(us2)) { 1461 errln("Failed, got " + us + ", expected " + us2); 1462 } else { 1463 logln("Ok, got " + us); 1464 } 1465 1466 //cover Unicode(String,ParsePosition,SymbolTable,int) 1467 ParsePosition inpos = new ParsePosition(0); 1468 UnicodeSet inSet = new UnicodeSet(inpat, inpos, sym, UnicodeSet.IGNORE_SPACE); 1469 UnicodeSet expSet = new UnicodeSet(exppat); 1470 if (!inSet.equals(expSet)) { 1471 errln("FAIL: Failed, got " + inSet + ", expected " + expSet); 1472 } else { 1473 logln("OK: got " + inSet); 1474 } 1475 } 1476 } 1477 1478 /** 1479 * Test that Posix style character classes [:digit:], etc. 1480 * have the Unicode definitions from TR 18. 1481 */ 1482 @Test TestPosixClasses()1483 public void TestPosixClasses() { 1484 expectEqual("POSIX alpha", "[:alpha:]", "\\p{Alphabetic}"); 1485 expectEqual("POSIX lower", "[:lower:]", "\\p{lowercase}"); 1486 expectEqual("POSIX upper", "[:upper:]", "\\p{Uppercase}"); 1487 expectEqual("POSIX punct", "[:punct:]", "\\p{gc=Punctuation}"); 1488 expectEqual("POSIX digit", "[:digit:]", "\\p{gc=DecimalNumber}"); 1489 expectEqual("POSIX xdigit", "[:xdigit:]", "[\\p{DecimalNumber}\\p{HexDigit}]"); 1490 expectEqual("POSIX alnum", "[:alnum:]", "[\\p{Alphabetic}\\p{DecimalNumber}]"); 1491 expectEqual("POSIX space", "[:space:]", "\\p{Whitespace}"); 1492 expectEqual("POSIX blank", "[:blank:]", "[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"); 1493 expectEqual("POSIX cntrl", "[:cntrl:]", "\\p{Control}"); 1494 expectEqual("POSIX graph", "[:graph:]", "[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"); 1495 expectEqual("POSIX print", "[:print:]", "[[:graph:][:blank:]-[\\p{Control}]]"); 1496 } 1497 1498 @Test TestHangulSyllable()1499 public void TestHangulSyllable() { 1500 final UnicodeSet lvt = new UnicodeSet("[:Hangul_Syllable_Type=LVT_Syllable:]"); 1501 assertNotEquals("LVT count", new UnicodeSet(), lvt); 1502 logln(lvt + ": " + lvt.size()); 1503 final UnicodeSet lv = new UnicodeSet("[:Hangul_Syllable_Type=LV_Syllable:]"); 1504 assertNotEquals("LV count", new UnicodeSet(), lv); 1505 logln(lv + ": " + lv.size()); 1506 } 1507 1508 /** 1509 * Test that frozen classes disallow changes. For 4217 1510 */ 1511 @Test TestFrozen()1512 public void TestFrozen() { 1513 UnicodeSet test = new UnicodeSet("[[:whitespace:]A]"); 1514 test.freeze(); 1515 checkModification(test, true); 1516 checkModification(test, false); 1517 } 1518 1519 /** 1520 * Test Generic support 1521 */ 1522 @Test TestGenerics()1523 public void TestGenerics() { 1524 UnicodeSet set1 = new UnicodeSet("[a-b d-g {ch} {zh}]").freeze(); 1525 UnicodeSet set2 = new UnicodeSet("[e-f {ch}]").freeze(); 1526 UnicodeSet set3 = new UnicodeSet("[d m-n {dh}]").freeze(); 1527 // A useful range of sets for testing, including both characters and strings 1528 // set 1 contains set2 1529 // set 1 is overlaps with set 3 1530 // set 2 is disjoint with set 3 1531 1532 //public Iterator<String> iterator() { 1533 1534 ArrayList<String> oldList = new ArrayList<String>(); 1535 for (UnicodeSetIterator it = new UnicodeSetIterator(set1); it.next();) { 1536 oldList.add(it.getString()); 1537 } 1538 1539 ArrayList<String> list1 = new ArrayList<String>(); 1540 for (String s : set1) { 1541 list1.add(s); 1542 } 1543 assertEquals("iteration test", oldList, list1); 1544 1545 //addAllTo(Iterable<T>, U) 1546 list1.clear(); 1547 set1.addAllTo(list1); 1548 assertEquals("iteration test", oldList, list1); 1549 1550 list1 = set1.addAllTo(new ArrayList<String>()); 1551 assertEquals("addAllTo", oldList, list1); 1552 1553 ArrayList<String> list2 = set2.addAllTo(new ArrayList<String>()); 1554 ArrayList<String> list3 = set3.addAllTo(new ArrayList<String>()); 1555 1556 // put them into different order, to check that order doesn't matter 1557 TreeSet sorted1 = set1.addAllTo(new TreeSet<String>()); 1558 TreeSet sorted2 = set2.addAllTo(new TreeSet<String>()); 1559 TreeSet sorted3 = set3.addAllTo(new TreeSet<String>()); 1560 1561 //containsAll(Collection<String> collection) 1562 assertTrue("containsAll", set1.containsAll(list1)); 1563 assertTrue("containsAll", set1.containsAll(sorted1)); 1564 assertTrue("containsAll", set1.containsAll(list2)); 1565 assertTrue("containsAll", set1.containsAll(sorted2)); 1566 assertFalse("containsAll", set1.containsAll(list3)); 1567 assertFalse("containsAll", set1.containsAll(sorted3)); 1568 assertFalse("containsAll", set2.containsAll(list3)); 1569 assertFalse("containsAll", set2.containsAll(sorted3)); 1570 1571 //containsSome(Collection<String>) 1572 assertTrue("containsSome", set1.containsSome(list1)); 1573 assertTrue("containsSome", set1.containsSome(sorted1)); 1574 assertTrue("containsSome", set1.containsSome(list2)); 1575 assertTrue("containsSome", set1.containsSome(sorted2)); 1576 assertTrue("containsSome", set1.containsSome(list3)); 1577 assertTrue("containsSome", set1.containsSome(sorted3)); 1578 assertFalse("containsSome", set2.containsSome(list3)); 1579 assertFalse("containsSome", set2.containsSome(sorted3)); 1580 1581 //containsNone(Collection<String>) 1582 assertFalse("containsNone", set1.containsNone(list1)); 1583 assertFalse("containsNone", set1.containsNone(sorted1)); 1584 assertFalse("containsNone", set1.containsNone(list2)); 1585 assertFalse("containsNone", set1.containsNone(sorted2)); 1586 assertFalse("containsNone", set1.containsNone(list3)); 1587 assertFalse("containsNone", set1.containsNone(sorted3)); 1588 assertTrue("containsNone", set2.containsNone(list3)); 1589 assertTrue("containsNone", set2.containsNone(sorted3)); 1590 1591 //addAll(String...) 1592 UnicodeSet other3 = new UnicodeSet().addAll("d", "m", "n", "dh"); 1593 assertEquals("addAll", set3, other3); 1594 1595 //removeAll(Collection<String>) 1596 UnicodeSet mod1 = new UnicodeSet(set1).removeAll(set2); 1597 UnicodeSet mod2 = new UnicodeSet(set1).removeAll(list2); 1598 assertEquals("remove all", mod1, mod2); 1599 1600 //retainAll(Collection<String>) 1601 mod1 = new UnicodeSet(set1).retainAll(set2); 1602 mod2 = new UnicodeSet(set1).retainAll(set2.addAllTo(new LinkedHashSet<String>())); 1603 assertEquals("remove all", mod1, mod2); 1604 } 1605 1606 @Test TestComparison()1607 public void TestComparison() { 1608 UnicodeSet set1 = new UnicodeSet("[a-b d-g {ch} {zh}]").freeze(); 1609 UnicodeSet set2 = new UnicodeSet("[c-e {ch}]").freeze(); 1610 UnicodeSet set3 = new UnicodeSet("[d m-n z {dh}]").freeze(); 1611 1612 //compareTo(UnicodeSet) 1613 // do indirectly, by sorting 1614 List<UnicodeSet> unsorted = Arrays.asList(set3, set2, set1); 1615 List<UnicodeSet> goalShortest = Arrays.asList(set2, set3, set1); 1616 List<UnicodeSet> goalLongest = Arrays.asList(set1, set3, set2); 1617 List<UnicodeSet> goalLex = Arrays.asList(set1, set2, set3); 1618 1619 List<UnicodeSet> sorted = new ArrayList(new TreeSet<UnicodeSet>(unsorted)); 1620 assertNotEquals("compareTo-shorter-first", unsorted, sorted); 1621 assertEquals("compareTo-shorter-first", goalShortest, sorted); 1622 1623 TreeSet<UnicodeSet> sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){ 1624 @Override 1625 public int compare(UnicodeSet o1, UnicodeSet o2) { 1626 // TODO Auto-generated method stub 1627 return o1.compareTo(o2, ComparisonStyle.LONGER_FIRST); 1628 }}); 1629 sorted1.addAll(unsorted); 1630 sorted = new ArrayList(sorted1); 1631 assertNotEquals("compareTo-longer-first", unsorted, sorted); 1632 assertEquals("compareTo-longer-first", goalLongest, sorted); 1633 1634 sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){ 1635 @Override 1636 public int compare(UnicodeSet o1, UnicodeSet o2) { 1637 // TODO Auto-generated method stub 1638 return o1.compareTo(o2, ComparisonStyle.LEXICOGRAPHIC); 1639 }}); 1640 sorted1.addAll(unsorted); 1641 sorted = new ArrayList(sorted1); 1642 assertNotEquals("compareTo-lex", unsorted, sorted); 1643 assertEquals("compareTo-lex", goalLex, sorted); 1644 1645 //compare(String, int) 1646 // make a list of interesting combinations 1647 List<String> sources = Arrays.asList("\u0000", "a", "b", "\uD7FF", "\uD800", "\uDBFF", "\uDC00", "\uDFFF", "\uE000", "\uFFFD", "\uFFFF"); 1648 TreeSet<String> target = new TreeSet<String>(); 1649 for (String s : sources) { 1650 target.add(s); 1651 for (String t : sources) { 1652 target.add(s + t); 1653 for (String u : sources) { 1654 target.add(s + t + u); 1655 } 1656 } 1657 } 1658 // now compare all the combinations. If any of them is a code point, use it. 1659 int maxErrorCount = 0; 1660 compare: 1661 for (String last : target) { 1662 for (String curr : target) { 1663 int lastCount = Character.codePointCount(last, 0, last.length()); 1664 int currCount = Character.codePointCount(curr, 0, curr.length()); 1665 int comparison; 1666 if (lastCount == 1) { 1667 comparison = UnicodeSet.compare(last.codePointAt(0), curr); 1668 } else if (currCount == 1) { 1669 comparison = UnicodeSet.compare(last, curr.codePointAt(0)); 1670 } else { 1671 continue; 1672 } 1673 if (comparison != last.compareTo(curr)) { 1674 // repeat for debugging 1675 if (lastCount == 1) { 1676 comparison = UnicodeSet.compare(last.codePointAt(0), curr); 1677 } else if (currCount == 1) { 1678 comparison = UnicodeSet.compare(last, curr.codePointAt(0)); 1679 } 1680 if (maxErrorCount++ > 10) { 1681 errln(maxErrorCount + " Failure in comparing " + last + " & " + curr + "\tOmitting others..."); 1682 break compare; 1683 } 1684 errln(maxErrorCount + " Failure in comparing " + last + " & " + curr); 1685 } 1686 } 1687 } 1688 1689 //compare(Iterable<T>, Iterable<T>) 1690 int max = 10; 1691 List<String> test1 = new ArrayList<String>(max); 1692 List<String> test2 = new ArrayList<String>(max); 1693 for (int i = 0; i <= max; ++i) { 1694 test1.add("a" + i); 1695 test2.add("a" + (max - i)); // add in reverse order 1696 } 1697 assertNotEquals("compare iterable test", test1, test2); 1698 TreeSet<CharSequence> sortedTest1 = new TreeSet<CharSequence>(test1); 1699 TreeSet<CharSequence> sortedTest2 = new TreeSet<CharSequence>(test2); 1700 assertEquals("compare iterable test", sortedTest1, sortedTest2); 1701 } 1702 1703 @Test TestRangeConstructor()1704 public void TestRangeConstructor() { 1705 UnicodeSet w = new UnicodeSet().addAll(3,5); 1706 UnicodeSet s = new UnicodeSet(3,5); 1707 assertEquals("new constructor", w, s); 1708 1709 w = new UnicodeSet().addAll(3,5).addAll(7,7); 1710 UnicodeSet t = new UnicodeSet(3,5, 7,7); 1711 assertEquals("new constructor", w, t); 1712 // check to make sure right exceptions are thrown 1713 Class expected = IllegalArgumentException.class; 1714 Class actual; 1715 1716 try { 1717 actual = null; 1718 @SuppressWarnings("unused") 1719 UnicodeSet u = new UnicodeSet(5); 1720 } catch (IllegalArgumentException e) { 1721 actual = e.getClass(); 1722 } 1723 assertEquals("exception if odd", expected, actual); 1724 1725 try { 1726 actual = null; 1727 @SuppressWarnings("unused") 1728 UnicodeSet u = new UnicodeSet(3, 2, 7, 9); 1729 } catch (IllegalArgumentException e) { 1730 actual = e.getClass(); 1731 } 1732 assertEquals("exception for start/end problem", expected, actual); 1733 1734 try { 1735 actual = null; 1736 @SuppressWarnings("unused") 1737 UnicodeSet u = new UnicodeSet(3, 5, 6, 9); 1738 } catch (IllegalArgumentException e) { 1739 actual = e.getClass(); 1740 } 1741 assertEquals("exception for end/start problem", expected, actual); 1742 1743 CheckRangeSpeed(10000, new UnicodeSet("[:whitespace:]")); 1744 CheckRangeSpeed(1000, new UnicodeSet("[:letter:]")); 1745 } 1746 1747 /** 1748 * @param iterations 1749 * @param testSet 1750 */ CheckRangeSpeed(int iterations, UnicodeSet testSet)1751 private void CheckRangeSpeed(int iterations, UnicodeSet testSet) { 1752 testSet.complement().complement(); 1753 String testPattern = testSet.toString(); 1754 // fill a set of pairs from the pattern 1755 int[] pairs = new int[testSet.getRangeCount()*2]; 1756 int j = 0; 1757 for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.nextRange();) { 1758 pairs[j++] = it.codepoint; 1759 pairs[j++] = it.codepointEnd; 1760 } 1761 UnicodeSet fromRange = new UnicodeSet(testSet); 1762 assertEquals("from range vs pattern", testSet, fromRange); 1763 1764 double start = System.currentTimeMillis(); 1765 for (int i = 0; i < iterations; ++i) { 1766 fromRange = new UnicodeSet(testSet); 1767 } 1768 double middle = System.currentTimeMillis(); 1769 for (int i = 0; i < iterations; ++i) { 1770 new UnicodeSet(testPattern); 1771 } 1772 double end = System.currentTimeMillis(); 1773 1774 double rangeConstructorTime = (middle - start)/iterations; 1775 double patternConstructorTime = (end - middle)/iterations; 1776 String message = "Range constructor:\t" + rangeConstructorTime + ";\tPattern constructor:\t" + patternConstructorTime + "\t\t" 1777 + percent.format(rangeConstructorTime/patternConstructorTime-1); 1778 if (rangeConstructorTime < 2*patternConstructorTime) { 1779 logln(message); 1780 } else { 1781 errln(message); 1782 } 1783 } 1784 1785 NumberFormat percent = NumberFormat.getPercentInstance(); 1786 { 1787 percent.setMaximumFractionDigits(2); 1788 } 1789 // **************************************** 1790 // UTILITIES 1791 // **************************************** 1792 checkModification(UnicodeSet original, boolean isFrozen)1793 public void checkModification(UnicodeSet original, boolean isFrozen) { 1794 main: 1795 for (int i = 0; ;++i) { 1796 UnicodeSet test = (UnicodeSet) (isFrozen ? original.clone() : original.cloneAsThawed()); 1797 boolean gotException = true; 1798 boolean checkEquals = true; 1799 try { 1800 switch(i) { 1801 case 0: test.add(0); break; 1802 case 1: test.add(0,1); break; 1803 case 2: test.add("a"); break; 1804 case 3: List a = new ArrayList(); a.add("a"); test.addAll(a); break; 1805 case 4: test.addAll("ab"); break; 1806 case 5: test.addAll(new UnicodeSet("[ab]")); break; 1807 case 6: test.applyIntPropertyValue(0,0); break; 1808 case 7: test.applyPattern("[ab]"); break; 1809 case 8: test.applyPattern("[ab]", true); break; 1810 case 9: test.applyPattern("[ab]", 0); break; 1811 case 10: test.applyPropertyAlias("hex","true"); break; 1812 case 11: test.applyPropertyAlias("hex", "true", null); break; 1813 case 12: test.closeOver(UnicodeSet.CASE); break; 1814 case 13: test.compact(); checkEquals = false; break; 1815 case 14: test.complement(0); break; 1816 case 15: test.complement(0,0); break; 1817 case 16: test.complement("ab"); break; 1818 case 17: test.complementAll("ab"); break; 1819 case 18: test.complementAll(new UnicodeSet("[ab]")); break; 1820 case 19: test.remove(' '); break; 1821 case 20: test.remove(' ','a'); break; 1822 case 21: test.remove(" "); break; 1823 case 22: test.removeAll(" a"); break; 1824 case 23: test.removeAll(new UnicodeSet("[\\ a]")); break; 1825 case 24: test.retain(' '); break; 1826 case 25: test.retain(' ','a'); break; 1827 case 26: test.retain(" "); break; 1828 case 27: test.retainAll(" a"); break; 1829 case 28: test.retainAll(new UnicodeSet("[\\ a]")); break; 1830 case 29: test.set(0,1); break; 1831 case 30: test.set(new UnicodeSet("[ab]")); break; 1832 1833 default: continue main; // so we don't keep having to change the endpoint, and gaps are not skipped. 1834 case 35: return; 1835 } 1836 gotException = false; 1837 } catch (UnsupportedOperationException e) { 1838 // do nothing 1839 } 1840 if (isFrozen && !gotException) errln(i + ") attempt to modify frozen object didn't result in an exception"); 1841 if (!isFrozen && gotException) errln(i + ") attempt to modify thawed object did result in an exception"); 1842 if (checkEquals) { 1843 if (test.equals(original)) { 1844 if (!isFrozen) errln(i + ") attempt to modify thawed object didn't change the object"); 1845 } else { // unequal 1846 if (isFrozen) errln(i + ") attempt to modify frozen object changed the object"); 1847 } 1848 } 1849 } 1850 } 1851 1852 // Following cod block is commented out to eliminate PrettyPrinter depenencies 1853 1854 // String[] prettyData = { 1855 // "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case 1856 // "[:any:]", 1857 // "[:whitespace:]", 1858 // "[:linebreak=AL:]", 1859 // }; 1860 // 1861 // public void TestPrettyPrinting() { 1862 // try{ 1863 // PrettyPrinter pp = new PrettyPrinter(); 1864 // 1865 // int i = 0; 1866 // for (; i < prettyData.length; ++i) { 1867 // UnicodeSet test = new UnicodeSet(prettyData[i]); 1868 // checkPrettySet(pp, i, test); 1869 // } 1870 // Random random = new Random(0); 1871 // UnicodeSet test = new UnicodeSet(); 1872 // 1873 // // To keep runtimes under control, make the number of random test cases 1874 // // to try depends on the test framework exhaustive setting. 1875 // // params.inclusions = 5: default exhaustive value 1876 // // params.inclusions = 10: max exhaustive value. 1877 // int iterations = 50; 1878 // if (params.inclusion > 5) { 1879 // iterations = (params.inclusion-5) * 200; 1880 // } 1881 // for (; i < iterations; ++i) { 1882 // double start = random.nextGaussian() * 0x10000; 1883 // if (start < 0) start = - start; 1884 // if (start > 0x10FFFF) { 1885 // start = 0x10FFFF; 1886 // } 1887 // double end = random.nextGaussian() * 0x100; 1888 // if (end < 0) end = -end; 1889 // end = start + end; 1890 // if (end > 0x10FFFF) { 1891 // end = 0x10FFFF; 1892 // } 1893 // test.complement((int)start, (int)end); 1894 // checkPrettySet(pp, i, test); 1895 // } 1896 // }catch(RuntimeException ex){ 1897 // warnln("Could not load Collator"); 1898 // } 1899 // } 1900 // 1901 // private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) { 1902 // String pretty = pp.toPattern(test); 1903 // UnicodeSet retry = new UnicodeSet(pretty); 1904 // if (!test.equals(retry)) { 1905 // errln(i + ". Failed test: " + test + " != " + pretty); 1906 // } else { 1907 // logln(i + ". Worked for " + truncate(test.toString()) + " => " + truncate(pretty)); 1908 // } 1909 // } 1910 // 1911 // private String truncate(String string) { 1912 // if (string.length() <= 100) return string; 1913 // return string.substring(0,97) + "..."; 1914 // } 1915 1916 public class TokenSymbolTable implements SymbolTable { 1917 HashMap contents = new HashMap(); 1918 1919 /** 1920 * (Non-SymbolTable API) Add the given variable and value to 1921 * the table. Variable should NOT contain leading '$'. 1922 */ add(String var, String value)1923 public void add(String var, String value) { 1924 char[] buffer = new char[value.length()]; 1925 value.getChars(0, value.length(), buffer, 0); 1926 add(var, buffer); 1927 } 1928 1929 /** 1930 * (Non-SymbolTable API) Add the given variable and value to 1931 * the table. Variable should NOT contain leading '$'. 1932 */ add(String var, char[] body)1933 public void add(String var, char[] body) { 1934 logln("TokenSymbolTable: add \"" + var + "\" => \"" + 1935 new String(body) + "\""); 1936 contents.put(var, body); 1937 } 1938 1939 /* (non-Javadoc) 1940 * @see ohos.global.icu.text.SymbolTable#lookup(java.lang.String) 1941 */ 1942 @Override lookup(String s)1943 public char[] lookup(String s) { 1944 logln("TokenSymbolTable: lookup \"" + s + "\" => \"" + 1945 new String((char[]) contents.get(s)) + "\""); 1946 return (char[])contents.get(s); 1947 } 1948 1949 /* (non-Javadoc) 1950 * @see ohos.global.icu.text.SymbolTable#lookupMatcher(int) 1951 */ 1952 @Override lookupMatcher(int ch)1953 public UnicodeMatcher lookupMatcher(int ch) { 1954 return null; 1955 } 1956 1957 /* (non-Javadoc) 1958 * @see ohos.global.icu.text.SymbolTable#parseReference(java.lang.String, 1959 java.text.ParsePosition, int) 1960 */ 1961 @Override parseReference(String text, ParsePosition pos, int limit)1962 public String parseReference(String text, ParsePosition pos, int 1963 limit) { 1964 int cp; 1965 int start = pos.getIndex(); 1966 int i; 1967 for (i = start; i < limit; i += UTF16.getCharCount(cp)) { 1968 cp = UTF16.charAt(text, i); 1969 if (!ohos.global.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) { 1970 break; 1971 } 1972 } 1973 logln("TokenSymbolTable: parse \"" + text + "\" from " + 1974 start + " to " + i + 1975 " => \"" + text.substring(start,i) + "\""); 1976 pos.setIndex(i); 1977 return text.substring(start,i); 1978 } 1979 } 1980 1981 @Test TestSurrogate()1982 public void TestSurrogate() { 1983 String DATA[] = { 1984 // These should all behave identically 1985 "[abc\\uD800\\uDC00]", 1986 "[abc\uD800\uDC00]", 1987 "[abc\\U00010000]", 1988 }; 1989 for (int i=0; i<DATA.length; ++i) { 1990 logln("Test pattern " + i + " :" + Utility.escape(DATA[i])); 1991 UnicodeSet set = new UnicodeSet(DATA[i]); 1992 expectContainment(set, 1993 CharsToUnicodeString("abc\\U00010000"), 1994 "\uD800;\uDC00"); // split apart surrogate-pair 1995 if (set.size() != 4) { 1996 errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " + 1997 set.size() + ", expected 4")); 1998 } 1999 } 2000 } 2001 2002 @Test TestContains()2003 public void TestContains() { 2004 int limit = 256; // combinations to test 2005 for (int i = 0; i < limit; ++i) { 2006 logln("Trying: " + i); 2007 UnicodeSet x = bitsToSet(i); 2008 for (int j = 0; j < limit; ++j) { 2009 UnicodeSet y = bitsToSet(j); 2010 boolean containsNone = (i & j) == 0; 2011 boolean containsAll = (i & j) == j; 2012 boolean equals = i == j; 2013 if (containsNone != x.containsNone(y)) { 2014 x.containsNone(y); // repeat for debugging 2015 errln("FAILED: " + x + " containsSome " + y); 2016 } 2017 if (containsAll != x.containsAll(y)) { 2018 x.containsAll(y); // repeat for debugging 2019 errln("FAILED: " + x + " containsAll " + y); 2020 } 2021 if (equals != x.equals(y)) { 2022 x.equals(y); // repeat for debugging 2023 errln("FAILED: " + x + " equals " + y); 2024 } 2025 } 2026 } 2027 } 2028 _testComplement(int a)2029 void _testComplement(int a) { 2030 UnicodeSet x = bitsToSet(a); 2031 UnicodeSet z = bitsToSet(a); 2032 z.complement(); 2033 int c = setToBits(z); 2034 if (c != (~a)) { 2035 errln("FAILED: add: ~" + x + " != " + z); 2036 errln("FAILED: add: ~" + a + " != " + c); 2037 } 2038 checkCanonicalRep(z, "complement " + a); 2039 } 2040 _testAdd(int a, int b)2041 void _testAdd(int a, int b) { 2042 UnicodeSet x = bitsToSet(a); 2043 UnicodeSet y = bitsToSet(b); 2044 UnicodeSet z = bitsToSet(a); 2045 z.addAll(y); 2046 int c = setToBits(z); 2047 if (c != (a | b)) { 2048 errln(Utility.escape("FAILED: add: " + x + " | " + y + " != " + z)); 2049 errln("FAILED: add: " + a + " | " + b + " != " + c); 2050 } 2051 checkCanonicalRep(z, "add " + a + "," + b); 2052 } 2053 _testRetain(int a, int b)2054 void _testRetain(int a, int b) { 2055 UnicodeSet x = bitsToSet(a); 2056 UnicodeSet y = bitsToSet(b); 2057 UnicodeSet z = bitsToSet(a); 2058 z.retainAll(y); 2059 int c = setToBits(z); 2060 if (c != (a & b)) { 2061 errln("FAILED: retain: " + x + " & " + y + " != " + z); 2062 errln("FAILED: retain: " + a + " & " + b + " != " + c); 2063 } 2064 checkCanonicalRep(z, "retain " + a + "," + b); 2065 } 2066 _testRemove(int a, int b)2067 void _testRemove(int a, int b) { 2068 UnicodeSet x = bitsToSet(a); 2069 UnicodeSet y = bitsToSet(b); 2070 UnicodeSet z = bitsToSet(a); 2071 z.removeAll(y); 2072 int c = setToBits(z); 2073 if (c != (a &~ b)) { 2074 errln("FAILED: remove: " + x + " &~ " + y + " != " + z); 2075 errln("FAILED: remove: " + a + " &~ " + b + " != " + c); 2076 } 2077 checkCanonicalRep(z, "remove " + a + "," + b); 2078 } 2079 _testXor(int a, int b)2080 void _testXor(int a, int b) { 2081 UnicodeSet x = bitsToSet(a); 2082 UnicodeSet y = bitsToSet(b); 2083 UnicodeSet z = bitsToSet(a); 2084 z.complementAll(y); 2085 int c = setToBits(z); 2086 if (c != (a ^ b)) { 2087 errln("FAILED: complement: " + x + " ^ " + y + " != " + z); 2088 errln("FAILED: complement: " + a + " ^ " + b + " != " + c); 2089 } 2090 checkCanonicalRep(z, "complement " + a + "," + b); 2091 } 2092 2093 /** 2094 * Check that ranges are monotonically increasing and non- 2095 * overlapping. 2096 */ checkCanonicalRep(UnicodeSet set, String msg)2097 void checkCanonicalRep(UnicodeSet set, String msg) { 2098 int n = set.getRangeCount(); 2099 if (n < 0) { 2100 errln("FAIL result of " + msg + 2101 ": range count should be >= 0 but is " + 2102 n + " for " + Utility.escape(set.toString())); 2103 return; 2104 } 2105 int last = 0; 2106 for (int i=0; i<n; ++i) { 2107 int start = set.getRangeStart(i); 2108 int end = set.getRangeEnd(i); 2109 if (start > end) { 2110 errln("FAIL result of " + msg + 2111 ": range " + (i+1) + 2112 " start > end: " + start + ", " + end + 2113 " for " + Utility.escape(set.toString())); 2114 } 2115 if (i > 0 && start <= last) { 2116 errln("FAIL result of " + msg + 2117 ": range " + (i+1) + 2118 " overlaps previous range: " + start + ", " + end + 2119 " for " + Utility.escape(set.toString())); 2120 } 2121 last = end; 2122 } 2123 } 2124 2125 /** 2126 * Convert a bitmask to a UnicodeSet. 2127 */ bitsToSet(int a)2128 UnicodeSet bitsToSet(int a) { 2129 UnicodeSet result = new UnicodeSet(); 2130 for (int i = 0; i < 32; ++i) { 2131 if ((a & (1<<i)) != 0) { 2132 result.add((char)i,(char)i); 2133 } 2134 } 2135 2136 return result; 2137 } 2138 2139 /** 2140 * Convert a UnicodeSet to a bitmask. Only the characters 2141 * U+0000 to U+0020 are represented in the bitmask. 2142 */ setToBits(UnicodeSet x)2143 static int setToBits(UnicodeSet x) { 2144 int result = 0; 2145 for (int i = 0; i < 32; ++i) { 2146 if (x.contains((char)i)) { 2147 result |= (1<<i); 2148 } 2149 } 2150 return result; 2151 } 2152 2153 /** 2154 * Return the representation of an inversion list based UnicodeSet 2155 * as a pairs list. Ranges are listed in ascending Unicode order. 2156 * For example, the set [a-zA-M3] is represented as "33AMaz". 2157 */ getPairs(UnicodeSet set)2158 static String getPairs(UnicodeSet set) { 2159 StringBuffer pairs = new StringBuffer(); 2160 for (int i=0; i<set.getRangeCount(); ++i) { 2161 int start = set.getRangeStart(i); 2162 int end = set.getRangeEnd(i); 2163 if (end > 0xFFFF) { 2164 end = 0xFFFF; 2165 i = set.getRangeCount(); // Should be unnecessary 2166 } 2167 pairs.append((char)start).append((char)end); 2168 } 2169 return pairs.toString(); 2170 } 2171 2172 /** 2173 * Test function. Make sure that the sets have the right relation 2174 */ 2175 expectRelation(Object relationObj, Object set1Obj, Object set2Obj, String message)2176 void expectRelation(Object relationObj, Object set1Obj, Object set2Obj, String message) { 2177 int relation = ((Integer) relationObj).intValue(); 2178 UnicodeSet set1 = (UnicodeSet) set1Obj; 2179 UnicodeSet set2 = (UnicodeSet) set2Obj; 2180 2181 // by-the-by, check the iterator 2182 checkRoundTrip(set1); 2183 checkRoundTrip(set2); 2184 2185 boolean contains = set1.containsAll(set2); 2186 boolean isContained = set2.containsAll(set1); 2187 boolean disjoint = set1.containsNone(set2); 2188 boolean equals = set1.equals(set2); 2189 2190 UnicodeSet intersection = new UnicodeSet(set1).retainAll(set2); 2191 UnicodeSet minus12 = new UnicodeSet(set1).removeAll(set2); 2192 UnicodeSet minus21 = new UnicodeSet(set2).removeAll(set1); 2193 2194 // test basic properties 2195 2196 if (contains != (intersection.size() == set2.size())) { 2197 errln("FAIL contains1" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2198 } 2199 2200 if (contains != (intersection.equals(set2))) { 2201 errln("FAIL contains2" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2202 } 2203 2204 if (isContained != (intersection.size() == set1.size())) { 2205 errln("FAIL isContained1" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2206 } 2207 2208 if (isContained != (intersection.equals(set1))) { 2209 errln("FAIL isContained2" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2210 } 2211 2212 if ((contains && isContained) != equals) { 2213 errln("FAIL equals" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2214 } 2215 2216 if (disjoint != (intersection.size() == 0)) { 2217 errln("FAIL disjoint" + set1.toPattern(true) + ", " + set2.toPattern(true)); 2218 } 2219 2220 // Now see if the expected relation is true 2221 int status = (minus12.size() != 0 ? 4 : 0) 2222 | (intersection.size() != 0 ? 2 : 0) 2223 | (minus21.size() != 0 ? 1 : 0); 2224 2225 if (status != relation) { 2226 errln("FAIL relation incorrect" + message 2227 + "; desired = " + RELATION_NAME[relation] 2228 + "; found = " + RELATION_NAME[status] 2229 + "; set1 = " + set1.toPattern(true) 2230 + "; set2 = " + set2.toPattern(true) 2231 ); 2232 } 2233 } 2234 2235 /** 2236 * Basic consistency check for a few items. 2237 * That the iterator works, and that we can create a pattern and 2238 * get the same thing back 2239 */ 2240 checkRoundTrip(UnicodeSet s)2241 void checkRoundTrip(UnicodeSet s) { 2242 String pat = s.toPattern(false); 2243 UnicodeSet t = copyWithIterator(s, false); 2244 checkEqual(s, t, "iterator roundtrip"); 2245 2246 t = copyWithIterator(s, true); // try range 2247 checkEqual(s, t, "iterator roundtrip"); 2248 2249 t = new UnicodeSet(pat); 2250 checkEqual(s, t, "toPattern(false)"); 2251 2252 pat = s.toPattern(true); 2253 t = new UnicodeSet(pat); 2254 checkEqual(s, t, "toPattern(true)"); 2255 } 2256 copyWithIterator(UnicodeSet s, boolean withRange)2257 UnicodeSet copyWithIterator(UnicodeSet s, boolean withRange) { 2258 UnicodeSet t = new UnicodeSet(); 2259 UnicodeSetIterator it = new UnicodeSetIterator(s); 2260 if (withRange) { 2261 while (it.nextRange()) { 2262 if (it.codepoint == UnicodeSetIterator.IS_STRING) { 2263 t.add(it.string); 2264 } else { 2265 t.add(it.codepoint, it.codepointEnd); 2266 } 2267 } 2268 } else { 2269 while (it.next()) { 2270 if (it.codepoint == UnicodeSetIterator.IS_STRING) { 2271 t.add(it.string); 2272 } else { 2273 t.add(it.codepoint); 2274 } 2275 } 2276 } 2277 return t; 2278 } 2279 checkEqual(UnicodeSet s, UnicodeSet t, String message)2280 boolean checkEqual(UnicodeSet s, UnicodeSet t, String message) { 2281 if (!s.equals(t)) { 2282 errln("FAIL " + message 2283 + "; source = " + s.toPattern(true) 2284 + "; result = " + t.toPattern(true) 2285 ); 2286 return false; 2287 } 2288 return true; 2289 } 2290 expectEqual(String name, String pat1, String pat2)2291 void expectEqual(String name, String pat1, String pat2) { 2292 UnicodeSet set1, set2; 2293 try { 2294 set1 = new UnicodeSet(pat1); 2295 set2 = new UnicodeSet(pat2); 2296 } catch (IllegalArgumentException e) { 2297 errln("FAIL: Couldn't create UnicodeSet from pattern for \"" + name + "\": " + e.getMessage()); 2298 return; 2299 } 2300 if(!set1.equals(set2)) { 2301 errln("FAIL: Sets built from patterns differ for \"" + name + "\""); 2302 } 2303 } 2304 2305 /** 2306 * Expect the given set to contain the characters in charsIn and 2307 * to not contain those in charsOut. 2308 */ expectContainment(String pat, String charsIn, String charsOut)2309 void expectContainment(String pat, String charsIn, String charsOut) { 2310 UnicodeSet set; 2311 try { 2312 set = new UnicodeSet(pat); 2313 } catch (IllegalArgumentException e) { 2314 errln("FAIL: Couldn't create UnicodeSet from pattern \"" + 2315 pat + "\": " + e.getMessage()); 2316 return; 2317 } 2318 expectContainment(set, charsIn, charsOut); 2319 } 2320 2321 /** 2322 * Expect the given set to contain the characters in charsIn and 2323 * to not contain those in charsOut. 2324 */ expectContainment(UnicodeSet set, String charsIn, String charsOut)2325 void expectContainment(UnicodeSet set, String charsIn, String charsOut) { 2326 StringBuffer bad = new StringBuffer(); 2327 if (charsIn != null) { 2328 charsIn = Utility.unescape(charsIn); 2329 for (int i=0; i<charsIn.length(); ) { 2330 int c = UTF16.charAt(charsIn,i); 2331 i += UTF16.getCharCount(c); 2332 if (!set.contains(c)) { 2333 UTF16.append(bad,c); 2334 } 2335 } 2336 if (bad.length() > 0) { 2337 errln(Utility.escape("FAIL: set " + set + " does not contain " + bad + 2338 ", expected containment of " + charsIn)); 2339 } else { 2340 logln(Utility.escape("Ok: set " + set + " contains " + charsIn)); 2341 } 2342 } 2343 if (charsOut != null) { 2344 charsOut = Utility.unescape(charsOut); 2345 bad.setLength(0); 2346 for (int i=0; i<charsOut.length(); ) { 2347 int c = UTF16.charAt(charsOut,i); 2348 i += UTF16.getCharCount(c); 2349 if (set.contains(c)) { 2350 UTF16.append(bad, c); 2351 } 2352 } 2353 if (bad.length() > 0) { 2354 errln(Utility.escape("FAIL: set " + set + " contains " + bad + 2355 ", expected non-containment of " + charsOut)); 2356 } else { 2357 logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut)); 2358 } 2359 } 2360 } 2361 expectPattern(UnicodeSet set, String pattern, String expectedPairs)2362 void expectPattern(UnicodeSet set, 2363 String pattern, 2364 String expectedPairs) { 2365 set.applyPattern(pattern); 2366 if (!getPairs(set).equals(expectedPairs)) { 2367 errln("FAIL: applyPattern(\"" + pattern + 2368 "\") => pairs \"" + 2369 Utility.escape(getPairs(set)) + "\", expected \"" + 2370 Utility.escape(expectedPairs) + "\""); 2371 } else { 2372 logln("Ok: applyPattern(\"" + pattern + 2373 "\") => pairs \"" + 2374 Utility.escape(getPairs(set)) + "\""); 2375 } 2376 } 2377 expectToPattern(UnicodeSet set, String expPat, String[] expStrings)2378 void expectToPattern(UnicodeSet set, 2379 String expPat, 2380 String[] expStrings) { 2381 String pat = set.toPattern(true); 2382 if (pat.equals(expPat)) { 2383 logln("Ok: toPattern() => \"" + pat + "\""); 2384 } else { 2385 errln("FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); 2386 return; 2387 } 2388 if (expStrings == null) { 2389 return; 2390 } 2391 boolean in = true; 2392 for (int i=0; i<expStrings.length; ++i) { 2393 if (expStrings[i] == NOT) { // sic; pointer comparison 2394 in = false; 2395 continue; 2396 } 2397 boolean contained = set.contains(expStrings[i]); 2398 if (contained == in) { 2399 logln("Ok: " + expPat + 2400 (contained ? " contains {" : " does not contain {") + 2401 Utility.escape(expStrings[i]) + "}"); 2402 } else { 2403 errln("FAIL: " + expPat + 2404 (contained ? " contains {" : " does not contain {") + 2405 Utility.escape(expStrings[i]) + "}"); 2406 } 2407 } 2408 } 2409 expectPairs(UnicodeSet set, String expectedPairs)2410 void expectPairs(UnicodeSet set, String expectedPairs) { 2411 if (!getPairs(set).equals(expectedPairs)) { 2412 errln("FAIL: Expected pair list \"" + 2413 Utility.escape(expectedPairs) + "\", got \"" + 2414 Utility.escape(getPairs(set)) + "\""); 2415 } 2416 } CharsToUnicodeString(String s)2417 static final String CharsToUnicodeString(String s) { 2418 return Utility.unescape(s); 2419 } 2420 2421 /* Test the method public UnicodeSet getSet() */ 2422 @Test TestGetSet()2423 public void TestGetSet() { 2424 UnicodeSetIterator us = new UnicodeSetIterator(); 2425 try { 2426 us.getSet(); 2427 } catch (Exception e) { 2428 errln("UnicodeSetIterator.getSet() was not suppose to given an " + "an exception."); 2429 } 2430 } 2431 2432 /* Tests the method public UnicodeSet add(Collection<?> source) */ 2433 @Test TestAddCollection()2434 public void TestAddCollection() { 2435 UnicodeSet us = new UnicodeSet(); 2436 Collection<?> s = null; 2437 try { 2438 us.add(s); 2439 errln("UnicodeSet.add(Collection<?>) was suppose to return an exception for a null parameter."); 2440 } catch (Exception e) { 2441 } 2442 } 2443 2444 @Test TestConstants()2445 public void TestConstants() { 2446 assertEquals("Empty", new UnicodeSet(), UnicodeSet.EMPTY); 2447 assertEquals("All", new UnicodeSet(0,0x10FFFF), UnicodeSet.ALL_CODE_POINTS); 2448 } 2449 2450 @Test TestIteration()2451 public void TestIteration() { 2452 UnicodeSet us1 = new UnicodeSet("[abcM{xy}]"); 2453 assertEquals("", "M, a-c", CollectionUtilities.join(us1.ranges(), ", ")); 2454 2455 // Sample code 2456 for (@SuppressWarnings("unused") EntryRange range : us1.ranges()) { 2457 // do something with code points between range.codepointEnd and range.codepointEnd; 2458 } 2459 for (@SuppressWarnings("unused") String s : us1.strings()) { 2460 // do something with each string; 2461 } 2462 2463 String[] tests = { 2464 "[M-Qzab{XY}{ZW}]", 2465 "[]", 2466 "[a]", 2467 "[a-c]", 2468 "[{XY}]", 2469 }; 2470 for (String test : tests) { 2471 UnicodeSet us = new UnicodeSet(test); 2472 UnicodeSetIterator it = new UnicodeSetIterator(us); 2473 for (EntryRange range : us.ranges()) { 2474 final String title = range.toString(); 2475 logln(title); 2476 it.nextRange(); 2477 assertEquals(title, it.codepoint, range.codepoint); 2478 assertEquals(title, it.codepointEnd, range.codepointEnd); 2479 } 2480 for (String s : us.strings()) { 2481 it.nextRange(); 2482 assertEquals("strings", it.string, s); 2483 } 2484 assertFalse("", it.next()); 2485 } 2486 } 2487 2488 @Test TestReplaceAndDelete()2489 public void TestReplaceAndDelete() { 2490 UnicodeSetSpanner m; 2491 2492 m = new UnicodeSetSpanner(new UnicodeSet("[._]")); 2493 assertEquals("", "abc", m.deleteFrom("_._a_._b_._c_._")); 2494 assertEquals("", "_.__.__.__._", m.deleteFrom("_._a_._b_._c_._", SpanCondition.NOT_CONTAINED)); 2495 2496 assertEquals("", "a_._b_._c", m.trim("_._a_._b_._c_._")); 2497 assertEquals("", "a_._b_._c_._", m.trim("_._a_._b_._c_._", TrimOption.LEADING)); 2498 assertEquals("", "_._a_._b_._c", m.trim("_._a_._b_._c_._", TrimOption.TRAILING)); 2499 2500 assertEquals("", "a??b??c", m.replaceFrom("a_._b_._c", "??", CountMethod.WHOLE_SPAN)); 2501 assertEquals("", "a??b??c", m.replaceFrom(m.trim("_._a_._b_._c_._"), "??", CountMethod.WHOLE_SPAN)); 2502 assertEquals("", "XYXYXYaXYXYXYbXYXYXYcXYXYXY", m.replaceFrom("_._a_._b_._c_._", "XY")); 2503 assertEquals("", "XYaXYbXYcXY", m.replaceFrom("_._a_._b_._c_._", "XY", CountMethod.WHOLE_SPAN)); 2504 2505 m = new UnicodeSetSpanner(new UnicodeSet("\\p{uppercase}")); 2506 assertEquals("", "TQBF", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED)); 2507 2508 m = new UnicodeSetSpanner(m.getUnicodeSet().addAll(new UnicodeSet("\\p{lowercase}"))); 2509 assertEquals("", "TheQuickBrownFox", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED)); 2510 2511 m = new UnicodeSetSpanner(new UnicodeSet("[{ab}]")); 2512 assertEquals("", "XXc acb", m.replaceFrom("ababc acb", "X")); 2513 assertEquals("", "Xc acb", m.replaceFrom("ababc acb", "X", CountMethod.WHOLE_SPAN)); 2514 assertEquals("", "ababX", m.replaceFrom("ababc acb", "X", CountMethod.WHOLE_SPAN, SpanCondition.NOT_CONTAINED)); 2515 } 2516 2517 @Test TestCodePoints()2518 public void TestCodePoints() { 2519 // test supplemental code points and strings clusters 2520 checkCodePoints("x\u0308", "z\u0308", CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, null, 1); 2521 checkCodePoints("", "", CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, null, 1); 2522 checkCodePoints("", "", CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, null, 1); 2523 } 2524 checkCodePoints(String a, String b, CountMethod quantifier, SpanCondition spanCondition, String expectedReplaced, int expectedCount)2525 private void checkCodePoints(String a, String b, CountMethod quantifier, SpanCondition spanCondition, 2526 String expectedReplaced, int expectedCount) { 2527 final String ab = a+b; 2528 UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[{" + a + "}]")); 2529 assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")", 2530 expectedCount, 2531 callCountIn(m, ab, quantifier, spanCondition) 2532 ); 2533 2534 if (expectedReplaced == null) { 2535 expectedReplaced = "-" + b; 2536 } 2537 assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").replaceFrom(\"" + ab + "\", \"-\")", 2538 expectedReplaced, m.replaceFrom(ab, "-", quantifier)); 2539 } 2540 2541 @Test TestCountIn()2542 public void TestCountIn() { 2543 UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[ab]")); 2544 checkCountIn(m, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, "abc", 2); 2545 checkCountIn(m, CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE, "abc", 1); 2546 checkCountIn(m, CountMethod.MIN_ELEMENTS, SpanCondition.NOT_CONTAINED, "acccb", 3); 2547 } 2548 checkCountIn(UnicodeSetSpanner m, CountMethod countMethod, SpanCondition spanCondition, String target, int expected)2549 public void checkCountIn(UnicodeSetSpanner m, CountMethod countMethod, SpanCondition spanCondition, String target, int expected) { 2550 final String message = "countIn " + countMethod + ", " + spanCondition; 2551 assertEquals(message, callCountIn(m, target, countMethod, spanCondition), expected); 2552 } 2553 callCountIn(UnicodeSetSpanner m, final String ab, CountMethod countMethod, SpanCondition spanCondition)2554 public int callCountIn(UnicodeSetSpanner m, final String ab, CountMethod countMethod, SpanCondition spanCondition) { 2555 return spanCondition != SpanCondition.SIMPLE ? m.countIn(ab, countMethod, spanCondition) 2556 : countMethod != CountMethod.MIN_ELEMENTS ? m.countIn(ab, countMethod) 2557 : m.countIn(ab); 2558 } 2559 2560 @Test testForSpanGaps()2561 public void testForSpanGaps() { 2562 String[] items = {"a", "b", "c", "{ab}", "{bc}", "{cd}", "{abc}", "{bcd}"}; 2563 final int limit = 1<<items.length; 2564 // build long string for testing 2565 StringBuilder longBuffer = new StringBuilder(); 2566 for (int i = 1; i < limit; ++i) { 2567 longBuffer.append("x"); 2568 longBuffer.append(getCombinations(items, i)); 2569 } 2570 String longString = longBuffer.toString(); 2571 longString = longString.replace("{","").replace("}",""); 2572 2573 long start = System.nanoTime(); 2574 for (int i = 1; i < limit; ++i) { 2575 UnicodeSet us = new UnicodeSet("[" + getCombinations(items, i) + "]"); 2576 int problemFound = checkSpan(longString, us, SpanCondition.SIMPLE); 2577 if (problemFound >= 0) { 2578 assertEquals("Testing " + longString + ", found gap at", -1, problemFound); 2579 break; 2580 } 2581 } 2582 long end = System.nanoTime(); 2583 logln("Time for SIMPLE :\t" + (end-start)); 2584 start = System.nanoTime(); 2585 for (int i = 1; i < limit; ++i) { 2586 UnicodeSet us = new UnicodeSet("[" + getCombinations(items, i) + "]"); 2587 int problemFound = checkSpan(longString, us, SpanCondition.CONTAINED); 2588 if (problemFound >= 0) { 2589 assertEquals("Testing " + longString + ", found gap at", -1, problemFound); 2590 break; 2591 } 2592 } 2593 end = System.nanoTime(); 2594 logln("Time for CONTAINED:\t" + (end-start)); 2595 } 2596 2597 /** 2598 * Check that there are no gaps, when we alternate spanning. That is, there 2599 * should only be a zero length span at the very start. 2600 */ checkSpan(String longString, UnicodeSet us, SpanCondition spanCondition)2601 private int checkSpan(String longString, UnicodeSet us, SpanCondition spanCondition) { 2602 int start = 0; 2603 while (start < longString.length()) { 2604 int limit = us.span(longString, start, spanCondition); 2605 if (limit == longString.length()) { 2606 break; 2607 } else if (limit == start && start != 0) { 2608 return start; 2609 } 2610 start = limit; 2611 limit = us.span(longString, start, SpanCondition.NOT_CONTAINED); 2612 if (limit == start) { 2613 return start; 2614 } 2615 start = limit; 2616 } 2617 return -1; // all ok 2618 } 2619 getCombinations(String[] items, int bitset)2620 private String getCombinations(String[] items, int bitset) { 2621 StringBuilder result = new StringBuilder(); 2622 for (int i = 0; bitset != 0; ++i) { 2623 int other = bitset & (1 << i); 2624 if (other != 0) { 2625 bitset ^= other; 2626 result.append(items[i]); 2627 } 2628 } 2629 return result.toString(); 2630 } 2631 2632 @Test TestCharSequenceArgs()2633 public void TestCharSequenceArgs() { 2634 // statics 2635 assertEquals("CharSequence from", new UnicodeSet("[{abc}]"), UnicodeSet.from(new StringBuilder("abc"))); 2636 assertEquals("CharSequence fromAll", new UnicodeSet("[a-c]"), UnicodeSet.fromAll(new StringBuilder("abc"))); 2637 assertEquals("CharSequence compare", 1.0f, Math.signum(UnicodeSet.compare(new StringBuilder("abc"), 0x61))); 2638 assertEquals("CharSequence compare", -1.0f, Math.signum(UnicodeSet.compare(0x61, new StringBuilder("abc")))); 2639 assertEquals("CharSequence compare", 0.0f, Math.signum(UnicodeSet.compare(new StringBuilder("a"), 0x61))); 2640 assertEquals("CharSequence compare", 0.0f, Math.signum(UnicodeSet.compare(0x61, new StringBuilder("a")))); 2641 assertEquals("CharSequence getSingleCodePoint", 0x1F466, UnicodeSet.getSingleCodePoint(new StringBuilder(""))); 2642 2643 // iterables/arrays 2644 Iterable<StringBuilder> iterable = Arrays.asList(new StringBuilder("A"), new StringBuilder("B")); 2645 assertEquals("CharSequence containsAll", true, new UnicodeSet("[AB]").containsAll(iterable)); 2646 assertEquals("CharSequence containsAll", false, new UnicodeSet("[a-cA]").containsAll(iterable)); 2647 assertEquals("CharSequence containsNone", true, new UnicodeSet("[a-c]").containsNone(iterable) ); 2648 assertEquals("CharSequence containsNone", false, new UnicodeSet("[a-cA]").containsNone(iterable) ); 2649 assertEquals("CharSequence containsSome", true, new UnicodeSet("[a-cA]").containsSome(iterable) ); 2650 assertEquals("CharSequence containsSome", false, new UnicodeSet("[a-c]").containsSome(iterable) ); 2651 assertEquals("CharSequence addAll", new UnicodeSet("[a-cAB]"), new UnicodeSet("[a-cA]").addAll(new StringBuilder("A"), new StringBuilder("B")) ); 2652 assertEquals("CharSequence removeAll", new UnicodeSet("[a-c]"), new UnicodeSet("[a-cA]").removeAll( iterable) ); 2653 assertEquals("CharSequence retainAll", new UnicodeSet("[A]"), new UnicodeSet("[a-cA]").retainAll( iterable) ); 2654 2655 // UnicodeSet results 2656 assertEquals("CharSequence add", new UnicodeSet("[Aa-c{abc}{qr}]"), new UnicodeSet("[a-cA{qr}]").add(new StringBuilder("abc")) ); 2657 assertEquals("CharSequence retain", new UnicodeSet("[{abc}]"), new UnicodeSet("[a-cA{abc}{qr}]").retain(new StringBuilder("abc")) ); 2658 assertEquals("CharSequence remove", new UnicodeSet("[Aa-c{qr}]"), new UnicodeSet("[a-cA{abc}{qr}]").remove(new StringBuilder("abc")) ); 2659 assertEquals("CharSequence complement", new UnicodeSet("[Aa-c{qr}]"), new UnicodeSet("[a-cA{abc}{qr}]").complement(new StringBuilder("abc")) ); 2660 assertEquals("CharSequence complement", new UnicodeSet("[Aa-c{abc}{qr}]"), new UnicodeSet("[a-cA{qr}]").complement(new StringBuilder("abc")) ); 2661 2662 assertEquals("CharSequence addAll", new UnicodeSet("[a-cABC]"), new UnicodeSet("[a-cA]").addAll(new StringBuilder("ABC")) ); 2663 assertEquals("CharSequence retainAll", new UnicodeSet("[a-c]"), new UnicodeSet("[a-cA]").retainAll(new StringBuilder("abcB")) ); 2664 assertEquals("CharSequence removeAll", new UnicodeSet("[Aab]"), new UnicodeSet("[a-cA]").removeAll(new StringBuilder("cC")) ); 2665 assertEquals("CharSequence complementAll", new UnicodeSet("[ABbc]"), new UnicodeSet("[a-cA]").complementAll(new StringBuilder("aB")) ); 2666 2667 // containment 2668 assertEquals("CharSequence contains", true, new UnicodeSet("[a-cA{ab}]"). contains(new StringBuilder("ab")) ); 2669 assertEquals("CharSequence containsNone", false, new UnicodeSet("[a-cA]"). containsNone(new StringBuilder("ab")) ); 2670 assertEquals("CharSequence containsSome", true, new UnicodeSet("[a-cA{ab}]"). containsSome(new StringBuilder("ab")) ); 2671 2672 // spanning 2673 assertEquals("CharSequence span", 3, new UnicodeSet("[a-cA]"). span(new StringBuilder("abc"), SpanCondition.SIMPLE) ); 2674 assertEquals("CharSequence span", 3, new UnicodeSet("[a-cA]"). span(new StringBuilder("abc"), 1, SpanCondition.SIMPLE) ); 2675 assertEquals("CharSequence spanBack", 0, new UnicodeSet("[a-cA]"). spanBack(new StringBuilder("abc"), SpanCondition.SIMPLE) ); 2676 assertEquals("CharSequence spanBack", 0, new UnicodeSet("[a-cA]"). spanBack(new StringBuilder("abc"), 1, SpanCondition.SIMPLE) ); 2677 2678 // internal 2679 OutputInt outCount = new OutputInt(); 2680 assertEquals("CharSequence matchesAt", 2, new UnicodeSet("[a-cA]"). matchesAt(new StringBuilder("abc"), 1) ); 2681 assertEquals("CharSequence spanAndCount", 3, new UnicodeSet("[a-cA]"). spanAndCount(new StringBuilder("abc"), 1, SpanCondition.SIMPLE, outCount ) ); 2682 assertEquals("CharSequence findIn", 3, new UnicodeSet("[a-cA]"). findIn(new StringBuilder("abc"), 1, true) ); 2683 assertEquals("CharSequence findLastIn", -1, new UnicodeSet("[a-cA]"). findLastIn(new StringBuilder("abc"), 1, true) ); 2684 assertEquals("CharSequence add", "c", new UnicodeSet("[abA]"). stripFrom(new StringBuilder("abc"), true)); 2685 } 2686 2687 @Test TestAStringRange()2688 public void TestAStringRange() { 2689 String[][] tests = { 2690 {"[{ax}-{bz}]", "[{ax}{ay}{az}{bx}{by}{bz}]"}, 2691 {"[{a}-{c}]", "[a-c]"}, 2692 //{"[a-{c}]", "[a-c]"}, // don't handle these yet: enable once we do 2693 //{"[{a}-c]", "[a-c]"}, // don't handle these yet: enable once we do 2694 {"[{ax}-{by}-{cz}]", "Error: '-' not after char, string, or set at \"[{ax}-{by}-{|cz}]\""}, 2695 {"[{a}-{bz}]", "Error: Range must have equal-length strings at \"[{a}-{bz}|]\""}, 2696 {"[{ax}-{b}]", "Error: Range must have equal-length strings at \"[{ax}-{b}|]\""}, 2697 {"[{ax}-bz]", "Error: Invalid range at \"[{ax}-b|z]\""}, 2698 {"[ax-{bz}]", "Error: Range must have 2 valid strings at \"[ax-{bz}|]\""}, 2699 {"[{bx}-{az}]", "Error: Range must have xᵢ ≤ yᵢ for each index i at \"[{bx}-{az}|]\""}, 2700 }; 2701 int i = 0; 2702 for (String[] test : tests) { 2703 String expected = test[1]; 2704 if (test[1].startsWith("[")) { 2705 expected = new UnicodeSet(expected).toPattern(false); 2706 } 2707 String actual; 2708 try { 2709 actual = new UnicodeSet(test[0]).toPattern(false); 2710 } catch (Exception e) { 2711 actual = e.getMessage(); 2712 } 2713 assertEquals("StringRange " + i, expected, actual); 2714 ++i; 2715 } 2716 } 2717 2718 @Test testAddAll_CharacterSequences()2719 public void testAddAll_CharacterSequences() { 2720 UnicodeSet unicodeSet = new UnicodeSet(); 2721 unicodeSet.addAll("a", "b"); 2722 assertEquals("Wrong UnicodeSet pattern", "[ab]", unicodeSet.toPattern(true)); 2723 unicodeSet.addAll("b", "x"); 2724 assertEquals("Wrong UnicodeSet pattern", "[abx]", unicodeSet.toPattern(true)); 2725 unicodeSet.addAll(new CharSequence[]{new StringBuilder("foo"), new StringBuffer("bar")}); 2726 assertEquals("Wrong UnicodeSet pattern", "[abx{bar}{foo}]", unicodeSet.toPattern(true)); 2727 } 2728 2729 @Test testCompareTo()2730 public void testCompareTo() { 2731 Set<String> test_set = Collections.emptySet(); 2732 assertEquals("UnicodeSet not empty", 0, UnicodeSet.EMPTY.compareTo(test_set)); 2733 assertEquals("UnicodeSet comparison wrong", 2734 0, UnicodeSet.fromAll("a").compareTo(Collections.singleton("a"))); 2735 2736 // Longer is bigger 2737 assertTrue("UnicodeSet is empty", 2738 UnicodeSet.ALL_CODE_POINTS.compareTo(test_set) > 0); 2739 assertTrue("UnicodeSet not empty", 2740 UnicodeSet.EMPTY.compareTo(Collections.singleton("a")) < 0); 2741 2742 // Equal length compares on first difference. 2743 assertTrue("UnicodeSet comparison wrong", 2744 UnicodeSet.fromAll("a").compareTo(Collections.singleton("b")) < 0); 2745 assertTrue("UnicodeSet comparison wrong", 2746 UnicodeSet.fromAll("ab").compareTo(Arrays.asList("a", "c")) < 0); 2747 assertTrue("UnicodeSet comparison wrong", 2748 UnicodeSet.fromAll("b").compareTo(Collections.singleton("a")) > 0); 2749 } 2750 2751 @Test TestUnusedCcc()2752 public void TestUnusedCcc() { 2753 // All numeric ccc values 0..255 are valid, but many are unused. 2754 UnicodeSet ccc2 = new UnicodeSet("[:ccc=2:]"); 2755 assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty()); 2756 2757 UnicodeSet ccc255 = new UnicodeSet("[:ccc=255:]"); 2758 assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty()); 2759 2760 // Non-integer values and values outside 0..255 are invalid. 2761 try { 2762 new UnicodeSet("[:ccc=-1:]"); 2763 fail("[:ccc=-1:] -> illegal argument"); 2764 } catch (IllegalArgumentException expected) { 2765 } 2766 2767 try { 2768 new UnicodeSet("[:ccc=256:]"); 2769 fail("[:ccc=256:] -> illegal argument"); 2770 } catch (IllegalArgumentException expected) { 2771 } 2772 2773 try { 2774 new UnicodeSet("[:ccc=1.1:]"); 2775 fail("[:ccc=1.1:] -> illegal argument"); 2776 } catch (IllegalArgumentException expected) { 2777 } 2778 } 2779 2780 @Test TestDeepPattern()2781 public void TestDeepPattern() { 2782 // Nested ranges are parsed via recursion which can use a lot of stack space. 2783 // After a reasonable limit, we should get an error. 2784 final int DEPTH = 20000; 2785 StringBuilder pattern = new StringBuilder(); 2786 StringBuilder suffix = new StringBuilder(); 2787 for (int i = 0; i < DEPTH; ++i) { 2788 pattern.append("[a"); 2789 suffix.append(']'); 2790 } 2791 pattern.append(suffix); 2792 try { 2793 new UnicodeSet(pattern.toString()); 2794 fail("[a[a[a...1000s...]]] did not throw an exception"); 2795 } catch(RuntimeException expected) { 2796 } 2797 } 2798 } 2799