1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 * 03/22/2000 Madhu Added additional tests
11 ********************************************************************************
12 */
13
14 #include <stdio.h>
15
16 #include <string.h>
17 #include "unicode/utypes.h"
18 #include "usettest.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/usetiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/symtable.h"
26 #include "unicode/utf8.h"
27 #include "unicode/utf16.h"
28 #include "unicode/uversion.h"
29 #include "cmemory.h"
30 #include "hash.h"
31
// Emits a data-error message naming the current file, line and the ICU error
// name when `status` holds a failure code; expands to a no-op otherwise.
// Note that `status` is evaluated more than once.
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END
38
// Emits a data-error message naming the current file and line when `expr`
// evaluates to false; expands to a no-op otherwise.
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", \
                  __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
44
/**
 * Concatenates the escaped pattern representation of a set onto a string,
 * which lets the tests below splice sets directly into log/error messages.
 *
 * @param left text preceding the set
 * @param set  set whose pattern (via toPattern()) is appended
 * @return left followed by UnicodeSetTest::escape() of the set's pattern
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and returns a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
50
UnicodeSetTest()51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52 }
53
openUTF8Converter()54 UConverter *UnicodeSetTest::openUTF8Converter() {
55 if(utf8Cnv==NULL) {
56 UErrorCode errorCode=U_ZERO_ERROR;
57 utf8Cnv=ucnv_open("UTF-8", &errorCode);
58 }
59 return utf8Cnv;
60 }
61
~UnicodeSetTest()62 UnicodeSetTest::~UnicodeSetTest() {
63 ucnv_close(utf8Cnv);
64 }
65
66 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68 const char* &name, char* /*par*/) {
69 if (exec) {
70 logln(u"TestSuite UnicodeSetTest");
71 }
72 TESTCASE_AUTO_BEGIN;
73 TESTCASE_AUTO(TestPatterns);
74 TESTCASE_AUTO(TestAddRemove);
75 TESTCASE_AUTO(TestCategories);
76 TESTCASE_AUTO(TestCloneEqualHash);
77 TESTCASE_AUTO(TestMinimalRep);
78 TESTCASE_AUTO(TestAPI);
79 TESTCASE_AUTO(TestScriptSet);
80 TESTCASE_AUTO(TestPropertySet);
81 TESTCASE_AUTO(TestClone);
82 TESTCASE_AUTO(TestExhaustive);
83 TESTCASE_AUTO(TestToPattern);
84 TESTCASE_AUTO(TestIndexOf);
85 TESTCASE_AUTO(TestStrings);
86 TESTCASE_AUTO(Testj2268);
87 TESTCASE_AUTO(TestCloseOver);
88 TESTCASE_AUTO(TestEscapePattern);
89 TESTCASE_AUTO(TestInvalidCodePoint);
90 TESTCASE_AUTO(TestSymbolTable);
91 TESTCASE_AUTO(TestSurrogate);
92 TESTCASE_AUTO(TestPosixClasses);
93 TESTCASE_AUTO(TestIteration);
94 TESTCASE_AUTO(TestFreezable);
95 TESTCASE_AUTO(TestSpan);
96 TESTCASE_AUTO(TestStringSpan);
97 TESTCASE_AUTO(TestUCAUnsafeBackwards);
98 TESTCASE_AUTO(TestIntOverflow);
99 TESTCASE_AUTO(TestUnusedCcc);
100 TESTCASE_AUTO(TestDeepPattern);
101 TESTCASE_AUTO(TestEmptyString);
102 TESTCASE_AUTO_END;
103 }
104
// Marker placed inside the expected-string lists handed to expectToPattern().
// NOTE(review): it appears to separate strings expected to be in the set from
// strings expected to be absent - confirm against expectToPattern().
static constexpr char NOT[] = "%%%%";
106
107 /**
108 * UVector was improperly copying contents
109 * This code will crash this is still true
110 */
Testj2268()111 void UnicodeSetTest::Testj2268() {
112 UnicodeSet t;
113 t.add(UnicodeString("abc"));
114 UnicodeSet test(t);
115 UnicodeString ustrPat;
116 test.toPattern(ustrPat, TRUE);
117 }
118
119 /**
120 * Test toPattern().
121 */
TestToPattern()122 void UnicodeSetTest::TestToPattern() {
123 UErrorCode ec = U_ZERO_ERROR;
124
125 // Test that toPattern() round trips with syntax characters and
126 // whitespace.
127 {
128 static const char* OTHER_TOPATTERN_TESTS[] = {
129 "[[:latin:]&[:greek:]]",
130 "[[:latin:]-[:greek:]]",
131 "[:nonspacing mark:]",
132 NULL
133 };
134
135 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
136 ec = U_ZERO_ERROR;
137 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
138 if (U_FAILURE(ec)) {
139 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
140 continue;
141 }
142 checkPat(OTHER_TOPATTERN_TESTS[j], s);
143 }
144
145 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
146 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
147
148 // check various combinations to make sure they all work.
149 if (i != 0 && !toPatternAux(i, i)){
150 continue;
151 }
152 if (!toPatternAux(0, i)){
153 continue;
154 }
155 if (!toPatternAux(i, 0xFFFF)){
156 continue;
157 }
158 }
159 }
160 }
161
162 // Test pattern behavior of multicharacter strings.
163 {
164 ec = U_ZERO_ERROR;
165 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
166
167 // This loop isn't a loop. It's here to make the compiler happy.
168 // If you're curious, try removing it and changing the 'break'
169 // statements (except for the last) to goto's.
170 for (;;) {
171 if (U_FAILURE(ec)) break;
172 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
173 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
174
175 s->add("ac");
176 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
177 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
178
179 s->applyPattern(u"[a-z {\\{l} {r\\}}]", ec);
180 if (U_FAILURE(ec)) break;
181 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
182 expectToPattern(*s, u"[a-z{r\\}}{\\{l}]", exp3);
183
184 s->add("[]");
185 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
186 expectToPattern(*s, u"[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
187
188 s->applyPattern(u"[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
189 if (U_FAILURE(ec)) break;
190 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
191 expectToPattern(*s, u"[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
192
193 // j2189
194 s->clear();
195 s->add(UnicodeString("abc", ""));
196 s->add(UnicodeString("abc", ""));
197 const char* exp6[] = {"abc", NOT, "ab", NULL};
198 expectToPattern(*s, "[{abc}]", exp6);
199
200 break;
201 }
202
203 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
204 delete s;
205 }
206
207 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
208 UnicodeSet s;
209 s.add(u'a', u'b');
210 expectToPattern(s, "[ab]", NULL);
211 }
212
toPatternAux(UChar32 start,UChar32 end)213 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
214
215 // use Integer.toString because Utility.hex doesn't handle ints
216 UnicodeString pat = "";
217 // TODO do these in hex
218 //String source = "0x" + Integer.toString(start,16).toUpperCase();
219 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
220 UnicodeString source;
221 source = source + (uint32_t)start;
222 if (start != end)
223 source = source + ".." + (uint32_t)end;
224 UnicodeSet testSet;
225 testSet.add(start, end);
226 return checkPat(source, testSet);
227 }
228
checkPat(const UnicodeString & source,const UnicodeSet & testSet)229 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
230 const UnicodeSet& testSet) {
231 // What we want to make sure of is that a pattern generated
232 // by toPattern(), with or without escaped unprintables, can
233 // be passed back into the UnicodeSet constructor.
234 UnicodeString pat0;
235
236 testSet.toPattern(pat0, TRUE);
237
238 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
239
240 //String pat1 = unescapeLeniently(pat0);
241 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
242
243 UnicodeString pat2;
244 testSet.toPattern(pat2, FALSE);
245 if (!checkPat(source, testSet, pat2)) return FALSE;
246
247 //String pat3 = unescapeLeniently(pat2);
248 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
249
250 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
251 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
252 return TRUE;
253 }
254
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)255 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
256 const UnicodeSet& testSet,
257 const UnicodeString& pat) {
258 UErrorCode ec = U_ZERO_ERROR;
259 UnicodeSet testSet2(pat, ec);
260 if (testSet2 != testSet) {
261 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
262 return FALSE;
263 }
264 return TRUE;
265 }
266
267 void
TestPatterns(void)268 UnicodeSetTest::TestPatterns(void) {
269 UnicodeSet set;
270 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
271 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
272 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
273 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
274 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
275 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
276
277 // Throw in a test of complement
278 set.complement();
279 UnicodeString exp;
280 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(u'z'+1)).append(u'\uFFFF');
281 expectPairs(set, exp);
282 }
283
284 void
TestCategories(void)285 UnicodeSetTest::TestCategories(void) {
286 UErrorCode status = U_ZERO_ERROR;
287 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
288 UnicodeSet set(pat, status);
289 if (U_FAILURE(status)) {
290 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
291 return;
292 } else {
293 expectContainment(set, pat, "ABC", "abc");
294 }
295
296 UChar32 i;
297 int32_t failures = 0;
298 // Make sure generation of L doesn't pollute cached Lu set
299 // First generate L, then Lu
300 set.applyPattern("[:L:]", status);
301 if (U_FAILURE(status)) { errln("FAIL"); return; }
302 for (i=0; i<0x200; ++i) {
303 UBool l = u_isalpha((UChar)i);
304 if (l != set.contains(i)) {
305 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
306 set.contains(i));
307 if (++failures == 10) break;
308 }
309 }
310
311 set.applyPattern("[:Lu:]", status);
312 if (U_FAILURE(status)) { errln("FAIL"); return; }
313 for (i=0; i<0x200; ++i) {
314 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
315 if (lu != set.contains(i)) {
316 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
317 set.contains(i));
318 if (++failures == 20) break;
319 }
320 }
321 }
322 void
TestCloneEqualHash(void)323 UnicodeSetTest::TestCloneEqualHash(void) {
324 UErrorCode status = U_ZERO_ERROR;
325 // set1 and set2 used to be built with the obsolete constructor taking
326 // UCharCategory values; replaced with pattern constructors
327 // markus 20030502
328 UnicodeSet *set1=new UnicodeSet(u"\\p{Lowercase Letter}", status); // :Ll: Letter, lowercase
329 UnicodeSet *set1a=new UnicodeSet(u"[:Ll:]", status); // Letter, lowercase
330 if (U_FAILURE(status)){
331 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
332 return;
333 }
334 UnicodeSet *set2=new UnicodeSet(u"\\p{Decimal Number}", status); //Number, Decimal digit
335 UnicodeSet *set2a=new UnicodeSet(u"[:Nd:]", status); //Number, Decimal digit
336 if (U_FAILURE(status)){
337 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
338 return;
339 }
340
341 if (*set1 != *set1a) {
342 errln("FAIL: category constructor for Ll broken");
343 }
344 if (*set2 != *set2a) {
345 errln("FAIL: category constructor for Nd broken");
346 }
347 delete set1a;
348 delete set2a;
349
350 logln("Testing copy construction");
351 UnicodeSet *set1copy=new UnicodeSet(*set1);
352 if(*set1 != *set1copy || *set1 == *set2 ||
353 getPairs(*set1) != getPairs(*set1copy) ||
354 set1->hashCode() != set1copy->hashCode()){
355 errln("FAIL : Error in copy construction");
356 return;
357 }
358
359 logln("Testing =operator");
360 UnicodeSet set1equal=*set1;
361 UnicodeSet set2equal=*set2;
362 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
363 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
364 errln("FAIL: Error in =operator");
365 }
366
367 logln("Testing clone()");
368 UnicodeSet *set1clone=set1->clone();
369 UnicodeSet *set2clone=set2->clone();
370 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
371 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
372 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
373 errln("FAIL: Error in clone");
374 }
375
376 logln("Testing hashcode");
377 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
378 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
379 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
380 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
381 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
382 errln("FAIL: Error in hashCode()");
383 }
384
385 delete set1;
386 delete set1copy;
387 delete set2;
388 delete set1clone;
389 delete set2clone;
390
391
392 }
393 void
TestAddRemove(void)394 UnicodeSetTest::TestAddRemove(void) {
395 UnicodeSet set; // Construct empty set
396 doAssert(set.isEmpty() == TRUE, "set should be empty");
397 doAssert(set.size() == 0, "size should be 0");
398 set.complement();
399 doAssert(set.size() == 0x110000, "size should be 0x110000");
400 set.clear();
401 set.add(0x0061, 0x007a);
402 expectPairs(set, "az");
403 doAssert(set.isEmpty() == FALSE, "set should not be empty");
404 doAssert(set.size() != 0, "size should not be equal to 0");
405 doAssert(set.size() == 26, "size should be equal to 26");
406 set.remove(0x006d, 0x0070);
407 expectPairs(set, "alqz");
408 doAssert(set.size() == 22, "size should be equal to 22");
409 set.remove(0x0065, 0x0067);
410 expectPairs(set, "adhlqz");
411 doAssert(set.size() == 19, "size should be equal to 19");
412 set.remove(0x0064, 0x0069);
413 expectPairs(set, "acjlqz");
414 doAssert(set.size() == 16, "size should be equal to 16");
415 set.remove(0x0063, 0x0072);
416 expectPairs(set, "absz");
417 doAssert(set.size() == 10, "size should be equal to 10");
418 set.add(0x0066, 0x0071);
419 expectPairs(set, "abfqsz");
420 doAssert(set.size() == 22, "size should be equal to 22");
421 set.remove(0x0061, 0x0067);
422 expectPairs(set, "hqsz");
423 set.remove(0x0061, 0x007a);
424 expectPairs(set, "");
425 doAssert(set.isEmpty() == TRUE, "set should be empty");
426 doAssert(set.size() == 0, "size should be 0");
427 set.add(0x0061);
428 doAssert(set.isEmpty() == FALSE, "set should not be empty");
429 doAssert(set.size() == 1, "size should not be equal to 1");
430 set.add(0x0062);
431 set.add(0x0063);
432 expectPairs(set, "ac");
433 doAssert(set.size() == 3, "size should not be equal to 3");
434 set.add(0x0070);
435 set.add(0x0071);
436 expectPairs(set, "acpq");
437 doAssert(set.size() == 5, "size should not be equal to 5");
438 set.clear();
439 expectPairs(set, "");
440 doAssert(set.isEmpty() == TRUE, "set should be empty");
441 doAssert(set.size() == 0, "size should be 0");
442
443 // Try removing an entire set from another set
444 expectPattern(set, "[c-x]", "cx");
445 UnicodeSet set2;
446 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
447 set.removeAll(set2);
448 expectPairs(set, "deluxx");
449
450 // Try adding an entire set to another set
451 expectPattern(set, "[jackiemclean]", "aacceein");
452 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
453 set.addAll(set2);
454 expectPairs(set, "aacehort");
455 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
456
457 // Try retaining an set of elements contained in another set (intersection)
458 UnicodeSet set3;
459 expectPattern(set3, "[a-c]", "ac");
460 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
461 set3.remove(0x0062);
462 expectPairs(set3, "aacc");
463 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
464 set.retainAll(set3);
465 expectPairs(set, "aacc");
466 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
467 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
468 set.clear();
469 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
470
471 // Test commutativity
472 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
473 expectPattern(set2, "[jackiemclean]", "aacceein");
474 set.addAll(set2);
475 expectPairs(set, "aacehort");
476 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
477
478
479
480
481 }
482
483 /**
484 * Make sure minimal representation is maintained.
485 */
TestMinimalRep()486 void UnicodeSetTest::TestMinimalRep() {
487 UErrorCode status = U_ZERO_ERROR;
488 // This is pretty thoroughly tested by checkCanonicalRep()
489 // run against the exhaustive operation results. Use the code
490 // here for debugging specific spot problems.
491
492 // 1 overlap against 2
493 UnicodeSet set("[h-km-q]", status);
494 if (U_FAILURE(status)) { errln("FAIL"); return; }
495 UnicodeSet set2("[i-o]", status);
496 if (U_FAILURE(status)) { errln("FAIL"); return; }
497 set.addAll(set2);
498 expectPairs(set, "hq");
499 // right
500 set.applyPattern("[a-m]", status);
501 if (U_FAILURE(status)) { errln("FAIL"); return; }
502 set2.applyPattern("[e-o]", status);
503 if (U_FAILURE(status)) { errln("FAIL"); return; }
504 set.addAll(set2);
505 expectPairs(set, "ao");
506 // left
507 set.applyPattern("[e-o]", status);
508 if (U_FAILURE(status)) { errln("FAIL"); return; }
509 set2.applyPattern("[a-m]", status);
510 if (U_FAILURE(status)) { errln("FAIL"); return; }
511 set.addAll(set2);
512 expectPairs(set, "ao");
513 // 1 overlap against 3
514 set.applyPattern("[a-eg-mo-w]", status);
515 if (U_FAILURE(status)) { errln("FAIL"); return; }
516 set2.applyPattern("[d-q]", status);
517 if (U_FAILURE(status)) { errln("FAIL"); return; }
518 set.addAll(set2);
519 expectPairs(set, "aw");
520 }
521
TestAPI()522 void UnicodeSetTest::TestAPI() {
523 UErrorCode status = U_ZERO_ERROR;
524 // default ct
525 UnicodeSet set;
526 if (!set.isEmpty() || set.getRangeCount() != 0) {
527 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
528 set);
529 }
530
531 // clear(), isEmpty()
532 set.add(0x0061);
533 if (set.isEmpty()) {
534 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
535 set);
536 }
537 set.clear();
538 if (!set.isEmpty()) {
539 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
540 set);
541 }
542
543 // size()
544 set.clear();
545 if (set.size() != 0) {
546 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
547 ": " + set);
548 }
549 set.add(0x0061);
550 if (set.size() != 1) {
551 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
552 ": " + set);
553 }
554 set.add(0x0031, 0x0039);
555 if (set.size() != 10) {
556 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
557 ": " + set);
558 }
559
560 // contains(first, last)
561 set.clear();
562 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
563 if (U_FAILURE(status)) { errln("FAIL"); return; }
564 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
565 UChar32 a = set.getRangeStart(i);
566 UChar32 b = set.getRangeEnd(i);
567 if (!set.contains(a, b)) {
568 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
569 " but doesn't: " + set);
570 }
571 if (set.contains((UChar32)(a-1), b)) {
572 errln((UnicodeString)"FAIL, shouldn't contain " +
573 (unsigned short)(a-1) + '-' + (unsigned short)b +
574 " but does: " + set);
575 }
576 if (set.contains(a, (UChar32)(b+1))) {
577 errln((UnicodeString)"FAIL, shouldn't contain " +
578 (unsigned short)a + '-' + (unsigned short)(b+1) +
579 " but does: " + set);
580 }
581 }
582
583 // Ported InversionList test.
584 UnicodeSet a((UChar32)3,(UChar32)10);
585 UnicodeSet b((UChar32)7,(UChar32)15);
586 UnicodeSet c;
587
588 logln((UnicodeString)"a [3-10]: " + a);
589 logln((UnicodeString)"b [7-15]: " + b);
590 c = a;
591 c.addAll(b);
592 UnicodeSet exp((UChar32)3,(UChar32)15);
593 if (c == exp) {
594 logln((UnicodeString)"c.set(a).add(b): " + c);
595 } else {
596 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
597 }
598 c.complement();
599 exp.set((UChar32)0, (UChar32)2);
600 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
601 if (c == exp) {
602 logln((UnicodeString)"c.complement(): " + c);
603 } else {
604 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
605 }
606 c.complement();
607 exp.set((UChar32)3, (UChar32)15);
608 if (c == exp) {
609 logln((UnicodeString)"c.complement(): " + c);
610 } else {
611 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
612 }
613 c = a;
614 c.complementAll(b);
615 exp.set((UChar32)3,(UChar32)6);
616 exp.add((UChar32)11,(UChar32) 15);
617 if (c == exp) {
618 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
619 } else {
620 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
621 }
622
623 exp = c;
624 bitsToSet(setToBits(c), c);
625 if (c == exp) {
626 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
627 } else {
628 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
629 }
630
631 // Additional tests for coverage JB#2118
632 //UnicodeSet::complement(class UnicodeString const &)
633 //UnicodeSet::complementAll(class UnicodeString const &)
634 //UnicodeSet::containsNone(class UnicodeSet const &)
635 //UnicodeSet::containsNone(long,long)
636 //UnicodeSet::containsSome(class UnicodeSet const &)
637 //UnicodeSet::containsSome(long,long)
638 //UnicodeSet::removeAll(class UnicodeString const &)
639 //UnicodeSet::retain(long)
640 //UnicodeSet::retainAll(class UnicodeString const &)
641 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
642 //UnicodeSetIterator::getString(void)
643 set.clear();
644 set.complement("ab");
645 exp.applyPattern("[{ab}]", status);
646 if (U_FAILURE(status)) { errln("FAIL"); return; }
647 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
648
649 UnicodeSetIterator iset(set);
650 if (!iset.next() || !iset.isString()) {
651 errln("FAIL: UnicodeSetIterator::next/isString");
652 } else if (iset.getString() != "ab") {
653 errln("FAIL: UnicodeSetIterator::getString");
654 }
655
656 set.add(u'a', u'z');
657 set.complementAll("alan");
658 exp.applyPattern("[{ab}b-kmo-z]", status);
659 if (U_FAILURE(status)) { errln("FAIL"); return; }
660 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
661
662 exp.applyPattern("[a-z]", status);
663 if (U_FAILURE(status)) { errln("FAIL"); return; }
664 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
665 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
666 exp.applyPattern("[aln]", status);
667 if (U_FAILURE(status)) { errln("FAIL"); return; }
668 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
669 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
670
671 if (set.containsNone(u'a', u'z')) {
672 errln("FAIL: containsNone(UChar32, UChar32)");
673 }
674 if (!set.containsSome(u'a', u'z')) {
675 errln("FAIL: containsSome(UChar32, UChar32)");
676 }
677 if (!set.containsNone(u'A', u'Z')) {
678 errln("FAIL: containsNone(UChar32, UChar32)");
679 }
680 if (set.containsSome(u'A', u'Z')) {
681 errln("FAIL: containsSome(UChar32, UChar32)");
682 }
683
684 set.removeAll("liu");
685 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
686 if (U_FAILURE(status)) { errln("FAIL"); return; }
687 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
688
689 set.retainAll("star");
690 exp.applyPattern("[rst]", status);
691 if (U_FAILURE(status)) { errln("FAIL"); return; }
692 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
693
694 set.retain(u's');
695 exp.applyPattern("[s]", status);
696 if (U_FAILURE(status)) { errln("FAIL"); return; }
697 if (set != exp) { errln("FAIL: retain('s')"); return; }
698
699 // ICU 2.6 coverage tests
700 // public final UnicodeSet retain(String s);
701 // public final UnicodeSet remove(int c);
702 // public final UnicodeSet remove(String s);
703 // public int hashCode();
704 set.applyPattern(u"[a-z{ab}{cd}]", status);
705 if (U_FAILURE(status)) { errln("FAIL"); return; }
706 set.retain(u"cd");
707 exp.applyPattern(u"[{cd}]", status);
708 if (U_FAILURE(status)) { errln("FAIL"); return; }
709 if (set != exp) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
710
711 set.applyPattern(u"[a-z{ab}{yz}]", status);
712 if (U_FAILURE(status)) { errln("FAIL"); return; }
713 set.retain(u"cd");
714 exp.clear();
715 if (set != exp) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
716
717 set.applyPattern(u"[a-z{ab}{cd}]", status);
718 if (U_FAILURE(status)) { errln("FAIL"); return; }
719 set.remove(u'c');
720 exp.applyPattern(u"[abd-z{ab}{cd}]", status);
721 if (set != exp) { errln("FAIL: remove('c')"); return; }
722
723 set.remove(u"cd");
724 exp.applyPattern(u"[abd-z{ab}]", status);
725 if (U_FAILURE(status)) { errln("FAIL"); return; }
726 if (set != exp) { errln("FAIL: remove(\"cd\")"); return; }
727
728 set.applyPattern("[s]", status);
729 if (U_FAILURE(status)) { errln("FAIL"); return; }
730 uint16_t buf[32];
731 int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
732 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
733 if (slen != 3 || buf[0] != 2 || buf[1] != u's' || buf[2] != u't') {
734 errln("FAIL: serialize");
735 return;
736 }
737
738 // Conversions to and from USet
739 UnicodeSet *uniset = &set;
740 USet *uset = uniset->toUSet();
741 TEST_ASSERT((void *)uset == (void *)uniset);
742 UnicodeSet *setx = UnicodeSet::fromUSet(uset);
743 TEST_ASSERT((void *)setx == (void *)uset);
744 const UnicodeSet *constSet = uniset;
745 const USet *constUSet = constSet->toUSet();
746 TEST_ASSERT((void *)constUSet == (void *)constSet);
747 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
748 TEST_ASSERT((void *)constSetx == (void *)constUSet);
749
750 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
751 UnicodeString longString=u"aaaaaaaaaabbbbbbbbbbcccccccccc";
752 UnicodeSet ac(0x61, 0x63);
753 ac.remove(0x62).freeze();
754 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
755 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
756 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
757 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
758 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
759 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
760 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
761 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
762 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
763 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
764 ) {
765 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
766 }
767 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
768 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
769 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
770 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
771 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
772 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
773 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
774 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
775 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
776 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
777 ) {
778 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
779 }
780 }
781
TestIteration()782 void UnicodeSetTest::TestIteration() {
783 UErrorCode ec = U_ZERO_ERROR;
784 int i = 0;
785 int outerLoop;
786
787 // 6 code points, 3 ranges, 2 strings, 8 total elements
788 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
789 UnicodeSet set(u"[zabyc\\U0001abcd{str1}{str2}]", ec);
790 TEST_ASSERT_SUCCESS(ec);
791 UnicodeSetIterator it(set);
792
793 for (outerLoop=0; outerLoop<3; outerLoop++) {
794 // Run the test multiple times, to check that iterator.reset() is working.
795 for (i=0; i<10; i++) {
796 UBool nextv = it.next();
797 UBool isString = it.isString();
798 int32_t codePoint = it.getCodepoint();
799 //int32_t codePointEnd = it.getCodepointEnd();
800 UnicodeString s = it.getString();
801 switch (i) {
802 case 0:
803 TEST_ASSERT(nextv == TRUE);
804 TEST_ASSERT(isString == FALSE);
805 TEST_ASSERT(codePoint==0x61);
806 TEST_ASSERT(s == "a");
807 break;
808 case 1:
809 TEST_ASSERT(nextv == TRUE);
810 TEST_ASSERT(isString == FALSE);
811 TEST_ASSERT(codePoint==0x62);
812 TEST_ASSERT(s == "b");
813 break;
814 case 2:
815 TEST_ASSERT(nextv == TRUE);
816 TEST_ASSERT(isString == FALSE);
817 TEST_ASSERT(codePoint==0x63);
818 TEST_ASSERT(s == "c");
819 break;
820 case 3:
821 TEST_ASSERT(nextv == TRUE);
822 TEST_ASSERT(isString == FALSE);
823 TEST_ASSERT(codePoint==0x79);
824 TEST_ASSERT(s == "y");
825 break;
826 case 4:
827 TEST_ASSERT(nextv == TRUE);
828 TEST_ASSERT(isString == FALSE);
829 TEST_ASSERT(codePoint==0x7a);
830 TEST_ASSERT(s == "z");
831 break;
832 case 5:
833 TEST_ASSERT(nextv == TRUE);
834 TEST_ASSERT(isString == FALSE);
835 TEST_ASSERT(codePoint==0x1abcd);
836 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
837 break;
838 case 6:
839 TEST_ASSERT(nextv == TRUE);
840 TEST_ASSERT(isString == TRUE);
841 TEST_ASSERT(s == "str1");
842 break;
843 case 7:
844 TEST_ASSERT(nextv == TRUE);
845 TEST_ASSERT(isString == TRUE);
846 TEST_ASSERT(s == "str2");
847 break;
848 case 8:
849 TEST_ASSERT(nextv == FALSE);
850 break;
851 case 9:
852 TEST_ASSERT(nextv == FALSE);
853 break;
854 }
855 }
856 it.reset(); // prepare to run the iteration again.
857 }
858 }
859
860
861
862
TestStrings()863 void UnicodeSetTest::TestStrings() {
864 UErrorCode ec = U_ZERO_ERROR;
865
866 UnicodeSet* testList[] = {
867 UnicodeSet::createFromAll("abc"),
868 new UnicodeSet("[a-c]", ec),
869
870 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
871 new UnicodeSet("[{ll}{ch}a-z]", ec),
872
873 UnicodeSet::createFrom("ab}c"),
874 new UnicodeSet("[{ab\\}c}]", ec),
875
876 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
877 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
878
879 NULL
880 };
881
882 if (U_FAILURE(ec)) {
883 errln("FAIL: couldn't construct test sets");
884 }
885
886 for (int32_t i = 0; testList[i] != NULL; i+=2) {
887 if (U_SUCCESS(ec)) {
888 UnicodeString pat0, pat1;
889 testList[i]->toPattern(pat0, TRUE);
890 testList[i+1]->toPattern(pat1, TRUE);
891 if (*testList[i] == *testList[i+1]) {
892 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
893 } else {
894 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
895 }
896 }
897 delete testList[i];
898 delete testList[i+1];
899 }
900 }
901
902 /**
903 * Test the [:Latin:] syntax.
904 */
TestScriptSet()905 void UnicodeSetTest::TestScriptSet() {
906 expectContainment(u"[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
907
908 expectContainment(u"[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
909
910 /* Jitterbug 1423 */
911 expectContainment(u"[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
912
913 }
914
915 /**
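 * Test property patterns ([:...:], \p{...}, \P{...}, \N{...}): each pattern
 * is listed with characters expected IN the set and characters expected NOT in it.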
917 */
TestPropertySet()918 void UnicodeSetTest::TestPropertySet() {
919 static const char* const DATA[] = {
920 // Pattern, Chars IN, Chars NOT in
921
922 "[:Latin:]",
923 "aA",
924 "\\u0391\\u03B1",
925
926 "[\\p{Greek}]",
927 "\\u0391\\u03B1",
928 "aA",
929
930 "\\P{ GENERAL Category = upper case letter }",
931 "abc",
932 "ABC",
933
934 #if !UCONFIG_NO_NORMALIZATION
935 // Combining class: @since ICU 2.2
936 // Check both symbolic and numeric
937 "\\p{ccc=Nukta}",
938 "\\u0ABC",
939 "abc",
940
941 "\\p{Canonical Combining Class = 11}",
942 "\\u05B1",
943 "\\u05B2",
944
945 "[:c c c = iota subscript :]",
946 "\\u0345",
947 "xyz",
948 #endif
949
950 // Bidi class: @since ICU 2.2
951 "\\p{bidiclass=lefttoright}",
952 "abc",
953 "\\u0671\\u0672",
954
955 // Binary properties: @since ICU 2.2
956 "\\p{ideographic}",
957 "\\u4E0A",
958 "x",
959
960 "[:math=false:]",
961 "q)*(",
962 // weiv: )(and * were removed from math in Unicode 4.0.1
963 //"(*+)",
964 "+<>^",
965
966 // JB#1767 \N{}, \p{ASCII}
967 "[:Ascii:]",
968 "abc\\u0000\\u007F",
969 "\\u0080\\u4E00",
970
971 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
972 "az",
973 "qrs",
974
975 // JB#2015
976 "[:any:]",
977 "a\\U0010FFFF",
978 "",
979
980 "[:nv=0.5:]",
981 "\\u00BD\\u0F2A",
982 "\\u00BC",
983
984 // JB#2653: Age
985 "[:Age=1.1:]",
986 "\\u03D6", // 1.1
987 "\\u03D8\\u03D9", // 3.2
988
989 "[:Age=3.1:]",
990 "\\u1800\\u3400\\U0002f800",
991 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
992
993 // JB#2350: Case_Sensitive
994 "[:Case Sensitive:]",
995 "A\\u1FFC\\U00010410",
996 ";\\u00B4\\U00010500",
997
998 // JB#2832: C99-compatibility props
999 "[:blank:]",
1000 " \\u0009",
1001 "1-9A-Z",
1002
1003 "[:graph:]",
1004 "19AZ",
1005 " \\u0003\\u0007\\u0009\\u000A\\u000D",
1006
1007 "[:punct:]",
1008 "!@#%&*()[]{}-_\\/;:,.?'\"",
1009 "09azAZ",
1010
1011 "[:xdigit:]",
1012 "09afAF",
1013 "gG!",
1014
1015 // Regex compatibility test
1016 "[-b]", // leading '-' is literal
1017 "-b",
1018 "ac",
1019
1020 "[^-b]", // leading '-' is literal
1021 "ac",
1022 "-b",
1023
1024 "[b-]", // trailing '-' is literal
1025 "-b",
1026 "ac",
1027
1028 "[^b-]", // trailing '-' is literal
1029 "ac",
1030 "-b",
1031
1032 "[a-b-]", // trailing '-' is literal
1033 "ab-",
1034 "c=",
1035
1036 "[[a-q]&[p-z]-]", // trailing '-' is literal
1037 "pq-",
1038 "or=",
1039
1040 "[\\s|\\)|:|$|\\>]", // from regex tests
1041 "s|):$>",
1042 "abc",
1043
1044 "[\\uDC00cd]", // JB#2906: isolated trail at start
1045 "cd\\uDC00",
1046 "ab\\uD800\\U00010000",
1047
1048 "[ab\\uD800]", // JB#2906: isolated trail at start
1049 "ab\\uD800",
1050 "cd\\uDC00\\U00010000",
1051
1052 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1053 "abcd\\uD800",
1054 "ef\\uDC00\\U00010000",
1055
1056 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1057 "abcd\\uDC00",
1058 "ef\\uD800\\U00010000",
1059
1060 #if !UCONFIG_NO_NORMALIZATION
1061 "[:^lccc=0:]", // Lead canonical class
1062 "\\u0300\\u0301",
1063 "abcd\\u00c0\\u00c5",
1064
1065 "[:^tccc=0:]", // Trail canonical class
1066 "\\u0300\\u0301\\u00c0\\u00c5",
1067 "abcd",
1068
1069 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1070 "\\u0300\\u0301\\u00c0\\u00c5",
1071 "abcd",
1072
1073 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1074 "",
1075 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1076
1077 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1078 "\\u0F73\\u0F75\\u0F81",
1079 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1080 #endif /* !UCONFIG_NO_NORMALIZATION */
1081
1082 "[:Assigned:]",
1083 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1084 "\\u0888\\uFDD3\\uFFFE\\U00050005",
1085
1086 // Script_Extensions, new in Unicode 6.0
1087 "[:scx=Arab:]",
1088 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1089 "\\u061D\\uFDEF\\uFDFE",
1090
1091 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1092 // so scx-sc is missing U+FDF2.
1093 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1094 "\\u0640\\u064B\\u0650\\u0655",
1095 "\\uFDF2"
1096 };
1097
1098 static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1099
1100 for (int32_t i=0; i<DATA_LEN; i+=3) {
1101 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1102 CharsToUnicodeString(DATA[i+2]));
1103 }
1104 }
1105
1106 /**
1107 * Test that Posix style character classes [:digit:], etc.
1108 * have the Unicode definitions from TR 18.
1109 */
TestPosixClasses()1110 void UnicodeSetTest::TestPosixClasses() {
1111 {
1112 UErrorCode status = U_ZERO_ERROR;
1113 UnicodeSet s1("[:alpha:]", status);
1114 UnicodeSet s2(u"\\p{Alphabetic}", status);
1115 TEST_ASSERT_SUCCESS(status);
1116 TEST_ASSERT(s1==s2);
1117 }
1118 {
1119 UErrorCode status = U_ZERO_ERROR;
1120 UnicodeSet s1("[:lower:]", status);
1121 UnicodeSet s2(u"\\p{lowercase}", status);
1122 TEST_ASSERT_SUCCESS(status);
1123 TEST_ASSERT(s1==s2);
1124 }
1125 {
1126 UErrorCode status = U_ZERO_ERROR;
1127 UnicodeSet s1("[:upper:]", status);
1128 UnicodeSet s2(u"\\p{Uppercase}", status);
1129 TEST_ASSERT_SUCCESS(status);
1130 TEST_ASSERT(s1==s2);
1131 }
1132 {
1133 UErrorCode status = U_ZERO_ERROR;
1134 UnicodeSet s1("[:punct:]", status);
1135 UnicodeSet s2(u"\\p{gc=Punctuation}", status);
1136 TEST_ASSERT_SUCCESS(status);
1137 TEST_ASSERT(s1==s2);
1138 }
1139 {
1140 UErrorCode status = U_ZERO_ERROR;
1141 UnicodeSet s1("[:digit:]", status);
1142 UnicodeSet s2(u"\\p{gc=DecimalNumber}", status);
1143 TEST_ASSERT_SUCCESS(status);
1144 TEST_ASSERT(s1==s2);
1145 }
1146 {
1147 UErrorCode status = U_ZERO_ERROR;
1148 UnicodeSet s1("[:xdigit:]", status);
1149 UnicodeSet s2(u"[\\p{DecimalNumber}\\p{HexDigit}]", status);
1150 TEST_ASSERT_SUCCESS(status);
1151 TEST_ASSERT(s1==s2);
1152 }
1153 {
1154 UErrorCode status = U_ZERO_ERROR;
1155 UnicodeSet s1("[:alnum:]", status);
1156 UnicodeSet s2(u"[\\p{Alphabetic}\\p{DecimalNumber}]", status);
1157 TEST_ASSERT_SUCCESS(status);
1158 TEST_ASSERT(s1==s2);
1159 }
1160 {
1161 UErrorCode status = U_ZERO_ERROR;
1162 UnicodeSet s1("[:space:]", status);
1163 UnicodeSet s2(u"\\p{Whitespace}", status);
1164 TEST_ASSERT_SUCCESS(status);
1165 TEST_ASSERT(s1==s2);
1166 }
1167 {
1168 UErrorCode status = U_ZERO_ERROR;
1169 UnicodeSet s1("[:blank:]", status);
1170 TEST_ASSERT_SUCCESS(status);
1171 UnicodeSet s2(u"[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1172 status);
1173 TEST_ASSERT_SUCCESS(status);
1174 TEST_ASSERT(s1==s2);
1175 }
1176 {
1177 UErrorCode status = U_ZERO_ERROR;
1178 UnicodeSet s1("[:cntrl:]", status);
1179 TEST_ASSERT_SUCCESS(status);
1180 UnicodeSet s2(u"\\p{Control}", status);
1181 TEST_ASSERT_SUCCESS(status);
1182 TEST_ASSERT(s1==s2);
1183 }
1184 {
1185 UErrorCode status = U_ZERO_ERROR;
1186 UnicodeSet s1("[:graph:]", status);
1187 TEST_ASSERT_SUCCESS(status);
1188 UnicodeSet s2(u"[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
1189 TEST_ASSERT_SUCCESS(status);
1190 TEST_ASSERT(s1==s2);
1191 }
1192 {
1193 UErrorCode status = U_ZERO_ERROR;
1194 UnicodeSet s1("[:print:]", status);
1195 TEST_ASSERT_SUCCESS(status);
1196 UnicodeSet s2(u"[[:graph:][:blank:]-[\\p{Control}]]", status);
1197 TEST_ASSERT_SUCCESS(status);
1198 TEST_ASSERT(s1==s2);
1199 }
1200 }
1201 /**
1202 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1203 */
TestClone()1204 void UnicodeSetTest::TestClone() {
1205 UErrorCode ec = U_ZERO_ERROR;
1206 UnicodeSet s("[abcxyz]", ec);
1207 UnicodeSet t(s);
1208 expectContainment(t, "abc", "def");
1209 }
1210
1211 /**
1212 * Test the indexOf() and charAt() methods.
1213 */
TestIndexOf()1214 void UnicodeSetTest::TestIndexOf() {
1215 UErrorCode ec = U_ZERO_ERROR;
1216 UnicodeSet set("[a-cx-y3578]", ec);
1217 if (U_FAILURE(ec)) {
1218 errln("FAIL: UnicodeSet constructor");
1219 return;
1220 }
1221 for (int32_t i=0; i<set.size(); ++i) {
1222 UChar32 c = set.charAt(i);
1223 if (set.indexOf(c) != i) {
1224 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1225 i, c, set.indexOf(c));
1226 }
1227 }
1228 UChar32 c = set.charAt(set.size());
1229 if (c != -1) {
1230 errln("FAIL: charAt(<out of range>) = %X", c);
1231 }
1232 int32_t j = set.indexOf(u'q');
1233 if (j != -1) {
1234 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1235 }
1236 }
1237
1238 /**
1239 * Test closure API.
1240 */
TestCloseOver()1241 void UnicodeSetTest::TestCloseOver() {
1242 UErrorCode ec = U_ZERO_ERROR;
1243
1244 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1245 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1246 const char* DATA[] = {
1247 // selector, input, output
1248 CASE,
1249 "[aq\\u00DF{Bc}{bC}{Fi}]",
1250 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1251
1252 CASE,
1253 "[\\u01F1]", // 'DZ'
1254 "[\\u01F1\\u01F2\\u01F3]",
1255
1256 CASE,
1257 "[\\u1FB4]",
1258 "[\\u1FB4{\\u03AC\\u03B9}]",
1259
1260 CASE,
1261 "[{F\\uFB01}]",
1262 "[\\uFB03{ffi}]",
1263
1264 CASE, // make sure binary search finds limits
1265 "[a\\uFF3A]",
1266 "[aA\\uFF3A\\uFF5A]",
1267
1268 CASE,
1269 "[a-z]","[A-Za-z\\u017F\\u212A]",
1270 CASE,
1271 "[abc]","[A-Ca-c]",
1272 CASE,
1273 "[ABC]","[A-Ca-c]",
1274
1275 CASE, "[i]", "[iI]",
1276
1277 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1278 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1279
1280 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1281
1282 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1283
1284 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1285
1286 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1287
1288 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1289
1290 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1291
1292 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1293 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1294
1295 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1296
1297 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1298
1299 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1300
1301 #if !UCONFIG_NO_FILE_IO
1302 CASE_MAPPINGS,
1303 "[aq\\u00DF{Bc}{bC}{Fi}]",
1304 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1305 #endif
1306
1307 CASE_MAPPINGS,
1308 "[\\u01F1]", // 'DZ'
1309 "[\\u01F1\\u01F2\\u01F3]",
1310
1311 CASE_MAPPINGS,
1312 "[a-z]",
1313 "[A-Za-z]",
1314
1315 NULL
1316 };
1317
1318 UnicodeSet s;
1319 UnicodeSet t;
1320 UnicodeString buf;
1321 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1322 int32_t selector = DATA[i][0];
1323 UnicodeString pat(DATA[i+1], -1, US_INV);
1324 UnicodeString exp(DATA[i+2], -1, US_INV);
1325 s.applyPattern(pat, ec);
1326 s.closeOver(selector);
1327 t.applyPattern(exp, ec);
1328 if (U_FAILURE(ec)) {
1329 errln("FAIL: applyPattern failed");
1330 continue;
1331 }
1332 if (s == t) {
1333 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1334 } else {
1335 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1336 s.toPattern(buf, TRUE) + ", expected " + exp);
1337 }
1338 }
1339
1340 #if 0
1341 /*
1342 * Unused test code.
1343 * This was used to compare the old implementation (using USET_CASE)
1344 * with the new one (using 0x100 temporarily)
1345 * while transitioning from hardcoded case closure tables in uniset.cpp
1346 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1347 * and using ucase.c functions for closure.
1348 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1349 *
1350 * Note: The old and new implementation never fully matched because
1351 * the old implementation turned out to not map U+0130 and U+0131 correctly
1352 * (dotted I and dotless i) and because the old implementation's data tables
1353 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1354 * new implementation. (So sigmas and some other characters were not handled
1355 * according to the newer Unicode version.)
1356 */
1357 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1358 UnicodeSetIterator si(sens);
1359 UnicodeString str, buf2;
1360 const UnicodeString *pStr;
1361 UChar32 c;
1362 while(si.next()) {
1363 if(!si.isString()) {
1364 c=si.getCodepoint();
1365 s.clear();
1366 s.add(c);
1367
1368 str.setTo(c);
1369 str.foldCase();
1370 sens2.add(str);
1371
1372 t=s;
1373 s.closeOver(USET_CASE);
1374 t.closeOver(0x100);
1375 if(s!=t) {
1376 errln("FAIL: closeOver(U+%04x) differs: ", c);
1377 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1378 }
1379 }
1380 }
1381 // remove all code points
1382 // should contain all full case folding mapping strings
1383 sens2.remove(0, 0x10ffff);
1384 si.reset(sens2);
1385 while(si.next()) {
1386 if(si.isString()) {
1387 pStr=&si.getString();
1388 s.clear();
1389 s.add(*pStr);
1390 t=s2=s;
1391 s.closeOver(USET_CASE);
1392 t.closeOver(0x100);
1393 if(s!=t) {
1394 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1395 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1396 }
1397 }
1398 }
1399 #endif
1400
1401 // Test the pattern API
1402 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1403 if (U_FAILURE(ec)) {
1404 errln("FAIL: applyPattern failed");
1405 } else {
1406 expectContainment(s, "abcABC", "defDEF");
1407 }
1408 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1409 if (U_FAILURE(ec)) {
1410 errln("FAIL: constructor failed");
1411 } else {
1412 expectContainment(v, "defDEF", "abcABC");
1413 }
1414 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1415 if (U_FAILURE(ec)) {
1416 errln("FAIL: construct w/case mappings failed");
1417 } else {
1418 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1419 }
1420 }
1421
TestEscapePattern()1422 void UnicodeSetTest::TestEscapePattern() {
1423 const char pattern[] =
1424 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1425 const char exp[] =
1426 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1427 // We test this with two passes; in the second pass we
1428 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1429 // this fails -- which is what we expect.
1430 for (int32_t pass=1; pass<=2; ++pass) {
1431 UErrorCode ec = U_ZERO_ERROR;
1432 UnicodeString pat(pattern, -1, US_INV);
1433 if (pass==2) {
1434 pat = pat.unescape();
1435 }
1436 // Pattern is only good for pass 1
1437 UBool isPatternValid = (pass==1);
1438
1439 UnicodeSet set(pat, ec);
1440 if (U_SUCCESS(ec) != isPatternValid){
1441 errln((UnicodeString)"FAIL: applyPattern(" +
1442 escape(pat) + ") => " +
1443 u_errorName(ec));
1444 continue;
1445 }
1446 if (U_FAILURE(ec)) {
1447 continue;
1448 }
1449 if (set.contains(u'\u0644')){
1450 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1451 }
1452
1453 UnicodeString newpat;
1454 set.toPattern(newpat, TRUE);
1455 if (newpat == UnicodeString(exp, -1, US_INV)) {
1456 logln(escape(pat) + " => " + newpat);
1457 } else {
1458 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1459 }
1460
1461 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1462 UnicodeString str("Range ");
1463 str.append((UChar)(u'0' + i))
1464 .append(": ")
1465 .append((UChar32)set.getRangeStart(i))
1466 .append(" - ")
1467 .append((UChar32)set.getRangeEnd(i));
1468 str = str + " (" + set.getRangeStart(i) + " - " +
1469 set.getRangeEnd(i) + ")";
1470 if (set.getRangeStart(i) < 0) {
1471 errln((UnicodeString)"FAIL: " + escape(str));
1472 } else {
1473 logln(escape(str));
1474 }
1475 }
1476 }
1477 }
1478
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1479 void UnicodeSetTest::expectRange(const UnicodeString& label,
1480 const UnicodeSet& set,
1481 UChar32 start, UChar32 end) {
1482 UnicodeSet exp(start, end);
1483 UnicodeString pat;
1484 if (set == exp) {
1485 logln(label + " => " + set.toPattern(pat, TRUE));
1486 } else {
1487 UnicodeString xpat;
1488 errln((UnicodeString)"FAIL: " + label + " => " +
1489 set.toPattern(pat, TRUE) +
1490 ", expected " + exp.toPattern(xpat, TRUE));
1491 }
1492 }
1493
TestInvalidCodePoint()1494 void UnicodeSetTest::TestInvalidCodePoint() {
1495
1496 const UChar32 DATA[] = {
1497 // Test range Expected range
1498 0, 0x10FFFF, 0, 0x10FFFF,
1499 (UChar32)-1, 8, 0, 8,
1500 8, 0x110000, 8, 0x10FFFF
1501 };
1502 const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1503
1504 UnicodeString pat;
1505 int32_t i;
1506
1507 for (i=0; i<DATA_LENGTH; i+=4) {
1508 UChar32 start = DATA[i];
1509 UChar32 end = DATA[i+1];
1510 UChar32 xstart = DATA[i+2];
1511 UChar32 xend = DATA[i+3];
1512
1513 // Try various API using the test code points
1514
1515 UnicodeSet set(start, end);
1516 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1517 set, xstart, xend);
1518
1519 set.clear();
1520 set.set(start, end);
1521 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1522 set, xstart, xend);
1523
1524 UBool b = set.contains(start);
1525 b = set.contains(start, end);
1526 b = set.containsNone(start, end);
1527 b = set.containsSome(start, end);
1528 (void)b; // Suppress set but not used warning.
1529
1530 /*int32_t index = set.indexOf(start);*/
1531
1532 set.clear();
1533 set.add(start);
1534 set.add(start, end);
1535 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1536 set, xstart, xend);
1537
1538 set.set(0, 0x10FFFF);
1539 set.retain(start, end);
1540 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1541 set, xstart, xend);
1542 set.retain(start);
1543
1544 set.set(0, 0x10FFFF);
1545 set.remove(start);
1546 set.remove(start, end);
1547 set.complement();
1548 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1549 set, xstart, xend);
1550
1551 set.set(0, 0x10FFFF);
1552 set.complement(start, end);
1553 set.complement();
1554 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1555 set, xstart, xend);
1556 set.complement(start);
1557 }
1558
1559 const UChar32 DATA2[] = {
1560 0,
1561 0x10FFFF,
1562 (UChar32)-1,
1563 0x110000
1564 };
1565 const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1566
1567 for (i=0; i<DATA2_LENGTH; ++i) {
1568 UChar32 c = DATA2[i], end = 0x10FFFF;
1569 UBool valid = (c >= 0 && c <= 0x10FFFF);
1570
1571 UnicodeSet set(0, 0x10FFFF);
1572
1573 // For single-codepoint contains, invalid codepoints are NOT contained
1574 UBool b = set.contains(c);
1575 if (b == valid) {
1576 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1577 ") = " + b);
1578 } else {
1579 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1580 ") = " + b);
1581 }
1582
1583 // For codepoint range contains, containsNone, and containsSome,
1584 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1585 b = set.contains(c, end);
1586 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1587 "," + end + ") = " + b);
1588
1589 b = set.containsNone(c, end);
1590 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1591 "," + end + ") = " + b);
1592
1593 b = set.containsSome(c, end);
1594 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1595 "," + end + ") = " + b);
1596
1597 int32_t index = set.indexOf(c);
1598 if ((index >= 0) == valid) {
1599 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1600 ") = " + index);
1601 } else {
1602 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1603 ") = " + index);
1604 }
1605 }
1606 }
1607
1608 // Used by TestSymbolTable
1609 class TokenSymbolTable : public SymbolTable {
1610 public:
1611 Hashtable contents;
1612
TokenSymbolTable(UErrorCode & ec)1613 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1614 contents.setValueDeleter(uprv_deleteUObject);
1615 }
1616
~TokenSymbolTable()1617 ~TokenSymbolTable() {}
1618
1619 /**
1620 * (Non-SymbolTable API) Add the given variable and value to
1621 * the table. Variable should NOT contain leading '$'.
1622 */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1623 void add(const UnicodeString& var, const UnicodeString& value,
1624 UErrorCode& ec) {
1625 if (U_SUCCESS(ec)) {
1626 contents.put(var, new UnicodeString(value), ec);
1627 }
1628 }
1629
1630 /**
1631 * SymbolTable API
1632 */
lookup(const UnicodeString & s) const1633 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1634 return (const UnicodeString*) contents.get(s);
1635 }
1636
1637 /**
1638 * SymbolTable API
1639 */
lookupMatcher(UChar32) const1640 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1641 return NULL;
1642 }
1643
1644 /**
1645 * SymbolTable API
1646 */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1647 virtual UnicodeString parseReference(const UnicodeString& text,
1648 ParsePosition& pos, int32_t limit) const {
1649 int32_t start = pos.getIndex();
1650 int32_t i = start;
1651 UnicodeString result;
1652 while (i < limit) {
1653 UChar c = text.charAt(i);
1654 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1655 break;
1656 }
1657 ++i;
1658 }
1659 if (i == start) { // No valid name chars
1660 return result; // Indicate failure with empty string
1661 }
1662 pos.setIndex(i);
1663 text.extractBetween(start, i, result);
1664 return result;
1665 }
1666 };
1667
TestSymbolTable()1668 void UnicodeSetTest::TestSymbolTable() {
1669 // Multiple test cases can be set up here. Each test case
1670 // is terminated by null:
1671 // var, value, var, value,..., input pat., exp. output pat., null
1672 const char* DATA[] = {
1673 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1674 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1675 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1676 NULL
1677 };
1678
1679 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1680 UErrorCode ec = U_ZERO_ERROR;
1681 TokenSymbolTable sym(ec);
1682 if (U_FAILURE(ec)) {
1683 errln("FAIL: couldn't construct TokenSymbolTable");
1684 continue;
1685 }
1686
1687 // Set up variables
1688 while (DATA[i+2] != NULL) {
1689 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1690 if (U_FAILURE(ec)) {
1691 errln("FAIL: couldn't add to TokenSymbolTable");
1692 continue;
1693 }
1694 i += 2;
1695 }
1696
1697 // Input pattern and expected output pattern
1698 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1699 i += 2;
1700
1701 ParsePosition pos(0);
1702 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1703 if (U_FAILURE(ec)) {
1704 errln("FAIL: couldn't construct UnicodeSet");
1705 continue;
1706 }
1707
1708 // results
1709 if (pos.getIndex() != inpat.length()) {
1710 errln((UnicodeString)"Failed to read to end of string \""
1711 + inpat + "\": read to "
1712 + pos.getIndex() + ", length is "
1713 + inpat.length());
1714 }
1715
1716 UnicodeSet us2(exppat, ec);
1717 if (U_FAILURE(ec)) {
1718 errln("FAIL: couldn't construct expected UnicodeSet");
1719 continue;
1720 }
1721
1722 UnicodeString a, b;
1723 if (us != us2) {
1724 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1725 ", expected " + us2.toPattern(b, TRUE));
1726 } else {
1727 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1728 }
1729 }
1730 }
1731
TestSurrogate()1732 void UnicodeSetTest::TestSurrogate() {
1733 const char* DATA[] = {
1734 // These should all behave identically
1735 "[abc\\uD800\\uDC00]",
1736 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1737 "[abc\\U00010000]",
1738 0
1739 };
1740 for (int i=0; DATA[i] != 0; ++i) {
1741 UErrorCode ec = U_ZERO_ERROR;
1742 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1743 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1744 UnicodeSet set(str, ec);
1745 if (U_FAILURE(ec)) {
1746 errln("FAIL: UnicodeSet constructor");
1747 continue;
1748 }
1749 expectContainment(set,
1750 CharsToUnicodeString("abc\\U00010000"),
1751 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1752 if (set.size() != 4) {
1753 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1754 set.size() + ", expected 4");
1755 }
1756
1757 {
1758 UErrorCode subErr = U_ZERO_ERROR;
1759 checkRoundTrip(set);
1760 checkSerializeRoundTrip(set, subErr);
1761 }
1762 }
1763 }
1764
TestExhaustive()1765 void UnicodeSetTest::TestExhaustive() {
1766 // exhaustive tests. Simulate UnicodeSets with integers.
1767 // That gives us very solid tests (except for large memory tests).
1768
1769 int32_t limit = 128;
1770
1771 UnicodeSet x, y, z, aa;
1772
1773 for (int32_t i = 0; i < limit; ++i) {
1774 bitsToSet(i, x);
1775 logln((UnicodeString)"Testing " + i + ", " + x);
1776 _testComplement(i, x, y);
1777
1778 UnicodeSet &toTest = bitsToSet(i, aa);
1779
1780 // AS LONG AS WE ARE HERE, check roundtrip
1781 checkRoundTrip(toTest);
1782 UErrorCode ec = U_ZERO_ERROR;
1783 checkSerializeRoundTrip(toTest, ec);
1784
1785 for (int32_t j = 0; j < limit; ++j) {
1786 _testAdd(i,j, x,y,z);
1787 _testXor(i,j, x,y,z);
1788 _testRetain(i,j, x,y,z);
1789 _testRemove(i,j, x,y,z);
1790 }
1791 }
1792 }
1793
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1794 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1795 bitsToSet(a, x);
1796 z = x;
1797 z.complement();
1798 int32_t c = setToBits(z);
1799 if (c != (~a)) {
1800 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1801 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1802 }
1803 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1804 }
1805
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1806 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1807 bitsToSet(a, x);
1808 bitsToSet(b, y);
1809 z = x;
1810 z.addAll(y);
1811 int32_t c = setToBits(z);
1812 if (c != (a | b)) {
1813 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1814 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1815 }
1816 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1817 }
1818
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1819 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1820 bitsToSet(a, x);
1821 bitsToSet(b, y);
1822 z = x;
1823 z.retainAll(y);
1824 int32_t c = setToBits(z);
1825 if (c != (a & b)) {
1826 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1827 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1828 }
1829 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1830 }
1831
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1832 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1833 bitsToSet(a, x);
1834 bitsToSet(b, y);
1835 z = x;
1836 z.removeAll(y);
1837 int32_t c = setToBits(z);
1838 if (c != (a &~ b)) {
1839 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1840 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1841 }
1842 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1843 }
1844
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1845 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1846 bitsToSet(a, x);
1847 bitsToSet(b, y);
1848 z = x;
1849 z.complementAll(y);
1850 int32_t c = setToBits(z);
1851 if (c != (a ^ b)) {
1852 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1853 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1854 }
1855 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1856 }
1857
1858 /**
1859 * Check that ranges are monotonically increasing and non-
1860 * overlapping.
1861 */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1862 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1863 int32_t n = set.getRangeCount();
1864 if (n < 0) {
1865 errln((UnicodeString)"FAIL result of " + msg +
1866 ": range count should be >= 0 but is " +
1867 n /*+ " for " + set.toPattern())*/);
1868 return;
1869 }
1870 UChar32 last = 0;
1871 for (int32_t i=0; i<n; ++i) {
1872 UChar32 start = set.getRangeStart(i);
1873 UChar32 end = set.getRangeEnd(i);
1874 if (start > end) {
1875 errln((UnicodeString)"FAIL result of " + msg +
1876 ": range " + (i+1) +
1877 " start > end: " + (int)start + ", " + (int)end +
1878 " for " + set);
1879 }
1880 if (i > 0 && start <= last) {
1881 errln((UnicodeString)"FAIL result of " + msg +
1882 ": range " + (i+1) +
1883 " overlaps previous range: " + (int)start + ", " + (int)end +
1884 " for " + set);
1885 }
1886 last = end;
1887 }
1888 }
1889
1890 /**
1891 * Convert a bitmask to a UnicodeSet.
1892 */
bitsToSet(int32_t a,UnicodeSet & result)1893 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1894 result.clear();
1895 for (UChar32 i = 0; i < 32; ++i) {
1896 if ((a & (1<<i)) != 0) {
1897 result.add(i);
1898 }
1899 }
1900 return result;
1901 }
1902
1903 /**
1904 * Convert a UnicodeSet to a bitmask. Only the characters
1905 * U+0000 to U+0020 are represented in the bitmask.
1906 */
setToBits(const UnicodeSet & x)1907 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1908 int32_t result = 0;
1909 for (int32_t i = 0; i < 32; ++i) {
1910 if (x.contains((UChar32)i)) {
1911 result |= (1<<i);
1912 }
1913 }
1914 return result;
1915 }
1916
1917 /**
1918 * Return the representation of an inversion list based UnicodeSet
1919 * as a pairs list. Ranges are listed in ascending Unicode order.
1920 * For example, the set [a-zA-M3] is represented as "33AMaz".
1921 */
getPairs(const UnicodeSet & set)1922 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1923 UnicodeString pairs;
1924 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1925 UChar32 start = set.getRangeStart(i);
1926 UChar32 end = set.getRangeEnd(i);
1927 if (end > 0xFFFF) {
1928 end = 0xFFFF;
1929 i = set.getRangeCount(); // Should be unnecessary
1930 }
1931 pairs.append((UChar)start).append((UChar)end);
1932 }
1933 return pairs;
1934 }
1935
1936 /**
1937 * Basic consistency check for a few items.
1938 * That the iterator works, and that we can create a pattern and
1939 * get the same thing back
1940 */
checkRoundTrip(const UnicodeSet & s)1941 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1942 {
1943 UnicodeSet t(s);
1944 checkEqual(s, t, "copy ct");
1945 }
1946
1947 {
1948 UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten
1949 t = s;
1950 checkEqual(s, t, "operator=");
1951 }
1952
1953 {
1954 UnicodeSet t;
1955 copyWithIterator(t, s, FALSE);
1956 checkEqual(s, t, "iterator roundtrip");
1957 }
1958
1959 {
1960 UnicodeSet t;
1961 copyWithIterator(t, s, TRUE); // try range
1962 checkEqual(s, t, "iterator roundtrip");
1963 }
1964
1965 {
1966 UnicodeSet t;
1967 UnicodeString pat;
1968 UErrorCode ec = U_ZERO_ERROR;
1969 s.toPattern(pat, FALSE);
1970 t.applyPattern(pat, ec);
1971 if (U_FAILURE(ec)) {
1972 errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1973 return;
1974 } else {
1975 checkEqual(s, t, "toPattern(false)");
1976 }
1977 }
1978
1979 {
1980 UnicodeSet t;
1981 UnicodeString pat;
1982 UErrorCode ec = U_ZERO_ERROR;
1983 s.toPattern(pat, TRUE);
1984 t.applyPattern(pat, ec);
1985 if (U_FAILURE(ec)) {
1986 errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1987 return;
1988 } else {
1989 checkEqual(s, t, "toPattern(true)");
1990 }
1991 }
1992 }
1993
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)1994 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1995 if(U_FAILURE(status)) return;
1996 int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1997 if(status == U_BUFFER_OVERFLOW_ERROR) {
1998 status = U_ZERO_ERROR;
1999 serializeBuffer.resize(len);
2000 len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2001 // let 2nd error stand
2002 }
2003 if(U_FAILURE(status)) {
2004 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
2005 return;
2006 }
2007 UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
2008 if(U_FAILURE(status)) {
2009 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
2010 return;
2011 }
2012
2013 checkEqual(t, deserialized, "Set was unequal when deserialized");
2014 }
2015
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)2016 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
2017 t.clear();
2018 UnicodeSetIterator it(s);
2019 if (withRange) {
2020 while (it.nextRange()) {
2021 if (it.isString()) {
2022 t.add(it.getString());
2023 } else {
2024 t.add(it.getCodepoint(), it.getCodepointEnd());
2025 }
2026 }
2027 } else {
2028 while (it.next()) {
2029 if (it.isString()) {
2030 t.add(it.getString());
2031 } else {
2032 t.add(it.getCodepoint());
2033 }
2034 }
2035 }
2036 }
2037
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2038 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2039 assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2040 assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2041 UnicodeString source; s.toPattern(source, TRUE);
2042 UnicodeString result; t.toPattern(result, TRUE);
2043 if (s != t) {
2044 errln((UnicodeString)"FAIL: " + message
2045 + "; source = " + source
2046 + "; result = " + result
2047 );
2048 return FALSE;
2049 } else {
2050 logln((UnicodeString)"Ok: " + message
2051 + "; source = " + source
2052 + "; result = " + result
2053 );
2054 }
2055 return TRUE;
2056 }
2057
2058 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2059 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2060 const UnicodeString& charsIn,
2061 const UnicodeString& charsOut) {
2062 UErrorCode ec = U_ZERO_ERROR;
2063 UnicodeSet set(pat, ec);
2064 if (U_FAILURE(ec)) {
2065 dataerrln((UnicodeString)"FAIL: pattern \"" +
2066 pat + "\" => " + u_errorName(ec));
2067 return;
2068 }
2069 expectContainment(set, pat, charsIn, charsOut);
2070 }
2071
2072 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2073 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2074 const UnicodeString& charsIn,
2075 const UnicodeString& charsOut) {
2076 UnicodeString pat;
2077 set.toPattern(pat);
2078 expectContainment(set, pat, charsIn, charsOut);
2079 }
2080
2081 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2082 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2083 const UnicodeString& setName,
2084 const UnicodeString& charsIn,
2085 const UnicodeString& charsOut) {
2086 UnicodeString bad;
2087 UChar32 c;
2088 int32_t i;
2089
2090 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2091 c = charsIn.char32At(i);
2092 if (!set.contains(c)) {
2093 bad.append(c);
2094 }
2095 }
2096 if (bad.length() > 0) {
2097 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2098 ", expected containment of " + prettify(charsIn));
2099 } else {
2100 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2101 }
2102
2103 bad.truncate(0);
2104 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2105 c = charsOut.char32At(i);
2106 if (set.contains(c)) {
2107 bad.append(c);
2108 }
2109 }
2110 if (bad.length() > 0) {
2111 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2112 ", expected non-containment of " + prettify(charsOut));
2113 } else {
2114 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2115 }
2116 }
2117
2118 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2119 UnicodeSetTest::expectPattern(UnicodeSet& set,
2120 const UnicodeString& pattern,
2121 const UnicodeString& expectedPairs){
2122 UErrorCode status = U_ZERO_ERROR;
2123 set.applyPattern(pattern, status);
2124 if (U_FAILURE(status)) {
2125 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2126 "\") failed");
2127 return;
2128 } else {
2129 if (getPairs(set) != expectedPairs ) {
2130 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2131 "\") => pairs \"" +
2132 escape(getPairs(set)) + "\", expected \"" +
2133 escape(expectedPairs) + "\"");
2134 } else {
2135 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2136 "\") => pairs \"" +
2137 escape(getPairs(set)) + "\"");
2138 }
2139 }
2140 // the result of calling set.toPattern(), which is the string representation of
2141 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2142 // will produce another set that is equal to this one.
2143 UnicodeString temppattern;
2144 set.toPattern(temppattern);
2145 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2146 if (U_FAILURE(status)) {
2147 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2148 return;
2149 }
2150 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2151 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2152 escape(getPairs(set)) + "\""));
2153 } else{
2154 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2155 }
2156
2157 delete tempset;
2158
2159 }
2160
2161 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2162 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2163 if (getPairs(set) != expectedPairs) {
2164 errln(UnicodeString("FAIL: Expected pair list \"") +
2165 escape(expectedPairs) + "\", got \"" +
2166 escape(getPairs(set)) + "\"");
2167 }
2168 }
2169
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2170 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2171 const UnicodeString& expPat,
2172 const char** expStrings) {
2173 UnicodeString pat;
2174 set.toPattern(pat, TRUE);
2175 if (pat == expPat) {
2176 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2177 } else {
2178 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2179 return;
2180 }
2181 if (expStrings == NULL) {
2182 return;
2183 }
2184 UBool in = TRUE;
2185 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2186 if (expStrings[i] == NOT) { // sic; pointer comparison
2187 in = FALSE;
2188 continue;
2189 }
2190 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2191 UBool contained = set.contains(s);
2192 if (contained == in) {
2193 logln((UnicodeString)"Ok: " + expPat +
2194 (contained ? " contains {" : " does not contain {") +
2195 escape(expStrings[i]) + "}");
2196 } else {
2197 errln((UnicodeString)"FAIL: " + expPat +
2198 (contained ? " contains {" : " does not contain {") +
2199 escape(expStrings[i]) + "}");
2200 }
2201 }
2202 }
2203
toHexString(int32_t i)2204 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? u'0' : (u'A' - 10))); }
2205
2206 void
doAssert(UBool condition,const char * message)2207 UnicodeSetTest::doAssert(UBool condition, const char *message)
2208 {
2209 if (!condition) {
2210 errln(UnicodeString("ERROR : ") + message);
2211 }
2212 }
2213
2214 UnicodeString
escape(const UnicodeString & s)2215 UnicodeSetTest::escape(const UnicodeString& s) {
2216 UnicodeString buf;
2217 for (int32_t i=0; i<s.length(); )
2218 {
2219 UChar32 c = s.char32At(i);
2220 if (0x0020 <= c && c <= 0x007F) {
2221 buf += c;
2222 } else {
2223 if (c <= 0xFFFF) {
2224 buf += u"\\u";
2225 } else {
2226 buf += u"\\U";
2227 buf += toHexString((c & 0xF0000000) >> 28);
2228 buf += toHexString((c & 0x0F000000) >> 24);
2229 buf += toHexString((c & 0x00F00000) >> 20);
2230 buf += toHexString((c & 0x000F0000) >> 16);
2231 }
2232 buf += toHexString((c & 0xF000) >> 12);
2233 buf += toHexString((c & 0x0F00) >> 8);
2234 buf += toHexString((c & 0x00F0) >> 4);
2235 buf += toHexString(c & 0x000F);
2236 }
2237 i += U16_LENGTH(c);
2238 }
2239 return buf;
2240 }
2241
TestFreezable()2242 void UnicodeSetTest::TestFreezable() {
2243 UErrorCode errorCode=U_ZERO_ERROR;
2244 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2245 UnicodeSet idSet(idPattern, errorCode);
2246 if(U_FAILURE(errorCode)) {
2247 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2248 return;
2249 }
2250
2251 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2252 UnicodeSet wsSet(wsPattern, errorCode);
2253 if(U_FAILURE(errorCode)) {
2254 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2255 return;
2256 }
2257
2258 idSet.add(idPattern);
2259 UnicodeSet frozen(idSet);
2260 frozen.freeze();
2261
2262 if(idSet.isFrozen() || !frozen.isFrozen()) {
2263 errln("FAIL: isFrozen() is wrong");
2264 }
2265 if(frozen!=idSet || !(frozen==idSet)) {
2266 errln("FAIL: a copy-constructed frozen set differs from its original");
2267 }
2268
2269 frozen=wsSet;
2270 if(frozen!=idSet || !(frozen==idSet)) {
2271 errln("FAIL: a frozen set was modified by operator=");
2272 }
2273
2274 UnicodeSet frozen2(frozen);
2275 if(frozen2!=frozen || frozen2!=idSet) {
2276 errln("FAIL: a copied frozen set differs from its frozen original");
2277 }
2278 if(!frozen2.isFrozen()) {
2279 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2280 }
2281 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2282 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2283 errln("FAIL: UnicodeSet(5, 55) failed");
2284 }
2285 frozen3=frozen;
2286 if(!frozen3.isFrozen()) {
2287 errln("FAIL: copying a frozen set results in a thawed one");
2288 }
2289
2290 UnicodeSet *cloned=frozen.clone();
2291 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2292 errln("FAIL: clone() failed");
2293 }
2294 cloned->add(0xd802, 0xd805);
2295 if(cloned->containsSome(0xd802, 0xd805)) {
2296 errln("FAIL: unable to modify clone");
2297 }
2298 delete cloned;
2299
2300 UnicodeSet *thawed=frozen.cloneAsThawed();
2301 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2302 errln("FAIL: cloneAsThawed() failed");
2303 }
2304 thawed->add(0xd802, 0xd805);
2305 if(!thawed->contains(0xd802, 0xd805)) {
2306 errln("FAIL: unable to modify thawed clone");
2307 }
2308 delete thawed;
2309
2310 frozen.set(5, 55);
2311 if(frozen!=idSet || !(frozen==idSet)) {
2312 errln("FAIL: UnicodeSet::set() modified a frozen set");
2313 }
2314
2315 frozen.clear();
2316 if(frozen!=idSet || !(frozen==idSet)) {
2317 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2318 }
2319
2320 frozen.closeOver(USET_CASE_INSENSITIVE);
2321 if(frozen!=idSet || !(frozen==idSet)) {
2322 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2323 }
2324
2325 frozen.compact();
2326 if(frozen!=idSet || !(frozen==idSet)) {
2327 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2328 }
2329
2330 ParsePosition pos;
2331 frozen.
2332 applyPattern(wsPattern, errorCode).
2333 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2334 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2335 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2336 applyPropertyAlias(u"Assigned", UnicodeString(), errorCode);
2337 if(frozen!=idSet || !(frozen==idSet)) {
2338 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2339 }
2340
2341 frozen.
2342 add(0xd800).
2343 add(0xd802, 0xd805).
2344 add(wsPattern).
2345 addAll(idPattern).
2346 addAll(wsSet);
2347 if(frozen!=idSet || !(frozen==idSet)) {
2348 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2349 }
2350
2351 frozen.
2352 retain(0x62).
2353 retain(0x64, 0x69).
2354 retainAll(wsPattern).
2355 retainAll(wsSet);
2356 if(frozen!=idSet || !(frozen==idSet)) {
2357 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2358 }
2359
2360 frozen.
2361 remove(0x62).
2362 remove(0x64, 0x69).
2363 remove(idPattern).
2364 removeAll(idPattern).
2365 removeAll(idSet);
2366 if(frozen!=idSet || !(frozen==idSet)) {
2367 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2368 }
2369
2370 frozen.
2371 complement().
2372 complement(0x62).
2373 complement(0x64, 0x69).
2374 complement(idPattern).
2375 complementAll(idPattern).
2376 complementAll(idSet);
2377 if(frozen!=idSet || !(frozen==idSet)) {
2378 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2379 }
2380 }
2381
2382 // Test span() etc. -------------------------------------------------------- ***
2383
2384 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2385 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2386 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2387 UErrorCode errorCode=U_ZERO_ERROR;
2388 int32_t length8=0;
2389 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2390 if(U_SUCCESS(errorCode)) {
2391 return length8;
2392 } else {
2393 // The string contains an unpaired surrogate.
2394 // Ignore this string.
2395 return 0;
2396 }
2397 }
2398
2399 class UnicodeSetWithStringsIterator;
2400
2401 // Make the strings in a UnicodeSet easily accessible.
2402 class UnicodeSetWithStrings {
2403 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2404 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2405 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2406 int32_t size=set.size();
2407 if(size>0 && set.charAt(size-1)<0) {
2408 // If a set's last element is not a code point, then it must contain strings.
2409 // Iterate over the set, skip all code point ranges, and cache the strings.
2410 // Convert them to UTF-8 for spanUTF8().
2411 UnicodeSetIterator iter(set);
2412 const UnicodeString *s;
2413 char *s8=utf8;
2414 int32_t length8, utf8Count=0;
2415 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2416 if(iter.isString()) {
2417 // Store the pointer to the set's string element
2418 // which we happen to know is a stable pointer.
2419 strings[stringsLength]=s=&iter.getString();
2420 utf8Count+=
2421 utf8Lengths[stringsLength]=length8=
2422 appendUTF8(s->getBuffer(), s->length(),
2423 s8, (int32_t)(sizeof(utf8)-utf8Count));
2424 if(length8==0) {
2425 hasSurrogates=TRUE; // Contains unpaired surrogates.
2426 }
2427 s8+=length8;
2428 ++stringsLength;
2429 }
2430 }
2431 }
2432 }
2433
getSet() const2434 const UnicodeSet &getSet() const {
2435 return set;
2436 }
2437
hasStrings() const2438 UBool hasStrings() const {
2439 return (UBool)(stringsLength>0);
2440 }
2441
hasStringsWithSurrogates() const2442 UBool hasStringsWithSurrogates() const {
2443 return hasSurrogates;
2444 }
2445
2446 private:
2447 friend class UnicodeSetWithStringsIterator;
2448
2449 const UnicodeSet &set;
2450
2451 const UnicodeString *strings[20];
2452 int32_t stringsLength;
2453 UBool hasSurrogates;
2454
2455 char utf8[1024];
2456 int32_t utf8Lengths[20];
2457 };
2458
2459 class UnicodeSetWithStringsIterator {
2460 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2461 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2462 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2463 }
2464
reset()2465 void reset() {
2466 nextStringIndex=nextUTF8Start=0;
2467 }
2468
nextString()2469 const UnicodeString *nextString() {
2470 if(nextStringIndex<fSet.stringsLength) {
2471 return fSet.strings[nextStringIndex++];
2472 } else {
2473 return NULL;
2474 }
2475 }
2476
2477 // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2478 const char *nextUTF8(int32_t &length) {
2479 if(nextStringIndex<fSet.stringsLength) {
2480 const char *s8=fSet.utf8+nextUTF8Start;
2481 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2482 return s8;
2483 } else {
2484 length=0;
2485 return NULL;
2486 }
2487 }
2488
2489 private:
2490 const UnicodeSetWithStrings &fSet;
2491 int32_t nextStringIndex;
2492 int32_t nextUTF8Start;
2493 };
2494
2495 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2496 // at code point boundaries.
2497 // That is, each edge of a match must not be in the middle of a surrogate pair.
2498 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2499 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2500 s+=start;
2501 limit-=start;
2502 int32_t length=t.length();
2503 return 0==t.compare(s, length) &&
2504 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2505 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2506 }
2507
// Implement span() with contains() for comparison.
//
// Returns the length, in UTF-16 code units, of the span at the start of
// s[0..length) under spanCondition. It uses only contains() on the wrapped
// set plus explicit matching of the cached set strings.
// - Set without cached strings: walks code point by code point and stops at
//   the first one whose contains() result differs from the pinned condition.
// - USET_SPAN_NOT_CONTAINED: stops at the first position where a contained
//   code point begins or where any cached string matches.
// - USET_SPAN_CONTAINED / USET_SPAN_SIMPLE: advances over contained code
//   points and matching strings. CONTAINED iterates with the shortest match
//   and recurses for the other matches; SIMPLE takes the longest match at
//   each position.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point, so that the index before the
        // first mismatching code point is returned.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not contained; also stop if a string matches here.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: furthest span end found by the recursive calls below.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return whichever reaches further: the iteration or the best recursion.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2606
// Backward counterpart of containsSpanUTF16(): computes the span that ends at
// the end of s[0..length) under spanCondition, using contains() on the
// wrapped set plus explicit matching of the cached set strings.
// Returns the start index of that span (0 for an empty input).
// Note that the length parameter is reused as the moving "current position"
// while iterating backward; length0 keeps the original length where needed.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        // U16_PREV moves length back by one code point; prev follows it only
        // after the code point was accepted.
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not contained; also stop if a string ends at prev.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: earliest span start found by the recursive calls below.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return whichever reaches further back: the iteration or the best recursion.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2705
// UTF-8 counterpart of containsSpanUTF16(): returns the length, in bytes, of
// the span at the start of s[0..length) under spanCondition, using contains()
// on the wrapped set plus byte-wise comparison (memcmp) against the cached
// UTF-8 forms of the set strings.
// Code points are read with U8_NEXT_OR_FFFD. Cached strings whose UTF-8
// length is 0 are skipped (the length8!=0 checks).
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point, so that the index before the
        // first mismatching code point is returned.
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not contained; also stop if a string matches here.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: furthest span end found by the recursive calls below.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return whichever reaches further: the iteration or the best recursion.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2805
// Backward UTF-8 counterpart of containsSpanUTF8(): computes the span that
// ends at the end of s[0..length) under spanCondition, using contains() on
// the wrapped set plus byte-wise comparison (memcmp) against the cached UTF-8
// forms of the set strings. Returns the start index (in bytes) of that span,
// 0 for an empty input.
// Code points are read backward with U8_PREV_OR_FFFD. The length parameter is
// reused as the moving "current position" while iterating backward.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        // U8_PREV_OR_FFFD moves length back by one code point; prev follows it
        // only after the code point was accepted.
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not contained; also stop if a string ends at prev.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: earliest span start found by the recursive calls below.
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return whichever reaches further back: the iteration or the best recursion.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2906
// spans to be performed and compared
// Bit flags, combined into a "whichSpans" mask. Each group consists of two
// single-bit options followed by the mask that covers both of them.
enum {
    // String encoding (presumably which string form is spanned; the uses are
    // outside this part of the file).
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,     // SPAN_UTF16|SPAN_UTF8

    // Set polarity (presumably the original set versus its complement).
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,   // SPAN_SET|SPAN_COMPLEMENT

    // Direction: forward span types are filtered by SPAN_FWD,
    // backward ones by SPAN_BACK.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,  // SPAN_FWD|SPAN_BACK

    // Span condition: selects USET_SPAN_CONTAINED and/or USET_SPAN_SIMPLE.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300, // SPAN_CONTAINED|SPAN_SIMPLE

    SPAN_ALL            =0x33f  // all of the bits above
};
2927
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2928 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2929 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2930 }
2931
slen(const void * s,UBool isUTF16)2932 static inline int32_t slen(const void *s, UBool isUTF16) {
2933 return isUTF16 ? u_strlen((const UChar *)s) : static_cast<int32_t>(strlen((const char *)s));
2934 }
2935
2936 /*
2937 * Count spans on a string with the method according to type and set the span limits.
2938 * The set may be the complement of the original.
2939 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2940 * according to the expected number of spans.
2941 * Sets typeName to an empty string if there is no such type.
2942 * Returns -1 if the span option is filtered out.
2943 */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2944 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2945 const void *s, int32_t length, UBool isUTF16,
2946 uint32_t whichSpans,
2947 int type, const char *&typeName,
2948 int32_t limits[], int32_t limitsCapacity,
2949 int32_t expectCount) {
2950 const UnicodeSet &realSet(set.getSet());
2951 int32_t start, count;
2952 USetSpanCondition spanCondition, firstSpanCondition, contained;
2953 UBool isForward;
2954
2955 if(type<0 || 7<type) {
2956 typeName="";
2957 return 0;
2958 }
2959
2960 static const char *const typeNames16[]={
2961 "contains", "contains(LM)",
2962 "span", "span(LM)",
2963 "containsBack", "containsBack(LM)",
2964 "spanBack", "spanBack(LM)"
2965 };
2966
2967 static const char *const typeNames8[]={
2968 "containsUTF8", "containsUTF8(LM)",
2969 "spanUTF8", "spanUTF8(LM)",
2970 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2971 "spanBackUTF8", "spanBackUTF8(LM)"
2972 };
2973
2974 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2975
2976 // filter span options
2977 if(type<=3) {
2978 // span forward
2979 if((whichSpans&SPAN_FWD)==0) {
2980 return -1;
2981 }
2982 isForward=TRUE;
2983 } else {
2984 // span backward
2985 if((whichSpans&SPAN_BACK)==0) {
2986 return -1;
2987 }
2988 isForward=FALSE;
2989 }
2990 if((type&1)==0) {
2991 // use USET_SPAN_CONTAINED
2992 if((whichSpans&SPAN_CONTAINED)==0) {
2993 return -1;
2994 }
2995 contained=USET_SPAN_CONTAINED;
2996 } else {
2997 // use USET_SPAN_SIMPLE
2998 if((whichSpans&SPAN_SIMPLE)==0) {
2999 return -1;
3000 }
3001 contained=USET_SPAN_SIMPLE;
3002 }
3003
3004 // Default first span condition for going forward with an uncomplemented set.
3005 spanCondition=USET_SPAN_NOT_CONTAINED;
3006 if(isComplement) {
3007 spanCondition=invertSpanCondition(spanCondition, contained);
3008 }
3009
3010 // First span condition for span(), used to terminate the spanBack() iteration.
3011 firstSpanCondition=spanCondition;
3012
3013 // spanBack(): Its initial span condition is span()'s last span condition,
3014 // which is the opposite of span()'s first span condition
3015 // if we expect an even number of spans.
3016 // (The loop inverts spanCondition (expectCount-1) times
3017 // before the expectCount'th span() call.)
3018 // If we do not compare forward and backward directions, then we do not have an
3019 // expectCount and just start with firstSpanCondition.
3020 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
3021 spanCondition=invertSpanCondition(spanCondition, contained);
3022 }
3023
3024 count=0;
3025 switch(type) {
3026 case 0:
3027 case 1:
3028 start=0;
3029 if(length<0) {
3030 length=slen(s, isUTF16);
3031 }
3032 for(;;) {
3033 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
3034 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
3035 if(count<limitsCapacity) {
3036 limits[count]=start;
3037 }
3038 ++count;
3039 if(start>=length) {
3040 break;
3041 }
3042 spanCondition=invertSpanCondition(spanCondition, contained);
3043 }
3044 break;
3045 case 2:
3046 case 3:
3047 start=0;
3048 for(;;) {
3049 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3050 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3051 if(count<limitsCapacity) {
3052 limits[count]=start;
3053 }
3054 ++count;
3055 if(length>=0 ? start>=length :
3056 isUTF16 ? ((const UChar *)s)[start]==0 :
3057 ((const char *)s)[start]==0
3058 ) {
3059 break;
3060 }
3061 spanCondition=invertSpanCondition(spanCondition, contained);
3062 }
3063 break;
3064 case 4:
3065 case 5:
3066 if(length<0) {
3067 length=slen(s, isUTF16);
3068 }
3069 for(;;) {
3070 ++count;
3071 if(count<=limitsCapacity) {
3072 limits[limitsCapacity-count]=length;
3073 }
3074 length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3075 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3076 if(length==0 && spanCondition==firstSpanCondition) {
3077 break;
3078 }
3079 spanCondition=invertSpanCondition(spanCondition, contained);
3080 }
3081 if(count<limitsCapacity) {
3082 memmove(limits, limits+(limitsCapacity-count), count*4);
3083 }
3084 break;
3085 case 6:
3086 case 7:
3087 for(;;) {
3088 ++count;
3089 if(count<=limitsCapacity) {
3090 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3091 }
3092 // Note: Length<0 is tested only for the first spanBack().
3093 // If we wanted to keep length<0 for all spanBack()s, we would have to
3094 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3095 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3096 realSet.spanBackUTF8((const char *)s, length, spanCondition);
3097 if(length==0 && spanCondition==firstSpanCondition) {
3098 break;
3099 }
3100 spanCondition=invertSpanCondition(spanCondition, contained);
3101 }
3102 if(count<limitsCapacity) {
3103 memmove(limits, limits+(limitsCapacity-count), count*4);
3104 }
3105 break;
3106 default:
3107 typeName="";
3108 return -1;
3109 }
3110
3111 return count;
3112 }
3113
// Indexes of the sets to be tested.
// An odd index means that the set is a complement (isComplement).
enum {
    SLOW=0,
    SLOW_NOT=1,
    FAST=2,
    FAST_NOT=3,
    SET_COUNT=4
};

// Names of the sets for failure messages, in the order of the indexes above.
static const char *const setNames[SET_COUNT]={
    "slow", "slow.not", "fast", "fast.not"
};
3129
3130 /*
3131 * Verify that we get the same results whether we look at text with contains(),
3132 * span() or spanBack(), using unfrozen or frozen versions of the set,
3133 * and using the set or its complement (switching the spanConditions accordingly).
3134 * The latter verifies that
3135 * set.span(spanCondition) == set.complement().span(!spanCondition).
3136 *
3137 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3138 * or returned to the caller (with an input expectCount<0).
3139 */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3140 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3141 const void *s, int32_t length, UBool isUTF16,
3142 uint32_t whichSpans,
3143 int32_t expectLimits[], int32_t &expectCount,
3144 const char *testName, int32_t index) {
3145 int32_t limits[500];
3146 int32_t limitsCount;
3147 int i, j;
3148
3149 const char *typeName;
3150 int type;
3151
3152 for(i=0; i<SET_COUNT; ++i) {
3153 if((i&1)==0) {
3154 // Even-numbered sets are original, uncomplemented sets.
3155 if((whichSpans&SPAN_SET)==0) {
3156 continue;
3157 }
3158 } else {
3159 // Odd-numbered sets are complemented.
3160 if((whichSpans&SPAN_COMPLEMENT)==0) {
3161 continue;
3162 }
3163 }
3164 for(type=0;; ++type) {
3165 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3166 s, length, isUTF16,
3167 whichSpans,
3168 type, typeName,
3169 limits, UPRV_LENGTHOF(limits), expectCount);
3170 if(typeName[0]==0) {
3171 break; // All types tried.
3172 }
3173 if(limitsCount<0) {
3174 continue; // Span option filtered out.
3175 }
3176 if(expectCount<0) {
3177 expectCount=limitsCount;
3178 if(limitsCount>UPRV_LENGTHOF(limits)) {
3179 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3180 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3181 return;
3182 }
3183 memcpy(expectLimits, limits, limitsCount*4);
3184 } else if(limitsCount!=expectCount) {
3185 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3186 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3187 } else {
3188 for(j=0; j<limitsCount; ++j) {
3189 if(limits[j]!=expectLimits[j]) {
3190 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3191 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3192 j, (long)limits[j], (long)expectLimits[j]);
3193 break;
3194 }
3195 }
3196 }
3197 }
3198 }
3199
3200 // Compare span() with containsAll()/containsNone(),
3201 // but only if we have expectLimits[] from the uncomplemented set.
3202 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3203 const UChar *s16=(const UChar *)s;
3204 UnicodeString string;
3205 int32_t prev=0, limit, length;
3206 for(i=0; i<expectCount; ++i) {
3207 limit=expectLimits[i];
3208 length=limit-prev;
3209 if(length>0) {
3210 string.setTo(FALSE, s16+prev, length); // read-only alias
3211 if(i&1) {
3212 if(!sets[SLOW]->getSet().containsAll(string)) {
3213 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3214 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3215 return;
3216 }
3217 if(!sets[FAST]->getSet().containsAll(string)) {
3218 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3219 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3220 return;
3221 }
3222 } else {
3223 if(!sets[SLOW]->getSet().containsNone(string)) {
3224 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3225 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3226 return;
3227 }
3228 if(!sets[FAST]->getSet().containsNone(string)) {
3229 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3230 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3231 return;
3232 }
3233 }
3234 }
3235 prev=limit;
3236 }
3237 }
3238 }
3239
3240 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3241 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3242 const void *s, int32_t length, UBool isUTF16,
3243 uint32_t whichSpans,
3244 const char *testName, int32_t index) {
3245 int32_t expectLimits[500];
3246 int32_t expectCount=-1;
3247 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3248 }
3249
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3250 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3251 UChar c, c2;
3252
3253 if(length>=0) {
3254 while(length>0) {
3255 c=*s++;
3256 --length;
3257 if(0xd800<=c && c<0xe000) {
3258 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3259 return TRUE;
3260 }
3261 --length;
3262 }
3263 }
3264 } else {
3265 while((c=*s++)!=0) {
3266 if(0xd800<=c && c<0xe000) {
3267 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3268 return TRUE;
3269 }
3270 }
3271 }
3272 }
3273 return FALSE;
3274 }
3275
3276 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3277 // unless either UTF is turned off in whichSpans.
3278 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3279 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3280 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3281 const UChar *s16, int32_t length16,
3282 uint32_t whichSpans,
3283 const char *testName, int32_t index) {
3284 int32_t expectLimits[500];
3285 int32_t expectCount;
3286
3287 expectCount=-1; // Get expectLimits[] from testSpan().
3288
3289 if((whichSpans&SPAN_UTF16)!=0) {
3290 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3291 }
3292 if((whichSpans&SPAN_UTF8)==0) {
3293 return;
3294 }
3295
3296 // Convert s16[] and expectLimits[] to UTF-8.
3297 uint8_t s8[3000];
3298 int32_t offsets[3000];
3299
3300 const UChar *s16Limit=s16+length16;
3301 char *t=(char *)s8;
3302 char *tLimit=t+sizeof(s8);
3303 int32_t *o=offsets;
3304 UErrorCode errorCode=U_ZERO_ERROR;
3305
3306 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3307 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3308 if(U_FAILURE(errorCode)) {
3309 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3310 testName, (long)index, u_errorName(errorCode));
3311 ucnv_resetFromUnicode(utf8Cnv);
3312 return;
3313 }
3314 int32_t length8=(int32_t)(t-(char *)s8);
3315
3316 // Convert expectLimits[].
3317 int32_t i, j, expect;
3318 for(i=j=0; i<expectCount; ++i) {
3319 expect=expectLimits[i];
3320 if(expect==length16) {
3321 expectLimits[i]=length8;
3322 } else {
3323 while(offsets[j]<expect) {
3324 ++j;
3325 }
3326 expectLimits[i]=j;
3327 }
3328 }
3329
3330 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3331 }
3332
nextCodePoint(UChar32 c)3333 static UChar32 nextCodePoint(UChar32 c) {
3334 // Skip some large and boring ranges.
3335 switch(c) {
3336 case 0x3441:
3337 return 0x4d7f;
3338 case 0x5100:
3339 return 0x9f00;
3340 case 0xb040:
3341 return 0xd780;
3342 case 0xe041:
3343 return 0xf8fe;
3344 case 0x10100:
3345 return 0x20000;
3346 case 0x20041:
3347 return 0xe0000;
3348 case 0xe0101:
3349 return 0x10fffd;
3350 default:
3351 return c+1;
3352 }
3353 }
3354
3355 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3356 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3357 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3358 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3359 // Skip the UTF-8 part of the test - if the string contains surrogates -
3360 // because it is likely to produce a different result.
3361 UBool inconsistentSurrogates=
3362 (!(sets[0]->getSet().contains(0xfffd) ?
3363 sets[0]->getSet().contains(0xd800, 0xdfff) :
3364 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3365 sets[0]->hasStringsWithSurrogates());
3366
3367 UChar s[1000];
3368 int32_t length=0;
3369 uint32_t localWhichSpans;
3370
3371 UChar32 c, first;
3372 for(first=c=0;; c=nextCodePoint(c)) {
3373 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3374 localWhichSpans=whichSpans;
3375 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3376 localWhichSpans&=~SPAN_UTF8;
3377 }
3378 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3379 if(c>0x10ffff) {
3380 break;
3381 }
3382 length=0;
3383 first=c;
3384 }
3385 U16_APPEND_UNSAFE(s, length, c);
3386 }
3387 }
3388
3389 // Test with a particular, interesting string.
3390 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3391 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3392 static const UChar s[]={
3393 0x61, 0x62, 0x20, // Latin, space
3394 0x3b1, 0x3b2, 0x3b3, // Greek
3395 0xd900, // lead surrogate
3396 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3397 0xdc05, // trail surrogate
3398 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3399 0xd900, 0xdc05, // unassigned supplementary
3400 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3401 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3402 0 // NUL
3403 };
3404
3405 if((whichSpans&SPAN_UTF16)==0) {
3406 return;
3407 }
3408 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3409 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3410 }
3411
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3412 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3413 static const char s[]={
3414 "abc" // Latin
3415
3416 /* trail byte in lead position */
3417 "\x80"
3418
3419 " " // space
3420
3421 /* truncated multi-byte sequences */
3422 "\xd0"
3423 "\xe0"
3424 "\xe1"
3425 "\xed"
3426 "\xee"
3427 "\xf0"
3428 "\xf1"
3429 "\xf4"
3430 "\xf8"
3431 "\xfc"
3432
3433 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3434
3435 /* trail byte in lead position */
3436 "\x80"
3437
3438 "\xe0\x80"
3439 "\xe0\xa0"
3440 "\xe1\x80"
3441 "\xed\x80"
3442 "\xed\xa0"
3443 "\xee\x80"
3444 "\xf0\x80"
3445 "\xf0\x90"
3446 "\xf1\x80"
3447 "\xf4\x80"
3448 "\xf4\x90"
3449 "\xf8\x80"
3450 "\xfc\x80"
3451
3452 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3453
3454 /* trail byte in lead position */
3455 "\x80"
3456
3457 "\xf0\x80\x80"
3458 "\xf0\x90\x80"
3459 "\xf1\x80\x80"
3460 "\xf4\x80\x80"
3461 "\xf4\x90\x80"
3462 "\xf8\x80\x80"
3463 "\xfc\x80\x80"
3464
3465 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3466
3467 /* trail byte in lead position */
3468 "\x80"
3469
3470 "\xf8\x80\x80\x80"
3471 "\xfc\x80\x80\x80"
3472
3473 "\xF1\x90\x80\x85" // unassigned supplementary
3474
3475 /* trail byte in lead position */
3476 "\x80"
3477
3478 "\xfc\x80\x80\x80\x80"
3479
3480 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3481
3482 /* trail byte in lead position */
3483 "\x80"
3484
3485 /* complete sequences but non-shortest forms or out of range etc. */
3486 "\xc0\x80"
3487 "\xe0\x80\x80"
3488 "\xed\xa0\x80"
3489 "\xf0\x80\x80\x80"
3490 "\xf4\x90\x80\x80"
3491 "\xf8\x80\x80\x80\x80"
3492 "\xfc\x80\x80\x80\x80\x80"
3493 "\xfe"
3494 "\xff"
3495
3496 /* trail byte in lead position */
3497 "\x80"
3498
3499 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3500 };
3501
3502 if((whichSpans&SPAN_UTF8)==0) {
3503 return;
3504 }
3505 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3506 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3507 }
3508
// Multiplies a list of span option sets so that each resulting entry
// has exactly one of the alternative options a, b and c.
// Each of the first whichSpansCount entries is reduced with mask; the entry itself then gets a,
// a copy in the second group of whichSpansCount entries gets b,
// and a copy in the third group gets c.
// If b==0, then the entries are just modified with mask and a (c is ignored).
// If b!=0 and c==0, then only the first two groups are written.
// The caller must provide enough room in whichSpans[] for all groups.
// Returns the new number of entries.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // Number of groups of entries after this call.
    int32_t groups;
    if(b==0) {
        groups=1;
    } else if(c==0) {
        groups=2;
    } else {
        groups=3;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(groups>=2) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(groups>=3) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return groups*whichSpansCount;
}
3531
3532 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3533 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3534 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3535 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3536
TestSpan()3537 void UnicodeSetTest::TestSpan() {
3538 // "[...]" is a UnicodeSet pattern.
3539 // "*" performs tests on all Unicode code points and on a selection of
3540 // malformed UTF-8/16 strings.
3541 // "-options" limits the scope of testing for the current set.
3542 // By default, the test verifies that equivalent boundaries are found
3543 // for UTF-16 and UTF-8, going forward and backward,
3544 // alternating USET_SPAN_NOT_CONTAINED with
3545 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3546 // Single-character options:
3547 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3548 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3549 // or the set contains strings with unpaired surrogates
3550 // which do not translate to valid UTF-8.
3551 // c -- set.span() and set.complement().span() boundaries may differ.
3552 // Cause: Set strings are not complemented.
3553 // b -- span() and spanBack() boundaries may differ.
3554 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3555 // and spanBack(USET_SPAN_SIMPLE) are defined to
3556 // match with non-overlapping substrings.
3557 // For example, with a set containing "ab" and "ba",
3558 // span() of "aba" yields boundaries { 0, 2, 3 }
3559 // because the initial "ab" matches from 0 to 2,
3560 // while spanBack() yields boundaries { 0, 1, 3 }
3561 // because the final "ba" matches from 1 to 3.
3562 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3563 // Cause: Strings in the set overlap, and a longer match may
3564 // require a sequence including non-longest substrings.
3565 // For example, with a set containing "ab", "abc" and "cd",
3566 // span(contained) of "abcd" spans the entire string
3567 // but span(longest match) only spans the first 3 characters.
3568 // Each "-options" first resets all options and then applies the specified options.
3569 // A "-" without options resets the options.
3570 // The options are also reset for each new set.
3571 // Other strings will be spanned.
3572 static const char *const testdata[]={
3573 "[:ID_Continue:]",
3574 "*",
3575 "[:White_Space:]",
3576 "*",
3577 "[]",
3578 "*",
3579 "[\\u0000-\\U0010FFFF]",
3580 "*",
3581 "[\\u0000\\u0080\\u0800\\U00010000]",
3582 "*",
3583 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3584 "*",
3585 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3586 "-c",
3587 "*",
3588 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3589 "-c",
3590 "*",
3591
3592 // Overlapping strings cause overlapping attempts to match.
3593 "[x{xy}{xya}{axy}{ax}]",
3594 "-cl",
3595
3596 // More repetitions of "xya" would take too long with the recursive
3597 // reference implementation.
3598 // containsAll()=FALSE
3599 // test_string 0x14
3600 "xx"
3601 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3602 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3603 "xyaxyaxyaxya"
3604 "xx"
3605 "xyaxyaxyaxya" // span() ends here.
3606 "aaa",
3607
3608 // containsAll()=TRUE
3609 // test_string 0x15
3610 "xx"
3611 "xyaxyaxyaxya"
3612 "xx"
3613 "xyaxyaxyaxya"
3614 "xx"
3615 "xyaxyaxyaxy",
3616
3617 "-bc",
3618 // test_string 0x17
3619 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3620 "-c",
3621 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3622 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3623 "-",
3624 "byaya", // span() -> { 5 }
3625 "byay", // span() -> { 4 }
3626 "bya", // span() -> { 3 }
3627
3628 // span(longest match) will not span the whole string.
3629 "[a{ab}{bc}]",
3630 "-cl",
3631 // test_string 0x21
3632 "abc",
3633
3634 "[a{ab}{abc}{cd}]",
3635 "-cl",
3636 "acdabcdabccd",
3637
3638 // spanBack(longest match) will not span the whole string.
3639 "[c{ab}{bc}]",
3640 "-cl",
3641 "abc",
3642
3643 "[d{cd}{bcd}{ab}]",
3644 "-cl",
3645 "abbcdabcdabd",
3646
3647 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3648 // and UTF-8 trail bytes.
3649 // Copies of above test sets and strings, but transliterated to have
3650 // different code points with similar trail units.
3651 // Previous: a b c d
3652 // Unicode: 042B 30AB 200AB 204AB
3653 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3654 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3655 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3656 "-cl",
3657 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3658
3659 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3660 "-cl",
3661 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3662
3663 // Stress bookkeeping and recursion.
3664 // The following strings are barely doable with the recursive
3665 // reference implementation.
3666 // The not-contained character at the end prevents an early exit from the span().
3667 "[b{bb}]",
3668 "-c",
3669 // test_string 0x33
3670 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3671 // On complement sets, span() and spanBack() get different results
3672 // because b is not in the complement set and there is an odd number of b's
3673 // in the test string.
3674 "-bc",
3675 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3676
3677 // Test with set strings with an initial or final code point span
3678 // longer than 254.
3679 "[a{" _64_a _64_a _64_a _64_a "b}"
3680 "{a" _64_b _64_b _64_b _64_b "}]",
3681 "-c",
3682 _64_a _64_a _64_a _63_a "b",
3683 _64_a _64_a _64_a _64_a "b",
3684 _64_a _64_a _64_a _64_a "aaaabbbb",
3685 "a" _64_b _64_b _64_b _63_b,
3686 "a" _64_b _64_b _64_b _64_b,
3687 "aaaabbbb" _64_b _64_b _64_b _64_b,
3688
3689 // Test with strings containing unpaired surrogates.
3690 // They are not representable in UTF-8, and a leading trail surrogate
3691 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3692 // U+20001 == \\uD840\\uDC01
3693 // U+20400 == \\uD841\\uDC00
3694 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3695 "-8cl",
3696 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3697 };
3698 uint32_t whichSpans[96]={ SPAN_ALL };
3699 int32_t whichSpansCount=1;
3700
3701 UnicodeSet *sets[SET_COUNT]={ NULL };
3702 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3703
3704 char testName[1024];
3705 char *testNameLimit=testName;
3706
3707 int32_t i, j;
3708 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3709 const char *s=testdata[i];
3710 if(s[0]=='[') {
3711 // Create new test sets from this pattern.
3712 for(j=0; j<SET_COUNT; ++j) {
3713 delete sets_with_str[j];
3714 delete sets[j];
3715 }
3716 UErrorCode errorCode=U_ZERO_ERROR;
3717 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3718 if(U_FAILURE(errorCode)) {
3719 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3720 break;
3721 }
3722 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3723 sets[SLOW_NOT]->complement();
3724 // Intermediate set: Test cloning of a frozen set.
3725 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3726 fast->freeze();
3727 sets[FAST]=fast->clone();
3728 delete fast;
3729 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3730 fastNot->freeze();
3731 sets[FAST_NOT]=fastNot->clone();
3732 delete fastNot;
3733
3734 for(j=0; j<SET_COUNT; ++j) {
3735 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3736 }
3737
3738 strcpy(testName, s);
3739 testNameLimit=strchr(testName, 0);
3740 *testNameLimit++=':';
3741 *testNameLimit=0;
3742
3743 whichSpans[0]=SPAN_ALL;
3744 whichSpansCount=1;
3745 } else if(s[0]=='-') {
3746 whichSpans[0]=SPAN_ALL;
3747 whichSpansCount=1;
3748
3749 while(*++s!=0) {
3750 switch(*s) {
3751 case 'c':
3752 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3753 ~SPAN_POLARITY,
3754 SPAN_SET,
3755 SPAN_COMPLEMENT,
3756 0);
3757 break;
3758 case 'b':
3759 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3760 ~SPAN_DIRS,
3761 SPAN_FWD,
3762 SPAN_BACK,
3763 0);
3764 break;
3765 case 'l':
3766 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3767 // USET_SPAN_SIMPLE only FWD, and separately
3768 // USET_SPAN_SIMPLE only BACK
3769 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3770 ~(SPAN_DIRS|SPAN_CONDITION),
3771 SPAN_DIRS|SPAN_CONTAINED,
3772 SPAN_FWD|SPAN_SIMPLE,
3773 SPAN_BACK|SPAN_SIMPLE);
3774 break;
3775 case '8':
3776 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3777 ~SPAN_UTFS,
3778 SPAN_UTF16,
3779 SPAN_UTF8,
3780 0);
3781 break;
3782 default:
3783 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3784 break;
3785 }
3786 }
3787 } else if(0==strcmp(s, "*")) {
3788 strcpy(testNameLimit, "bad_string");
3789 for(j=0; j<whichSpansCount; ++j) {
3790 if(whichSpansCount>1) {
3791 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3792 "%%0x%3x",
3793 whichSpans[j]);
3794 }
3795 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3796 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3797 }
3798
3799 strcpy(testNameLimit, "contents");
3800 for(j=0; j<whichSpansCount; ++j) {
3801 if(whichSpansCount>1) {
3802 sprintf(testNameLimit+8 /* strlen("contents") */,
3803 "%%0x%3x",
3804 whichSpans[j]);
3805 }
3806 testSpanContents(sets_with_str, whichSpans[j], testName);
3807 }
3808 } else {
3809 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3810 strcpy(testNameLimit, "test_string");
3811 for(j=0; j<whichSpansCount; ++j) {
3812 if(whichSpansCount>1) {
3813 sprintf(testNameLimit+11 /* strlen("test_string") */,
3814 "%%0x%3x",
3815 whichSpans[j]);
3816 }
3817 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3818 }
3819 }
3820 }
3821 for(j=0; j<SET_COUNT; ++j) {
3822 delete sets_with_str[j];
3823 delete sets[j];
3824 }
3825 }
3826
3827 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3828 void UnicodeSetTest::TestStringSpan() {
3829 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3830 static const char *const string=
3831 "xx"
3832 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3833 "xx"
3834 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3835 "xx"
3836 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3837 "aaaa";
3838
3839 UErrorCode errorCode=U_ZERO_ERROR;
3840 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3841 UnicodeSet set(pattern16, errorCode);
3842 if(U_FAILURE(errorCode)) {
3843 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3844 return;
3845 }
3846
3847 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3848
3849 if(set.containsAll(string16)) {
3850 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3851 }
3852
3853 // Remove trailing "aaaa".
3854 string16.truncate(string16.length()-4);
3855 if(!set.containsAll(string16)) {
3856 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3857 }
3858
3859 string16=u"byayaxya";
3860 const UChar *s16=string16.getBuffer();
3861 int32_t length16=string16.length();
3862 (void)length16; // Suppress set but not used warning.
3863 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3864 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3865 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3866 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3867 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3868 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3869 ) {
3870 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3871 }
3872
3873 pattern="[a{ab}{abc}{cd}]";
3874 pattern16=UnicodeString(pattern, -1, US_INV);
3875 set.applyPattern(pattern16, errorCode);
3876 if(U_FAILURE(errorCode)) {
3877 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3878 return;
3879 }
3880 string16=u"acdabcdabccd";
3881 s16=string16.getBuffer();
3882 length16=string16.length();
3883 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3884 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3885 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3886 ) {
3887 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3888 }
3889
3890 pattern="[d{cd}{bcd}{ab}]";
3891 pattern16=UnicodeString(pattern, -1, US_INV);
3892 set.applyPattern(pattern16, errorCode).freeze();
3893 if(U_FAILURE(errorCode)) {
3894 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3895 return;
3896 }
3897 string16=u"abbcdabcdabd";
3898 s16=string16.getBuffer();
3899 length16=string16.length();
3900 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3901 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3902 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3903 ) {
3904 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3905 }
3906 }
3907
3908 /**
3909 * Including collationroot.h fails here with
3910 1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
3911 * .. so, we skip this test on Windows.
3912 *
3913 * the cause is that intltest builds with /Za which disables language extensions - which means
3914 * windows header files can't be used.
3915 */
3916 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3917 #include "collationroot.h"
3918 #include "collationtailoring.h"
3919 #endif
3920
TestUCAUnsafeBackwards()3921 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3922 #if U_PLATFORM_HAS_WIN32_API
3923 infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3924 #elif !UCONFIG_NO_COLLATION
3925 UErrorCode errorCode = U_ZERO_ERROR;
3926
3927 // Get the unsafeBackwardsSet
3928 const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3929 if(U_FAILURE(errorCode)) {
3930 dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3931 return;
3932 }
3933 //const UVersionInfo &version = rootEntry->tailoring->version;
3934 const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3935
3936 checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3937
3938 if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3939 // simple test case
3940 // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3941 // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3942 UnicodeSet surrogates;
3943 surrogates.add(0xd83a); // a lead surrogate
3944 surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
3945 UnicodeString pat;
3946 surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3947 // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3948 // so that at least one type of surrogate code points are escaped,
3949 // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3950 errorCode = U_ZERO_ERROR;
3951 UnicodeSet s2;
3952 s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3953 if(U_FAILURE(errorCode)) {
3954 errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3955 } else {
3956 checkEqual(surrogates, s2, "surrogates to/from pattern");
3957 }
3958 // This occurs in the UCA unsafe-backwards set.
3959 checkRoundTrip(*unsafeBackwardSet);
3960 }
3961 #endif
3962 }
3963
TestIntOverflow()3964 void UnicodeSetTest::TestIntOverflow() {
3965 // This test triggers undefined double->int conversion behavior
3966 // if the implementation is not careful.
3967 IcuTestErrorCode errorCode(*this, "TestIntOverflow");
3968 UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
3969 assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
3970 assertEquals("[:ccc=int_overflow:] -> illegal argument",
3971 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3972 }
3973
TestUnusedCcc()3974 void UnicodeSetTest::TestUnusedCcc() {
3975 #if !UCONFIG_NO_NORMALIZATION
3976 // All numeric ccc values 0..255 are valid, but many are unused.
3977 IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
3978 UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
3979 assertSuccess("[:ccc=2:]", errorCode);
3980 assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
3981
3982 UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
3983 assertSuccess("[:ccc=255:]", errorCode);
3984 assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
3985
3986 // Non-integer values and values outside 0..255 are invalid.
3987 UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
3988 assertEquals("[:ccc=-1:] -> illegal argument",
3989 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3990 assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
3991
3992 UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
3993 assertEquals("[:ccc=256:] -> illegal argument",
3994 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3995 assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
3996
3997 UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
3998 assertEquals("[:ccc=1.1:] -> illegal argument",
3999 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4000 assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
4001 #endif
4002 }
4003
TestDeepPattern()4004 void UnicodeSetTest::TestDeepPattern() {
4005 IcuTestErrorCode errorCode(*this, "TestDeepPattern");
4006 // Nested ranges are parsed via recursion which can use a lot of stack space.
4007 // After a reasonable limit, we should get an error.
4008 constexpr int32_t DEPTH = 20000;
4009 UnicodeString pattern, suffix;
4010 for (int32_t i = 0; i < DEPTH; ++i) {
4011 pattern.append(u"[a", 2);
4012 suffix.append(']');
4013 }
4014 pattern.append(suffix);
4015 UnicodeSet set(pattern, errorCode);
4016 assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
4017 errorCode.reset();
4018 }
4019
TestEmptyString()4020 void UnicodeSetTest::TestEmptyString() {
4021 IcuTestErrorCode errorCode(*this, "TestEmptyString");
4022 // Starting with ICU 69, the empty string is allowed in UnicodeSet. ICU-13702
4023 UnicodeSet set(u"[{}]", errorCode);
4024 if (!assertSuccess("set from pattern with {}", errorCode)) { return; }
4025 assertTrue("set from pattern with {}", set.contains(u""));
4026 assertEquals("set from pattern with {}: size", 1, set.size());
4027 assertFalse("set from pattern with {}: isEmpty", set.isEmpty());
4028
4029 // Remove, add back, ...
4030 assertFalse("remove empty string", set.remove(u"").contains(u""));
4031 assertEquals("remove empty string: size", 0, set.size());
4032 assertTrue("remove empty string: isEmpty", set.isEmpty());
4033 assertTrue("add empty string", set.add(u"").contains(u""));
4034 // missing API -- assertTrue("retain empty string", set.retain(u"").contains(u""));
4035 assertFalse("complement-remove empty string", set.complement(u"").contains(u""));
4036 assertTrue("complement-add empty string", set.complement(u"").contains(u""));
4037
4038 assertFalse("clear", set.clear().contains(u""));
4039 assertTrue("add empty string 2", set.add(u"").contains(u""));
4040 assertFalse("removeAllStrings", set.removeAllStrings().contains(u""));
4041 assertTrue("add empty string 3", set.add(u"").contains(u""));
4042 // Note that this leaves the set containing exactly the empty string.
4043
4044 // strings() access and iteration
4045 // no C++ equivalent for Java strings() -- assertTrue("strings()", set.strings().contains(u""));
4046 UnicodeSetIterator sit(set);
4047 assertTrue("set iterator.next()", sit.next());
4048 assertTrue("set iterator has empty string", sit.isString() && sit.getString().isEmpty());
4049
4050 // The empty string is ignored in matching.
4051 set.add(u'a').add(u'c');
4052 assertEquals("span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4053 assertEquals("spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4054 assertTrue("containsNone", set.containsNone(u"def"));
4055 assertFalse("containsSome", set.containsSome(u"def"));
4056 set.freeze();
4057 assertEquals("frozen span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4058 assertEquals("frozen spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4059 assertTrue("frozen containsNone", set.containsNone(u"def"));
4060 assertFalse("frozen containsSome", set.containsSome(u"def"));
4061 }
4062