• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 *   Copyright (C) 1999-2016 International Business Machines Corporation and
6 *   others. All Rights Reserved.
7 ********************************************************************************
8 *   Date        Name        Description
9 *   10/20/99    alan        Creation.
10 *   03/22/2000  Madhu       Added additional tests
11 ********************************************************************************
12 */
13 
14 #include <stdio.h>
15 
16 #include <string.h>
17 #include "unicode/utypes.h"
18 #include "usettest.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/usetiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/symtable.h"
26 #include "unicode/utf8.h"
27 #include "unicode/utf16.h"
28 #include "unicode/uversion.h"
29 #include "cmemory.h"
30 #include "hash.h"
31 
// Reports (via dataerrln) the current file, line and u_errorName(status)
// when `status` holds a failure code; expands to nothing observable on success.
// Wrapped in UPRV_BLOCK_MACRO_BEGIN/END so it can be used as a single statement.
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
                  u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END
38 
// Reports (via dataerrln) the current file and line when `expr` evaluates
// to false. Execution continues after the report; the macro does not return.
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
44 
/**
 * Convenience for building log messages: appends the escaped pattern form
 * of `set` (as produced by toPattern() and UnicodeSetTest::escape()) to `left`.
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and returns a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
50 
UnicodeSetTest()51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52 }
53 
openUTF8Converter()54 UConverter *UnicodeSetTest::openUTF8Converter() {
55     if(utf8Cnv==NULL) {
56         UErrorCode errorCode=U_ZERO_ERROR;
57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
58     }
59     return utf8Cnv;
60 }
61 
/**
 * Closes the converter cached in utf8Cnv. The pointer is still NULL if
 * openUTF8Converter() was never called; ucnv_close() is called regardless
 * (NOTE(review): the ICU API documentation states ucnv_close accepts NULL).
 */
UnicodeSetTest::~UnicodeSetTest() {
    ucnv_close(utf8Cnv);
}
65 
/**
 * Test-suite dispatcher: logs the suite name when `exec` is set, then hands
 * `index`, `exec` and `name` to the TESTCASE_AUTO machinery for the tests
 * listed below. The fourth parameter (par) is unused.
 *
 * NOTE(review): the TESTCASE_AUTO* macros are defined outside this file;
 * presumably each entry is numbered by its position in this list, so the
 * order should be treated as significant — confirm against the framework.
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    if (exec) {
        logln(u"TestSuite UnicodeSetTest");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestPatterns);
    TESTCASE_AUTO(TestAddRemove);
    TESTCASE_AUTO(TestCategories);
    TESTCASE_AUTO(TestCloneEqualHash);
    TESTCASE_AUTO(TestMinimalRep);
    TESTCASE_AUTO(TestAPI);
    TESTCASE_AUTO(TestScriptSet);
    TESTCASE_AUTO(TestPropertySet);
    TESTCASE_AUTO(TestClone);
    TESTCASE_AUTO(TestExhaustive);
    TESTCASE_AUTO(TestToPattern);
    TESTCASE_AUTO(TestIndexOf);
    TESTCASE_AUTO(TestStrings);
    TESTCASE_AUTO(Testj2268);
    TESTCASE_AUTO(TestCloseOver);
    TESTCASE_AUTO(TestEscapePattern);
    TESTCASE_AUTO(TestInvalidCodePoint);
    TESTCASE_AUTO(TestSymbolTable);
    TESTCASE_AUTO(TestSurrogate);
    TESTCASE_AUTO(TestPosixClasses);
    TESTCASE_AUTO(TestIteration);
    TESTCASE_AUTO(TestFreezable);
    TESTCASE_AUTO(TestSpan);
    TESTCASE_AUTO(TestStringSpan);
    TESTCASE_AUTO(TestUCAUnsafeBackwards);
    TESTCASE_AUTO(TestIntOverflow);
    TESTCASE_AUTO(TestUnusedCcc);
    TESTCASE_AUTO(TestDeepPattern);
    TESTCASE_AUTO(TestEmptyString);
    TESTCASE_AUTO_END;
}
104 
// Marker entry used inside the expected-string arrays passed to
// expectToPattern(). NOTE(review): in TestToPattern the entries before NOT
// are strings the set contains and the entries after it are strings it does
// not, so this presumably switches the expectation — confirm in expectToPattern().
static const char NOT[] = "%%%%";
106 
107 /**
108  * UVector was improperly copying contents
109  * This code will crash this is still true
110  */
Testj2268()111 void UnicodeSetTest::Testj2268() {
112   UnicodeSet t;
113   t.add(UnicodeString("abc"));
114   UnicodeSet test(t);
115   UnicodeString ustrPat;
116   test.toPattern(ustrPat, TRUE);
117 }
118 
119 /**
120  * Test toPattern().
121  */
TestToPattern()122 void UnicodeSetTest::TestToPattern() {
123     UErrorCode ec = U_ZERO_ERROR;
124 
125     // Test that toPattern() round trips with syntax characters and
126     // whitespace.
127     {
128         static const char* OTHER_TOPATTERN_TESTS[] = {
129             "[[:latin:]&[:greek:]]",
130             "[[:latin:]-[:greek:]]",
131             "[:nonspacing mark:]",
132             NULL
133         };
134 
135         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
136             ec = U_ZERO_ERROR;
137             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
138             if (U_FAILURE(ec)) {
139                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
140                 continue;
141             }
142             checkPat(OTHER_TOPATTERN_TESTS[j], s);
143         }
144 
145         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
146             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
147 
148                 // check various combinations to make sure they all work.
149                 if (i != 0 && !toPatternAux(i, i)){
150                     continue;
151                 }
152                 if (!toPatternAux(0, i)){
153                     continue;
154                 }
155                 if (!toPatternAux(i, 0xFFFF)){
156                     continue;
157                 }
158             }
159         }
160     }
161 
162     // Test pattern behavior of multicharacter strings.
163     {
164         ec = U_ZERO_ERROR;
165         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
166 
167         // This loop isn't a loop.  It's here to make the compiler happy.
168         // If you're curious, try removing it and changing the 'break'
169         // statements (except for the last) to goto's.
170         for (;;) {
171             if (U_FAILURE(ec)) break;
172             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
173             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
174 
175             s->add("ac");
176             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
177             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
178 
179             s->applyPattern(u"[a-z {\\{l} {r\\}}]", ec);
180             if (U_FAILURE(ec)) break;
181             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
182             expectToPattern(*s, u"[a-z{r\\}}{\\{l}]", exp3);
183 
184             s->add("[]");
185             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
186             expectToPattern(*s, u"[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
187 
188             s->applyPattern(u"[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
189             if (U_FAILURE(ec)) break;
190             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
191             expectToPattern(*s, u"[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
192 
193             // j2189
194             s->clear();
195             s->add(UnicodeString("abc", ""));
196             s->add(UnicodeString("abc", ""));
197             const char* exp6[] = {"abc", NOT, "ab", NULL};
198             expectToPattern(*s, "[{abc}]", exp6);
199 
200             break;
201         }
202 
203         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
204         delete s;
205     }
206 
207     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
208     UnicodeSet s;
209     s.add(u'a', u'b');
210     expectToPattern(s, "[ab]", NULL);
211 }
212 
toPatternAux(UChar32 start,UChar32 end)213 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
214 
215     // use Integer.toString because Utility.hex doesn't handle ints
216     UnicodeString pat = "";
217     // TODO do these in hex
218     //String source = "0x" + Integer.toString(start,16).toUpperCase();
219     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
220     UnicodeString source;
221     source = source + (uint32_t)start;
222     if (start != end)
223         source = source + ".." + (uint32_t)end;
224     UnicodeSet testSet;
225     testSet.add(start, end);
226     return checkPat(source, testSet);
227 }
228 
checkPat(const UnicodeString & source,const UnicodeSet & testSet)229 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
230                                const UnicodeSet& testSet) {
231     // What we want to make sure of is that a pattern generated
232     // by toPattern(), with or without escaped unprintables, can
233     // be passed back into the UnicodeSet constructor.
234     UnicodeString pat0;
235 
236     testSet.toPattern(pat0, TRUE);
237 
238     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
239 
240     //String pat1 = unescapeLeniently(pat0);
241     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
242 
243     UnicodeString pat2;
244     testSet.toPattern(pat2, FALSE);
245     if (!checkPat(source, testSet, pat2)) return FALSE;
246 
247     //String pat3 = unescapeLeniently(pat2);
248     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
249 
250     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
251     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
252     return TRUE;
253 }
254 
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)255 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
256                                const UnicodeSet& testSet,
257                                const UnicodeString& pat) {
258     UErrorCode ec = U_ZERO_ERROR;
259     UnicodeSet testSet2(pat, ec);
260     if (testSet2 != testSet) {
261         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
262         return FALSE;
263     }
264     return TRUE;
265 }
266 
267 void
TestPatterns(void)268 UnicodeSetTest::TestPatterns(void) {
269     UnicodeSet set;
270     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
271     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
272     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
273     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
274     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
275     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
276 
277     // Throw in a test of complement
278     set.complement();
279     UnicodeString exp;
280     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(u'z'+1)).append(u'\uFFFF');
281     expectPairs(set, exp);
282 }
283 
284 void
TestCategories(void)285 UnicodeSetTest::TestCategories(void) {
286     UErrorCode status = U_ZERO_ERROR;
287     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
288     UnicodeSet set(pat, status);
289     if (U_FAILURE(status)) {
290         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
291         return;
292     } else {
293         expectContainment(set, pat, "ABC", "abc");
294     }
295 
296     UChar32 i;
297     int32_t failures = 0;
298     // Make sure generation of L doesn't pollute cached Lu set
299     // First generate L, then Lu
300     set.applyPattern("[:L:]", status);
301     if (U_FAILURE(status)) { errln("FAIL"); return; }
302     for (i=0; i<0x200; ++i) {
303         UBool l = u_isalpha((UChar)i);
304         if (l != set.contains(i)) {
305             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
306                   set.contains(i));
307             if (++failures == 10) break;
308         }
309     }
310 
311     set.applyPattern("[:Lu:]", status);
312     if (U_FAILURE(status)) { errln("FAIL"); return; }
313     for (i=0; i<0x200; ++i) {
314         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
315         if (lu != set.contains(i)) {
316             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
317                   set.contains(i));
318             if (++failures == 20) break;
319         }
320     }
321 }
322 void
TestCloneEqualHash(void)323 UnicodeSetTest::TestCloneEqualHash(void) {
324     UErrorCode status = U_ZERO_ERROR;
325     // set1 and set2 used to be built with the obsolete constructor taking
326     // UCharCategory values; replaced with pattern constructors
327     // markus 20030502
328     UnicodeSet *set1=new UnicodeSet(u"\\p{Lowercase Letter}", status); //  :Ll: Letter, lowercase
329     UnicodeSet *set1a=new UnicodeSet(u"[:Ll:]", status); //  Letter, lowercase
330     if (U_FAILURE(status)){
331         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
332         return;
333     }
334     UnicodeSet *set2=new UnicodeSet(u"\\p{Decimal Number}", status);   //Number, Decimal digit
335     UnicodeSet *set2a=new UnicodeSet(u"[:Nd:]", status);   //Number, Decimal digit
336     if (U_FAILURE(status)){
337         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
338         return;
339     }
340 
341     if (*set1 != *set1a) {
342         errln("FAIL: category constructor for Ll broken");
343     }
344     if (*set2 != *set2a) {
345         errln("FAIL: category constructor for Nd broken");
346     }
347     delete set1a;
348     delete set2a;
349 
350     logln("Testing copy construction");
351     UnicodeSet *set1copy=new UnicodeSet(*set1);
352     if(*set1 != *set1copy || *set1 == *set2 ||
353         getPairs(*set1) != getPairs(*set1copy) ||
354         set1->hashCode() != set1copy->hashCode()){
355         errln("FAIL : Error in copy construction");
356         return;
357     }
358 
359     logln("Testing =operator");
360     UnicodeSet set1equal=*set1;
361     UnicodeSet set2equal=*set2;
362     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
363         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
364         errln("FAIL: Error in =operator");
365     }
366 
367     logln("Testing clone()");
368     UnicodeSet *set1clone=set1->clone();
369     UnicodeSet *set2clone=set2->clone();
370     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
371         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
372         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
373         errln("FAIL: Error in clone");
374     }
375 
376     logln("Testing hashcode");
377     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
378         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
379         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
380         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
381         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
382         errln("FAIL: Error in hashCode()");
383     }
384 
385     delete set1;
386     delete set1copy;
387     delete set2;
388     delete set1clone;
389     delete set2clone;
390 
391 
392 }
393 void
TestAddRemove(void)394 UnicodeSetTest::TestAddRemove(void) {
395     UnicodeSet set; // Construct empty set
396     doAssert(set.isEmpty() == TRUE, "set should be empty");
397     doAssert(set.size() == 0, "size should be 0");
398     set.complement();
399     doAssert(set.size() == 0x110000, "size should be 0x110000");
400     set.clear();
401     set.add(0x0061, 0x007a);
402     expectPairs(set, "az");
403     doAssert(set.isEmpty() == FALSE, "set should not be empty");
404     doAssert(set.size() != 0, "size should not be equal to 0");
405     doAssert(set.size() == 26, "size should be equal to 26");
406     set.remove(0x006d, 0x0070);
407     expectPairs(set, "alqz");
408     doAssert(set.size() == 22, "size should be equal to 22");
409     set.remove(0x0065, 0x0067);
410     expectPairs(set, "adhlqz");
411     doAssert(set.size() == 19, "size should be equal to 19");
412     set.remove(0x0064, 0x0069);
413     expectPairs(set, "acjlqz");
414     doAssert(set.size() == 16, "size should be equal to 16");
415     set.remove(0x0063, 0x0072);
416     expectPairs(set, "absz");
417     doAssert(set.size() == 10, "size should be equal to 10");
418     set.add(0x0066, 0x0071);
419     expectPairs(set, "abfqsz");
420     doAssert(set.size() == 22, "size should be equal to 22");
421     set.remove(0x0061, 0x0067);
422     expectPairs(set, "hqsz");
423     set.remove(0x0061, 0x007a);
424     expectPairs(set, "");
425     doAssert(set.isEmpty() == TRUE, "set should be empty");
426     doAssert(set.size() == 0, "size should be 0");
427     set.add(0x0061);
428     doAssert(set.isEmpty() == FALSE, "set should not be empty");
429     doAssert(set.size() == 1, "size should not be equal to 1");
430     set.add(0x0062);
431     set.add(0x0063);
432     expectPairs(set, "ac");
433     doAssert(set.size() == 3, "size should not be equal to 3");
434     set.add(0x0070);
435     set.add(0x0071);
436     expectPairs(set, "acpq");
437     doAssert(set.size() == 5, "size should not be equal to 5");
438     set.clear();
439     expectPairs(set, "");
440     doAssert(set.isEmpty() == TRUE, "set should be empty");
441     doAssert(set.size() == 0, "size should be 0");
442 
443     // Try removing an entire set from another set
444     expectPattern(set, "[c-x]", "cx");
445     UnicodeSet set2;
446     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
447     set.removeAll(set2);
448     expectPairs(set, "deluxx");
449 
450     // Try adding an entire set to another set
451     expectPattern(set, "[jackiemclean]", "aacceein");
452     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
453     set.addAll(set2);
454     expectPairs(set, "aacehort");
455     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
456 
457     // Try retaining an set of elements contained in another set (intersection)
458     UnicodeSet set3;
459     expectPattern(set3, "[a-c]", "ac");
460     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
461     set3.remove(0x0062);
462     expectPairs(set3, "aacc");
463     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
464     set.retainAll(set3);
465     expectPairs(set, "aacc");
466     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
467     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
468     set.clear();
469     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
470 
471     // Test commutativity
472     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
473     expectPattern(set2, "[jackiemclean]", "aacceein");
474     set.addAll(set2);
475     expectPairs(set, "aacehort");
476     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
477 
478 
479 
480 
481 }
482 
483 /**
484  * Make sure minimal representation is maintained.
485  */
TestMinimalRep()486 void UnicodeSetTest::TestMinimalRep() {
487     UErrorCode status = U_ZERO_ERROR;
488     // This is pretty thoroughly tested by checkCanonicalRep()
489     // run against the exhaustive operation results.  Use the code
490     // here for debugging specific spot problems.
491 
492     // 1 overlap against 2
493     UnicodeSet set("[h-km-q]", status);
494     if (U_FAILURE(status)) { errln("FAIL"); return; }
495     UnicodeSet set2("[i-o]", status);
496     if (U_FAILURE(status)) { errln("FAIL"); return; }
497     set.addAll(set2);
498     expectPairs(set, "hq");
499     // right
500     set.applyPattern("[a-m]", status);
501     if (U_FAILURE(status)) { errln("FAIL"); return; }
502     set2.applyPattern("[e-o]", status);
503     if (U_FAILURE(status)) { errln("FAIL"); return; }
504     set.addAll(set2);
505     expectPairs(set, "ao");
506     // left
507     set.applyPattern("[e-o]", status);
508     if (U_FAILURE(status)) { errln("FAIL"); return; }
509     set2.applyPattern("[a-m]", status);
510     if (U_FAILURE(status)) { errln("FAIL"); return; }
511     set.addAll(set2);
512     expectPairs(set, "ao");
513     // 1 overlap against 3
514     set.applyPattern("[a-eg-mo-w]", status);
515     if (U_FAILURE(status)) { errln("FAIL"); return; }
516     set2.applyPattern("[d-q]", status);
517     if (U_FAILURE(status)) { errln("FAIL"); return; }
518     set.addAll(set2);
519     expectPairs(set, "aw");
520 }
521 
TestAPI()522 void UnicodeSetTest::TestAPI() {
523     UErrorCode status = U_ZERO_ERROR;
524     // default ct
525     UnicodeSet set;
526     if (!set.isEmpty() || set.getRangeCount() != 0) {
527         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
528               set);
529     }
530 
531     // clear(), isEmpty()
532     set.add(0x0061);
533     if (set.isEmpty()) {
534         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
535               set);
536     }
537     set.clear();
538     if (!set.isEmpty()) {
539         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
540               set);
541     }
542 
543     // size()
544     set.clear();
545     if (set.size() != 0) {
546         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
547               ": " + set);
548     }
549     set.add(0x0061);
550     if (set.size() != 1) {
551         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
552               ": " + set);
553     }
554     set.add(0x0031, 0x0039);
555     if (set.size() != 10) {
556         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
557               ": " + set);
558     }
559 
560     // contains(first, last)
561     set.clear();
562     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
563     if (U_FAILURE(status)) { errln("FAIL"); return; }
564     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
565         UChar32 a = set.getRangeStart(i);
566         UChar32 b = set.getRangeEnd(i);
567         if (!set.contains(a, b)) {
568             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
569                   " but doesn't: " + set);
570         }
571         if (set.contains((UChar32)(a-1), b)) {
572             errln((UnicodeString)"FAIL, shouldn't contain " +
573                   (unsigned short)(a-1) + '-' + (unsigned short)b +
574                   " but does: " + set);
575         }
576         if (set.contains(a, (UChar32)(b+1))) {
577             errln((UnicodeString)"FAIL, shouldn't contain " +
578                   (unsigned short)a + '-' + (unsigned short)(b+1) +
579                   " but does: " + set);
580         }
581     }
582 
583     // Ported InversionList test.
584     UnicodeSet a((UChar32)3,(UChar32)10);
585     UnicodeSet b((UChar32)7,(UChar32)15);
586     UnicodeSet c;
587 
588     logln((UnicodeString)"a [3-10]: " + a);
589     logln((UnicodeString)"b [7-15]: " + b);
590     c = a;
591     c.addAll(b);
592     UnicodeSet exp((UChar32)3,(UChar32)15);
593     if (c == exp) {
594         logln((UnicodeString)"c.set(a).add(b): " + c);
595     } else {
596         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
597     }
598     c.complement();
599     exp.set((UChar32)0, (UChar32)2);
600     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
601     if (c == exp) {
602         logln((UnicodeString)"c.complement(): " + c);
603     } else {
604         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
605     }
606     c.complement();
607     exp.set((UChar32)3, (UChar32)15);
608     if (c == exp) {
609         logln((UnicodeString)"c.complement(): " + c);
610     } else {
611         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
612     }
613     c = a;
614     c.complementAll(b);
615     exp.set((UChar32)3,(UChar32)6);
616     exp.add((UChar32)11,(UChar32) 15);
617     if (c == exp) {
618         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
619     } else {
620         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
621     }
622 
623     exp = c;
624     bitsToSet(setToBits(c), c);
625     if (c == exp) {
626         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
627     } else {
628         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
629     }
630 
631     // Additional tests for coverage JB#2118
632     //UnicodeSet::complement(class UnicodeString const &)
633     //UnicodeSet::complementAll(class UnicodeString const &)
634     //UnicodeSet::containsNone(class UnicodeSet const &)
635     //UnicodeSet::containsNone(long,long)
636     //UnicodeSet::containsSome(class UnicodeSet const &)
637     //UnicodeSet::containsSome(long,long)
638     //UnicodeSet::removeAll(class UnicodeString const &)
639     //UnicodeSet::retain(long)
640     //UnicodeSet::retainAll(class UnicodeString const &)
641     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
642     //UnicodeSetIterator::getString(void)
643     set.clear();
644     set.complement("ab");
645     exp.applyPattern("[{ab}]", status);
646     if (U_FAILURE(status)) { errln("FAIL"); return; }
647     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
648 
649     UnicodeSetIterator iset(set);
650     if (!iset.next() || !iset.isString()) {
651         errln("FAIL: UnicodeSetIterator::next/isString");
652     } else if (iset.getString() != "ab") {
653         errln("FAIL: UnicodeSetIterator::getString");
654     }
655 
656     set.add(u'a', u'z');
657     set.complementAll("alan");
658     exp.applyPattern("[{ab}b-kmo-z]", status);
659     if (U_FAILURE(status)) { errln("FAIL"); return; }
660     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
661 
662     exp.applyPattern("[a-z]", status);
663     if (U_FAILURE(status)) { errln("FAIL"); return; }
664     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
665     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
666     exp.applyPattern("[aln]", status);
667     if (U_FAILURE(status)) { errln("FAIL"); return; }
668     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
669     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
670 
671     if (set.containsNone(u'a', u'z')) {
672         errln("FAIL: containsNone(UChar32, UChar32)");
673     }
674     if (!set.containsSome(u'a', u'z')) {
675         errln("FAIL: containsSome(UChar32, UChar32)");
676     }
677     if (!set.containsNone(u'A', u'Z')) {
678         errln("FAIL: containsNone(UChar32, UChar32)");
679     }
680     if (set.containsSome(u'A', u'Z')) {
681         errln("FAIL: containsSome(UChar32, UChar32)");
682     }
683 
684     set.removeAll("liu");
685     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
686     if (U_FAILURE(status)) { errln("FAIL"); return; }
687     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
688 
689     set.retainAll("star");
690     exp.applyPattern("[rst]", status);
691     if (U_FAILURE(status)) { errln("FAIL"); return; }
692     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
693 
694     set.retain(u's');
695     exp.applyPattern("[s]", status);
696     if (U_FAILURE(status)) { errln("FAIL"); return; }
697     if (set != exp) { errln("FAIL: retain('s')"); return; }
698 
699     // ICU 2.6 coverage tests
700     // public final UnicodeSet retain(String s);
701     // public final UnicodeSet remove(int c);
702     // public final UnicodeSet remove(String s);
703     // public int hashCode();
704     set.applyPattern(u"[a-z{ab}{cd}]", status);
705     if (U_FAILURE(status)) { errln("FAIL"); return; }
706     set.retain(u"cd");
707     exp.applyPattern(u"[{cd}]", status);
708     if (U_FAILURE(status)) { errln("FAIL"); return; }
709     if (set != exp) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
710 
711     set.applyPattern(u"[a-z{ab}{yz}]", status);
712     if (U_FAILURE(status)) { errln("FAIL"); return; }
713     set.retain(u"cd");
714     exp.clear();
715     if (set != exp) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
716 
717     set.applyPattern(u"[a-z{ab}{cd}]", status);
718     if (U_FAILURE(status)) { errln("FAIL"); return; }
719     set.remove(u'c');
720     exp.applyPattern(u"[abd-z{ab}{cd}]", status);
721     if (set != exp) { errln("FAIL: remove('c')"); return; }
722 
723     set.remove(u"cd");
724     exp.applyPattern(u"[abd-z{ab}]", status);
725     if (U_FAILURE(status)) { errln("FAIL"); return; }
726     if (set != exp) { errln("FAIL: remove(\"cd\")"); return; }
727 
728     set.applyPattern("[s]", status);
729     if (U_FAILURE(status)) { errln("FAIL"); return; }
730     uint16_t buf[32];
731     int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
732     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
733     if (slen != 3 || buf[0] != 2 || buf[1] != u's' || buf[2] != u't') {
734         errln("FAIL: serialize");
735         return;
736     }
737 
738     // Conversions to and from USet
739     UnicodeSet *uniset = &set;
740     USet *uset = uniset->toUSet();
741     TEST_ASSERT((void *)uset == (void *)uniset);
742     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
743     TEST_ASSERT((void *)setx == (void *)uset);
744     const UnicodeSet *constSet = uniset;
745     const USet *constUSet = constSet->toUSet();
746     TEST_ASSERT((void *)constUSet == (void *)constSet);
747     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
748     TEST_ASSERT((void *)constSetx == (void *)constUSet);
749 
750     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
751     UnicodeString longString=u"aaaaaaaaaabbbbbbbbbbcccccccccc";
752     UnicodeSet ac(0x61, 0x63);
753     ac.remove(0x62).freeze();
754     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
755         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
756         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
757         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
758         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
759         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
760         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
761         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
762         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
763         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
764     ) {
765         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
766     }
767     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
768         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
769         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
770         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
771         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
772         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
773         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
774         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
775         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
776         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
777     ) {
778         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
779     }
780 }
781 
TestIteration()782 void UnicodeSetTest::TestIteration() {
783     UErrorCode ec = U_ZERO_ERROR;
784     int i = 0;
785     int outerLoop;
786 
787     // 6 code points, 3 ranges, 2 strings, 8 total elements
788     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
789     UnicodeSet set(u"[zabyc\\U0001abcd{str1}{str2}]", ec);
790     TEST_ASSERT_SUCCESS(ec);
791     UnicodeSetIterator it(set);
792 
793     for (outerLoop=0; outerLoop<3; outerLoop++) {
794         // Run the test multiple times, to check that iterator.reset() is working.
795         for (i=0; i<10; i++) {
796             UBool         nextv        = it.next();
797             UBool         isString     = it.isString();
798             int32_t       codePoint    = it.getCodepoint();
799             //int32_t       codePointEnd = it.getCodepointEnd();
800             UnicodeString s   = it.getString();
801             switch (i) {
802             case 0:
803                 TEST_ASSERT(nextv == TRUE);
804                 TEST_ASSERT(isString == FALSE);
805                 TEST_ASSERT(codePoint==0x61);
806                 TEST_ASSERT(s == "a");
807                 break;
808             case 1:
809                 TEST_ASSERT(nextv == TRUE);
810                 TEST_ASSERT(isString == FALSE);
811                 TEST_ASSERT(codePoint==0x62);
812                 TEST_ASSERT(s == "b");
813                 break;
814             case 2:
815                 TEST_ASSERT(nextv == TRUE);
816                 TEST_ASSERT(isString == FALSE);
817                 TEST_ASSERT(codePoint==0x63);
818                 TEST_ASSERT(s == "c");
819                 break;
820             case 3:
821                 TEST_ASSERT(nextv == TRUE);
822                 TEST_ASSERT(isString == FALSE);
823                 TEST_ASSERT(codePoint==0x79);
824                 TEST_ASSERT(s == "y");
825                 break;
826             case 4:
827                 TEST_ASSERT(nextv == TRUE);
828                 TEST_ASSERT(isString == FALSE);
829                 TEST_ASSERT(codePoint==0x7a);
830                 TEST_ASSERT(s == "z");
831                 break;
832             case 5:
833                 TEST_ASSERT(nextv == TRUE);
834                 TEST_ASSERT(isString == FALSE);
835                 TEST_ASSERT(codePoint==0x1abcd);
836                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
837                 break;
838             case 6:
839                 TEST_ASSERT(nextv == TRUE);
840                 TEST_ASSERT(isString == TRUE);
841                 TEST_ASSERT(s == "str1");
842                 break;
843             case 7:
844                 TEST_ASSERT(nextv == TRUE);
845                 TEST_ASSERT(isString == TRUE);
846                 TEST_ASSERT(s == "str2");
847                 break;
848             case 8:
849                 TEST_ASSERT(nextv == FALSE);
850                 break;
851             case 9:
852                 TEST_ASSERT(nextv == FALSE);
853                 break;
854             }
855         }
856         it.reset();  // prepare to run the iteration again.
857     }
858 }
859 
860 
861 
862 
TestStrings()863 void UnicodeSetTest::TestStrings() {
864     UErrorCode ec = U_ZERO_ERROR;
865 
866     UnicodeSet* testList[] = {
867         UnicodeSet::createFromAll("abc"),
868         new UnicodeSet("[a-c]", ec),
869 
870         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
871         new UnicodeSet("[{ll}{ch}a-z]", ec),
872 
873         UnicodeSet::createFrom("ab}c"),
874         new UnicodeSet("[{ab\\}c}]", ec),
875 
876         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
877         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
878 
879         NULL
880     };
881 
882     if (U_FAILURE(ec)) {
883         errln("FAIL: couldn't construct test sets");
884     }
885 
886     for (int32_t i = 0; testList[i] != NULL; i+=2) {
887         if (U_SUCCESS(ec)) {
888             UnicodeString pat0, pat1;
889             testList[i]->toPattern(pat0, TRUE);
890             testList[i+1]->toPattern(pat1, TRUE);
891             if (*testList[i] == *testList[i+1]) {
892                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
893             } else {
894                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
895             }
896         }
897         delete testList[i];
898         delete testList[i+1];
899     }
900 }
901 
902 /**
903  * Test the [:Latin:] syntax.
904  */
TestScriptSet()905 void UnicodeSetTest::TestScriptSet() {
906     expectContainment(u"[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
907 
908     expectContainment(u"[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
909 
910     /* Jitterbug 1423 */
911     expectContainment(u"[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
912 
913 }
914 
915 /**
916  * Test the [:Latin:] syntax.
917  */
TestPropertySet()918 void UnicodeSetTest::TestPropertySet() {
919     static const char* const DATA[] = {
920         // Pattern, Chars IN, Chars NOT in
921 
922         "[:Latin:]",
923         "aA",
924         "\\u0391\\u03B1",
925 
926         "[\\p{Greek}]",
927         "\\u0391\\u03B1",
928         "aA",
929 
930         "\\P{ GENERAL Category = upper case letter }",
931         "abc",
932         "ABC",
933 
934 #if !UCONFIG_NO_NORMALIZATION
935         // Combining class: @since ICU 2.2
936         // Check both symbolic and numeric
937         "\\p{ccc=Nukta}",
938         "\\u0ABC",
939         "abc",
940 
941         "\\p{Canonical Combining Class = 11}",
942         "\\u05B1",
943         "\\u05B2",
944 
945         "[:c c c = iota subscript :]",
946         "\\u0345",
947         "xyz",
948 #endif
949 
950         // Bidi class: @since ICU 2.2
951         "\\p{bidiclass=lefttoright}",
952         "abc",
953         "\\u0671\\u0672",
954 
955         // Binary properties: @since ICU 2.2
956         "\\p{ideographic}",
957         "\\u4E0A",
958         "x",
959 
960         "[:math=false:]",
961         "q)*(",
962         // weiv: )(and * were removed from math in Unicode 4.0.1
963         //"(*+)",
964         "+<>^",
965 
966         // JB#1767 \N{}, \p{ASCII}
967         "[:Ascii:]",
968         "abc\\u0000\\u007F",
969         "\\u0080\\u4E00",
970 
971         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
972         "az",
973         "qrs",
974 
975         // JB#2015
976         "[:any:]",
977         "a\\U0010FFFF",
978         "",
979 
980         "[:nv=0.5:]",
981         "\\u00BD\\u0F2A",
982         "\\u00BC",
983 
984         // JB#2653: Age
985         "[:Age=1.1:]",
986         "\\u03D6", // 1.1
987         "\\u03D8\\u03D9", // 3.2
988 
989         "[:Age=3.1:]",
990         "\\u1800\\u3400\\U0002f800",
991         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
992 
993         // JB#2350: Case_Sensitive
994         "[:Case Sensitive:]",
995         "A\\u1FFC\\U00010410",
996         ";\\u00B4\\U00010500",
997 
998         // JB#2832: C99-compatibility props
999         "[:blank:]",
1000         " \\u0009",
1001         "1-9A-Z",
1002 
1003         "[:graph:]",
1004         "19AZ",
1005         " \\u0003\\u0007\\u0009\\u000A\\u000D",
1006 
1007         "[:punct:]",
1008         "!@#%&*()[]{}-_\\/;:,.?'\"",
1009         "09azAZ",
1010 
1011         "[:xdigit:]",
1012         "09afAF",
1013         "gG!",
1014 
1015         // Regex compatibility test
1016         "[-b]", // leading '-' is literal
1017         "-b",
1018         "ac",
1019 
1020         "[^-b]", // leading '-' is literal
1021         "ac",
1022         "-b",
1023 
1024         "[b-]", // trailing '-' is literal
1025         "-b",
1026         "ac",
1027 
1028         "[^b-]", // trailing '-' is literal
1029         "ac",
1030         "-b",
1031 
1032         "[a-b-]", // trailing '-' is literal
1033         "ab-",
1034         "c=",
1035 
1036         "[[a-q]&[p-z]-]", // trailing '-' is literal
1037         "pq-",
1038         "or=",
1039 
1040         "[\\s|\\)|:|$|\\>]", // from regex tests
1041         "s|):$>",
1042         "abc",
1043 
1044         "[\\uDC00cd]", // JB#2906: isolated trail at start
1045         "cd\\uDC00",
1046         "ab\\uD800\\U00010000",
1047 
1048         "[ab\\uD800]", // JB#2906: isolated trail at start
1049         "ab\\uD800",
1050         "cd\\uDC00\\U00010000",
1051 
1052         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1053         "abcd\\uD800",
1054         "ef\\uDC00\\U00010000",
1055 
1056         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1057         "abcd\\uDC00",
1058         "ef\\uD800\\U00010000",
1059 
1060 #if !UCONFIG_NO_NORMALIZATION
1061         "[:^lccc=0:]", // Lead canonical class
1062         "\\u0300\\u0301",
1063         "abcd\\u00c0\\u00c5",
1064 
1065         "[:^tccc=0:]", // Trail canonical class
1066         "\\u0300\\u0301\\u00c0\\u00c5",
1067         "abcd",
1068 
1069         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1070         "\\u0300\\u0301\\u00c0\\u00c5",
1071         "abcd",
1072 
1073         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1074         "",
1075         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1076 
1077         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1078         "\\u0F73\\u0F75\\u0F81",
1079         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1080 #endif /* !UCONFIG_NO_NORMALIZATION */
1081 
1082         "[:Assigned:]",
1083         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1084         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1085 
1086         // Script_Extensions, new in Unicode 6.0
1087         "[:scx=Arab:]",
1088         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1089         "\\u061D\\uFDEF\\uFDFE",
1090 
1091         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1092         // so scx-sc is missing U+FDF2.
1093         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1094         "\\u0640\\u064B\\u0650\\u0655",
1095         "\\uFDF2"
1096     };
1097 
1098     static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1099 
1100     for (int32_t i=0; i<DATA_LEN; i+=3) {
1101         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1102                           CharsToUnicodeString(DATA[i+2]));
1103     }
1104 }
1105 
1106 /**
1107   * Test that Posix style character classes [:digit:], etc.
1108   *   have the Unicode definitions from TR 18.
1109   */
TestPosixClasses()1110 void UnicodeSetTest::TestPosixClasses() {
1111     {
1112         UErrorCode status = U_ZERO_ERROR;
1113         UnicodeSet s1("[:alpha:]", status);
1114         UnicodeSet s2(u"\\p{Alphabetic}", status);
1115         TEST_ASSERT_SUCCESS(status);
1116         TEST_ASSERT(s1==s2);
1117     }
1118     {
1119         UErrorCode status = U_ZERO_ERROR;
1120         UnicodeSet s1("[:lower:]", status);
1121         UnicodeSet s2(u"\\p{lowercase}", status);
1122         TEST_ASSERT_SUCCESS(status);
1123         TEST_ASSERT(s1==s2);
1124     }
1125     {
1126         UErrorCode status = U_ZERO_ERROR;
1127         UnicodeSet s1("[:upper:]", status);
1128         UnicodeSet s2(u"\\p{Uppercase}", status);
1129         TEST_ASSERT_SUCCESS(status);
1130         TEST_ASSERT(s1==s2);
1131     }
1132     {
1133         UErrorCode status = U_ZERO_ERROR;
1134         UnicodeSet s1("[:punct:]", status);
1135         UnicodeSet s2(u"\\p{gc=Punctuation}", status);
1136         TEST_ASSERT_SUCCESS(status);
1137         TEST_ASSERT(s1==s2);
1138     }
1139     {
1140         UErrorCode status = U_ZERO_ERROR;
1141         UnicodeSet s1("[:digit:]", status);
1142         UnicodeSet s2(u"\\p{gc=DecimalNumber}", status);
1143         TEST_ASSERT_SUCCESS(status);
1144         TEST_ASSERT(s1==s2);
1145     }
1146     {
1147         UErrorCode status = U_ZERO_ERROR;
1148         UnicodeSet s1("[:xdigit:]", status);
1149         UnicodeSet s2(u"[\\p{DecimalNumber}\\p{HexDigit}]", status);
1150         TEST_ASSERT_SUCCESS(status);
1151         TEST_ASSERT(s1==s2);
1152     }
1153     {
1154         UErrorCode status = U_ZERO_ERROR;
1155         UnicodeSet s1("[:alnum:]", status);
1156         UnicodeSet s2(u"[\\p{Alphabetic}\\p{DecimalNumber}]", status);
1157         TEST_ASSERT_SUCCESS(status);
1158         TEST_ASSERT(s1==s2);
1159     }
1160     {
1161         UErrorCode status = U_ZERO_ERROR;
1162         UnicodeSet s1("[:space:]", status);
1163         UnicodeSet s2(u"\\p{Whitespace}", status);
1164         TEST_ASSERT_SUCCESS(status);
1165         TEST_ASSERT(s1==s2);
1166     }
1167     {
1168         UErrorCode status = U_ZERO_ERROR;
1169         UnicodeSet s1("[:blank:]", status);
1170         TEST_ASSERT_SUCCESS(status);
1171         UnicodeSet s2(u"[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1172             status);
1173         TEST_ASSERT_SUCCESS(status);
1174         TEST_ASSERT(s1==s2);
1175     }
1176     {
1177         UErrorCode status = U_ZERO_ERROR;
1178         UnicodeSet s1("[:cntrl:]", status);
1179         TEST_ASSERT_SUCCESS(status);
1180         UnicodeSet s2(u"\\p{Control}", status);
1181         TEST_ASSERT_SUCCESS(status);
1182         TEST_ASSERT(s1==s2);
1183     }
1184     {
1185         UErrorCode status = U_ZERO_ERROR;
1186         UnicodeSet s1("[:graph:]", status);
1187         TEST_ASSERT_SUCCESS(status);
1188         UnicodeSet s2(u"[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
1189         TEST_ASSERT_SUCCESS(status);
1190         TEST_ASSERT(s1==s2);
1191     }
1192     {
1193         UErrorCode status = U_ZERO_ERROR;
1194         UnicodeSet s1("[:print:]", status);
1195         TEST_ASSERT_SUCCESS(status);
1196         UnicodeSet s2(u"[[:graph:][:blank:]-[\\p{Control}]]", status);
1197         TEST_ASSERT_SUCCESS(status);
1198         TEST_ASSERT(s1==s2);
1199     }
1200 }
1201 /**
1202  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1203  */
TestClone()1204 void UnicodeSetTest::TestClone() {
1205     UErrorCode ec = U_ZERO_ERROR;
1206     UnicodeSet s("[abcxyz]", ec);
1207     UnicodeSet t(s);
1208     expectContainment(t, "abc", "def");
1209 }
1210 
1211 /**
1212  * Test the indexOf() and charAt() methods.
1213  */
TestIndexOf()1214 void UnicodeSetTest::TestIndexOf() {
1215     UErrorCode ec = U_ZERO_ERROR;
1216     UnicodeSet set("[a-cx-y3578]", ec);
1217     if (U_FAILURE(ec)) {
1218         errln("FAIL: UnicodeSet constructor");
1219         return;
1220     }
1221     for (int32_t i=0; i<set.size(); ++i) {
1222         UChar32 c = set.charAt(i);
1223         if (set.indexOf(c) != i) {
1224             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1225                 i, c, set.indexOf(c));
1226         }
1227     }
1228     UChar32 c = set.charAt(set.size());
1229     if (c != -1) {
1230         errln("FAIL: charAt(<out of range>) = %X", c);
1231     }
1232     int32_t j = set.indexOf(u'q');
1233     if (j != -1) {
1234         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1235     }
1236 }
1237 
1238 /**
1239  * Test closure API.
1240  */
TestCloseOver()1241 void UnicodeSetTest::TestCloseOver() {
1242     UErrorCode ec = U_ZERO_ERROR;
1243 
1244     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1245     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1246     const char* DATA[] = {
1247         // selector, input, output
1248         CASE,
1249         "[aq\\u00DF{Bc}{bC}{Fi}]",
1250         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1251 
1252         CASE,
1253         "[\\u01F1]", // 'DZ'
1254         "[\\u01F1\\u01F2\\u01F3]",
1255 
1256         CASE,
1257         "[\\u1FB4]",
1258         "[\\u1FB4{\\u03AC\\u03B9}]",
1259 
1260         CASE,
1261         "[{F\\uFB01}]",
1262         "[\\uFB03{ffi}]",
1263 
1264         CASE, // make sure binary search finds limits
1265         "[a\\uFF3A]",
1266         "[aA\\uFF3A\\uFF5A]",
1267 
1268         CASE,
1269         "[a-z]","[A-Za-z\\u017F\\u212A]",
1270         CASE,
1271         "[abc]","[A-Ca-c]",
1272         CASE,
1273         "[ABC]","[A-Ca-c]",
1274 
1275         CASE, "[i]", "[iI]",
1276 
1277         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1278         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1279 
1280         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1281 
1282         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1283 
1284         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1285 
1286         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1287 
1288         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1289 
1290         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1291 
1292         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1293         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1294 
1295         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1296 
1297         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1298 
1299         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1300 
1301 #if !UCONFIG_NO_FILE_IO
1302         CASE_MAPPINGS,
1303         "[aq\\u00DF{Bc}{bC}{Fi}]",
1304         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1305 #endif
1306 
1307         CASE_MAPPINGS,
1308         "[\\u01F1]", // 'DZ'
1309         "[\\u01F1\\u01F2\\u01F3]",
1310 
1311         CASE_MAPPINGS,
1312         "[a-z]",
1313         "[A-Za-z]",
1314 
1315         NULL
1316     };
1317 
1318     UnicodeSet s;
1319     UnicodeSet t;
1320     UnicodeString buf;
1321     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1322         int32_t selector = DATA[i][0];
1323         UnicodeString pat(DATA[i+1], -1, US_INV);
1324         UnicodeString exp(DATA[i+2], -1, US_INV);
1325         s.applyPattern(pat, ec);
1326         s.closeOver(selector);
1327         t.applyPattern(exp, ec);
1328         if (U_FAILURE(ec)) {
1329             errln("FAIL: applyPattern failed");
1330             continue;
1331         }
1332         if (s == t) {
1333             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1334         } else {
1335             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1336                   s.toPattern(buf, TRUE) + ", expected " + exp);
1337         }
1338     }
1339 
1340 #if 0
1341     /*
1342      * Unused test code.
1343      * This was used to compare the old implementation (using USET_CASE)
1344      * with the new one (using 0x100 temporarily)
1345      * while transitioning from hardcoded case closure tables in uniset.cpp
1346      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1347      * and using ucase.c functions for closure.
1348      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1349      *
1350      * Note: The old and new implementation never fully matched because
1351      * the old implementation turned out to not map U+0130 and U+0131 correctly
1352      * (dotted I and dotless i) and because the old implementation's data tables
1353      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1354      * new implementation. (So sigmas and some other characters were not handled
1355      * according to the newer Unicode version.)
1356      */
1357     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1358     UnicodeSetIterator si(sens);
1359     UnicodeString str, buf2;
1360     const UnicodeString *pStr;
1361     UChar32 c;
1362     while(si.next()) {
1363         if(!si.isString()) {
1364             c=si.getCodepoint();
1365             s.clear();
1366             s.add(c);
1367 
1368             str.setTo(c);
1369             str.foldCase();
1370             sens2.add(str);
1371 
1372             t=s;
1373             s.closeOver(USET_CASE);
1374             t.closeOver(0x100);
1375             if(s!=t) {
1376                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1377                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1378             }
1379         }
1380     }
1381     // remove all code points
1382     // should contain all full case folding mapping strings
1383     sens2.remove(0, 0x10ffff);
1384     si.reset(sens2);
1385     while(si.next()) {
1386         if(si.isString()) {
1387             pStr=&si.getString();
1388             s.clear();
1389             s.add(*pStr);
1390             t=s2=s;
1391             s.closeOver(USET_CASE);
1392             t.closeOver(0x100);
1393             if(s!=t) {
1394                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1395                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1396             }
1397         }
1398     }
1399 #endif
1400 
1401     // Test the pattern API
1402     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1403     if (U_FAILURE(ec)) {
1404         errln("FAIL: applyPattern failed");
1405     } else {
1406         expectContainment(s, "abcABC", "defDEF");
1407     }
1408     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1409     if (U_FAILURE(ec)) {
1410         errln("FAIL: constructor failed");
1411     } else {
1412         expectContainment(v, "defDEF", "abcABC");
1413     }
1414     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1415     if (U_FAILURE(ec)) {
1416         errln("FAIL: construct w/case mappings failed");
1417     } else {
1418         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1419     }
1420 }
1421 
TestEscapePattern()1422 void UnicodeSetTest::TestEscapePattern() {
1423     const char pattern[] =
1424         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1425     const char exp[] =
1426         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1427     // We test this with two passes; in the second pass we
1428     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1429     // this fails -- which is what we expect.
1430     for (int32_t pass=1; pass<=2; ++pass) {
1431         UErrorCode ec = U_ZERO_ERROR;
1432         UnicodeString pat(pattern, -1, US_INV);
1433         if (pass==2) {
1434             pat = pat.unescape();
1435         }
1436         // Pattern is only good for pass 1
1437         UBool isPatternValid = (pass==1);
1438 
1439         UnicodeSet set(pat, ec);
1440         if (U_SUCCESS(ec) != isPatternValid){
1441             errln((UnicodeString)"FAIL: applyPattern(" +
1442                   escape(pat) + ") => " +
1443                   u_errorName(ec));
1444             continue;
1445         }
1446         if (U_FAILURE(ec)) {
1447             continue;
1448         }
1449         if (set.contains(u'\u0644')){
1450             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1451         }
1452 
1453         UnicodeString newpat;
1454         set.toPattern(newpat, TRUE);
1455         if (newpat == UnicodeString(exp, -1, US_INV)) {
1456             logln(escape(pat) + " => " + newpat);
1457         } else {
1458             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1459         }
1460 
1461         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1462             UnicodeString str("Range ");
1463             str.append((UChar)(u'0' + i))
1464                 .append(": ")
1465                 .append((UChar32)set.getRangeStart(i))
1466                 .append(" - ")
1467                 .append((UChar32)set.getRangeEnd(i));
1468             str = str + " (" + set.getRangeStart(i) + " - " +
1469                 set.getRangeEnd(i) + ")";
1470             if (set.getRangeStart(i) < 0) {
1471                 errln((UnicodeString)"FAIL: " + escape(str));
1472             } else {
1473                 logln(escape(str));
1474             }
1475         }
1476     }
1477 }
1478 
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1479 void UnicodeSetTest::expectRange(const UnicodeString& label,
1480                                  const UnicodeSet& set,
1481                                  UChar32 start, UChar32 end) {
1482     UnicodeSet exp(start, end);
1483     UnicodeString pat;
1484     if (set == exp) {
1485         logln(label + " => " + set.toPattern(pat, TRUE));
1486     } else {
1487         UnicodeString xpat;
1488         errln((UnicodeString)"FAIL: " + label + " => " +
1489               set.toPattern(pat, TRUE) +
1490               ", expected " + exp.toPattern(xpat, TRUE));
1491     }
1492 }
1493 
TestInvalidCodePoint()1494 void UnicodeSetTest::TestInvalidCodePoint() {
1495 
1496     const UChar32 DATA[] = {
1497         // Test range             Expected range
1498         0, 0x10FFFF,              0, 0x10FFFF,
1499         (UChar32)-1, 8,           0, 8,
1500         8, 0x110000,              8, 0x10FFFF
1501     };
1502     const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1503 
1504     UnicodeString pat;
1505     int32_t i;
1506 
1507     for (i=0; i<DATA_LENGTH; i+=4) {
1508         UChar32 start  = DATA[i];
1509         UChar32 end    = DATA[i+1];
1510         UChar32 xstart = DATA[i+2];
1511         UChar32 xend   = DATA[i+3];
1512 
1513         // Try various API using the test code points
1514 
1515         UnicodeSet set(start, end);
1516         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1517                     set, xstart, xend);
1518 
1519         set.clear();
1520         set.set(start, end);
1521         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1522                     set, xstart, xend);
1523 
1524         UBool b = set.contains(start);
1525         b = set.contains(start, end);
1526         b = set.containsNone(start, end);
1527         b = set.containsSome(start, end);
1528         (void)b;   // Suppress set but not used warning.
1529 
1530         /*int32_t index = set.indexOf(start);*/
1531 
1532         set.clear();
1533         set.add(start);
1534         set.add(start, end);
1535         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1536                     set, xstart, xend);
1537 
1538         set.set(0, 0x10FFFF);
1539         set.retain(start, end);
1540         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1541                     set, xstart, xend);
1542         set.retain(start);
1543 
1544         set.set(0, 0x10FFFF);
1545         set.remove(start);
1546         set.remove(start, end);
1547         set.complement();
1548         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1549                     set, xstart, xend);
1550 
1551         set.set(0, 0x10FFFF);
1552         set.complement(start, end);
1553         set.complement();
1554         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1555                     set, xstart, xend);
1556         set.complement(start);
1557     }
1558 
1559     const UChar32 DATA2[] = {
1560         0,
1561         0x10FFFF,
1562         (UChar32)-1,
1563         0x110000
1564     };
1565     const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1566 
1567     for (i=0; i<DATA2_LENGTH; ++i) {
1568         UChar32 c = DATA2[i], end = 0x10FFFF;
1569         UBool valid = (c >= 0 && c <= 0x10FFFF);
1570 
1571         UnicodeSet set(0, 0x10FFFF);
1572 
1573         // For single-codepoint contains, invalid codepoints are NOT contained
1574         UBool b = set.contains(c);
1575         if (b == valid) {
1576             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1577                   ") = " + b);
1578         } else {
1579             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1580                   ") = " + b);
1581         }
1582 
1583         // For codepoint range contains, containsNone, and containsSome,
1584         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1585         b = set.contains(c, end);
1586         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1587               "," + end + ") = " + b);
1588 
1589         b = set.containsNone(c, end);
1590         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1591               "," + end + ") = " + b);
1592 
1593         b = set.containsSome(c, end);
1594         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1595               "," + end + ") = " + b);
1596 
1597         int32_t index = set.indexOf(c);
1598         if ((index >= 0) == valid) {
1599             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1600                   ") = " + index);
1601         } else {
1602             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1603                   ") = " + index);
1604         }
1605     }
1606 }
1607 
1608 // Used by TestSymbolTable
1609 class TokenSymbolTable : public SymbolTable {
1610 public:
1611     Hashtable contents;
1612 
TokenSymbolTable(UErrorCode & ec)1613     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1614         contents.setValueDeleter(uprv_deleteUObject);
1615     }
1616 
~TokenSymbolTable()1617     ~TokenSymbolTable() {}
1618 
1619     /**
1620      * (Non-SymbolTable API) Add the given variable and value to
1621      * the table.  Variable should NOT contain leading '$'.
1622      */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1623     void add(const UnicodeString& var, const UnicodeString& value,
1624              UErrorCode& ec) {
1625         if (U_SUCCESS(ec)) {
1626             contents.put(var, new UnicodeString(value), ec);
1627         }
1628     }
1629 
1630     /**
1631      * SymbolTable API
1632      */
lookup(const UnicodeString & s) const1633     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1634         return (const UnicodeString*) contents.get(s);
1635     }
1636 
1637     /**
1638      * SymbolTable API
1639      */
lookupMatcher(UChar32) const1640     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1641         return NULL;
1642     }
1643 
1644     /**
1645      * SymbolTable API
1646      */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1647     virtual UnicodeString parseReference(const UnicodeString& text,
1648                                          ParsePosition& pos, int32_t limit) const {
1649         int32_t start = pos.getIndex();
1650         int32_t i = start;
1651         UnicodeString result;
1652         while (i < limit) {
1653             UChar c = text.charAt(i);
1654             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1655                 break;
1656             }
1657             ++i;
1658         }
1659         if (i == start) { // No valid name chars
1660             return result; // Indicate failure with empty string
1661         }
1662         pos.setIndex(i);
1663         text.extractBetween(start, i, result);
1664         return result;
1665     }
1666 };
1667 
TestSymbolTable()1668 void UnicodeSetTest::TestSymbolTable() {
1669     // Multiple test cases can be set up here.  Each test case
1670     // is terminated by null:
1671     // var, value, var, value,..., input pat., exp. output pat., null
1672     const char* DATA[] = {
1673         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1674         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1675         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1676         NULL
1677     };
1678 
1679     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1680         UErrorCode ec = U_ZERO_ERROR;
1681         TokenSymbolTable sym(ec);
1682         if (U_FAILURE(ec)) {
1683             errln("FAIL: couldn't construct TokenSymbolTable");
1684             continue;
1685         }
1686 
1687         // Set up variables
1688         while (DATA[i+2] != NULL) {
1689             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1690             if (U_FAILURE(ec)) {
1691                 errln("FAIL: couldn't add to TokenSymbolTable");
1692                 continue;
1693             }
1694             i += 2;
1695         }
1696 
1697         // Input pattern and expected output pattern
1698         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1699         i += 2;
1700 
1701         ParsePosition pos(0);
1702         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1703         if (U_FAILURE(ec)) {
1704             errln("FAIL: couldn't construct UnicodeSet");
1705             continue;
1706         }
1707 
1708         // results
1709         if (pos.getIndex() != inpat.length()) {
1710             errln((UnicodeString)"Failed to read to end of string \""
1711                   + inpat + "\": read to "
1712                   + pos.getIndex() + ", length is "
1713                   + inpat.length());
1714         }
1715 
1716         UnicodeSet us2(exppat, ec);
1717         if (U_FAILURE(ec)) {
1718             errln("FAIL: couldn't construct expected UnicodeSet");
1719             continue;
1720         }
1721 
1722         UnicodeString a, b;
1723         if (us != us2) {
1724             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1725                   ", expected " + us2.toPattern(b, TRUE));
1726         } else {
1727             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1728         }
1729     }
1730 }
1731 
TestSurrogate()1732 void UnicodeSetTest::TestSurrogate() {
1733     const char* DATA[] = {
1734         // These should all behave identically
1735         "[abc\\uD800\\uDC00]",
1736         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1737         "[abc\\U00010000]",
1738         0
1739     };
1740     for (int i=0; DATA[i] != 0; ++i) {
1741         UErrorCode ec = U_ZERO_ERROR;
1742         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1743         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1744         UnicodeSet set(str, ec);
1745         if (U_FAILURE(ec)) {
1746             errln("FAIL: UnicodeSet constructor");
1747             continue;
1748         }
1749         expectContainment(set,
1750                           CharsToUnicodeString("abc\\U00010000"),
1751                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1752         if (set.size() != 4) {
1753             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1754                   set.size() + ", expected 4");
1755         }
1756 
1757         {
1758           UErrorCode subErr = U_ZERO_ERROR;
1759           checkRoundTrip(set);
1760           checkSerializeRoundTrip(set, subErr);
1761         }
1762     }
1763 }
1764 
TestExhaustive()1765 void UnicodeSetTest::TestExhaustive() {
1766     // exhaustive tests. Simulate UnicodeSets with integers.
1767     // That gives us very solid tests (except for large memory tests).
1768 
1769     int32_t limit = 128;
1770 
1771     UnicodeSet x, y, z, aa;
1772 
1773     for (int32_t i = 0; i < limit; ++i) {
1774         bitsToSet(i, x);
1775         logln((UnicodeString)"Testing " + i + ", " + x);
1776         _testComplement(i, x, y);
1777 
1778         UnicodeSet &toTest = bitsToSet(i, aa);
1779 
1780         // AS LONG AS WE ARE HERE, check roundtrip
1781         checkRoundTrip(toTest);
1782         UErrorCode ec = U_ZERO_ERROR;
1783         checkSerializeRoundTrip(toTest, ec);
1784 
1785         for (int32_t j = 0; j < limit; ++j) {
1786             _testAdd(i,j,  x,y,z);
1787             _testXor(i,j,  x,y,z);
1788             _testRetain(i,j,  x,y,z);
1789             _testRemove(i,j,  x,y,z);
1790         }
1791     }
1792 }
1793 
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1794 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1795     bitsToSet(a, x);
1796     z = x;
1797     z.complement();
1798     int32_t c = setToBits(z);
1799     if (c != (~a)) {
1800         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1801         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1802     }
1803     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1804 }
1805 
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1806 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1807     bitsToSet(a, x);
1808     bitsToSet(b, y);
1809     z = x;
1810     z.addAll(y);
1811     int32_t c = setToBits(z);
1812     if (c != (a | b)) {
1813         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1814         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1815     }
1816     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1817 }
1818 
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1819 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1820     bitsToSet(a, x);
1821     bitsToSet(b, y);
1822     z = x;
1823     z.retainAll(y);
1824     int32_t c = setToBits(z);
1825     if (c != (a & b)) {
1826         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1827         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1828     }
1829     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1830 }
1831 
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1832 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1833     bitsToSet(a, x);
1834     bitsToSet(b, y);
1835     z = x;
1836     z.removeAll(y);
1837     int32_t c = setToBits(z);
1838     if (c != (a &~ b)) {
1839         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1840         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1841     }
1842     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1843 }
1844 
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1845 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1846     bitsToSet(a, x);
1847     bitsToSet(b, y);
1848     z = x;
1849     z.complementAll(y);
1850     int32_t c = setToBits(z);
1851     if (c != (a ^ b)) {
1852         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1853         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1854     }
1855     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1856 }
1857 
1858 /**
1859  * Check that ranges are monotonically increasing and non-
1860  * overlapping.
1861  */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1862 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1863     int32_t n = set.getRangeCount();
1864     if (n < 0) {
1865         errln((UnicodeString)"FAIL result of " + msg +
1866               ": range count should be >= 0 but is " +
1867               n /*+ " for " + set.toPattern())*/);
1868         return;
1869     }
1870     UChar32 last = 0;
1871     for (int32_t i=0; i<n; ++i) {
1872         UChar32 start = set.getRangeStart(i);
1873         UChar32 end = set.getRangeEnd(i);
1874         if (start > end) {
1875             errln((UnicodeString)"FAIL result of " + msg +
1876                   ": range " + (i+1) +
1877                   " start > end: " + (int)start + ", " + (int)end +
1878                   " for " + set);
1879         }
1880         if (i > 0 && start <= last) {
1881             errln((UnicodeString)"FAIL result of " + msg +
1882                   ": range " + (i+1) +
1883                   " overlaps previous range: " + (int)start + ", " + (int)end +
1884                   " for " + set);
1885         }
1886         last = end;
1887     }
1888 }
1889 
1890 /**
1891  * Convert a bitmask to a UnicodeSet.
1892  */
bitsToSet(int32_t a,UnicodeSet & result)1893 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1894     result.clear();
1895     for (UChar32 i = 0; i < 32; ++i) {
1896         if ((a & (1<<i)) != 0) {
1897             result.add(i);
1898         }
1899     }
1900     return result;
1901 }
1902 
1903 /**
1904  * Convert a UnicodeSet to a bitmask.  Only the characters
1905  * U+0000 to U+0020 are represented in the bitmask.
1906  */
setToBits(const UnicodeSet & x)1907 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1908     int32_t result = 0;
1909     for (int32_t i = 0; i < 32; ++i) {
1910         if (x.contains((UChar32)i)) {
1911             result |= (1<<i);
1912         }
1913     }
1914     return result;
1915 }
1916 
1917 /**
1918  * Return the representation of an inversion list based UnicodeSet
1919  * as a pairs list.  Ranges are listed in ascending Unicode order.
1920  * For example, the set [a-zA-M3] is represented as "33AMaz".
1921  */
getPairs(const UnicodeSet & set)1922 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1923     UnicodeString pairs;
1924     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1925         UChar32 start = set.getRangeStart(i);
1926         UChar32 end = set.getRangeEnd(i);
1927         if (end > 0xFFFF) {
1928             end = 0xFFFF;
1929             i = set.getRangeCount(); // Should be unnecessary
1930         }
1931         pairs.append((UChar)start).append((UChar)end);
1932     }
1933     return pairs;
1934 }
1935 
1936 /**
1937  * Basic consistency check for a few items.
1938  * That the iterator works, and that we can create a pattern and
1939  * get the same thing back
1940  */
checkRoundTrip(const UnicodeSet & s)1941 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1942     {
1943         UnicodeSet t(s);
1944         checkEqual(s, t, "copy ct");
1945     }
1946 
1947     {
1948         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
1949         t = s;
1950         checkEqual(s, t, "operator=");
1951     }
1952 
1953     {
1954         UnicodeSet t;
1955         copyWithIterator(t, s, FALSE);
1956         checkEqual(s, t, "iterator roundtrip");
1957     }
1958 
1959     {
1960         UnicodeSet t;
1961         copyWithIterator(t, s, TRUE); // try range
1962         checkEqual(s, t, "iterator roundtrip");
1963     }
1964 
1965     {
1966         UnicodeSet t;
1967         UnicodeString pat;
1968         UErrorCode ec = U_ZERO_ERROR;
1969         s.toPattern(pat, FALSE);
1970         t.applyPattern(pat, ec);
1971         if (U_FAILURE(ec)) {
1972             errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1973             return;
1974         } else {
1975             checkEqual(s, t, "toPattern(false)");
1976         }
1977     }
1978 
1979     {
1980         UnicodeSet t;
1981         UnicodeString pat;
1982         UErrorCode ec = U_ZERO_ERROR;
1983         s.toPattern(pat, TRUE);
1984         t.applyPattern(pat, ec);
1985         if (U_FAILURE(ec)) {
1986             errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1987             return;
1988         } else {
1989             checkEqual(s, t, "toPattern(true)");
1990         }
1991     }
1992 }
1993 
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)1994 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1995   if(U_FAILURE(status)) return;
1996   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1997   if(status == U_BUFFER_OVERFLOW_ERROR) {
1998     status = U_ZERO_ERROR;
1999     serializeBuffer.resize(len);
2000     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2001     // let 2nd error stand
2002   }
2003   if(U_FAILURE(status)) {
2004     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
2005     return;
2006   }
2007   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
2008   if(U_FAILURE(status)) {
2009     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
2010     return;
2011   }
2012 
2013   checkEqual(t, deserialized, "Set was unequal when deserialized");
2014 }
2015 
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)2016 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
2017     t.clear();
2018     UnicodeSetIterator it(s);
2019     if (withRange) {
2020         while (it.nextRange()) {
2021             if (it.isString()) {
2022                 t.add(it.getString());
2023             } else {
2024                 t.add(it.getCodepoint(), it.getCodepointEnd());
2025             }
2026         }
2027     } else {
2028         while (it.next()) {
2029             if (it.isString()) {
2030                 t.add(it.getString());
2031             } else {
2032                 t.add(it.getCodepoint());
2033             }
2034         }
2035     }
2036 }
2037 
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2038 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2039   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2040   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2041     UnicodeString source; s.toPattern(source, TRUE);
2042     UnicodeString result; t.toPattern(result, TRUE);
2043     if (s != t) {
2044         errln((UnicodeString)"FAIL: " + message
2045               + "; source = " + source
2046               + "; result = " + result
2047               );
2048         return FALSE;
2049     } else {
2050         logln((UnicodeString)"Ok: " + message
2051               + "; source = " + source
2052               + "; result = " + result
2053               );
2054     }
2055     return TRUE;
2056 }
2057 
2058 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2059 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2060                                   const UnicodeString& charsIn,
2061                                   const UnicodeString& charsOut) {
2062     UErrorCode ec = U_ZERO_ERROR;
2063     UnicodeSet set(pat, ec);
2064     if (U_FAILURE(ec)) {
2065         dataerrln((UnicodeString)"FAIL: pattern \"" +
2066               pat + "\" => " + u_errorName(ec));
2067         return;
2068     }
2069     expectContainment(set, pat, charsIn, charsOut);
2070 }
2071 
2072 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2073 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2074                                   const UnicodeString& charsIn,
2075                                   const UnicodeString& charsOut) {
2076     UnicodeString pat;
2077     set.toPattern(pat);
2078     expectContainment(set, pat, charsIn, charsOut);
2079 }
2080 
2081 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2082 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2083                                   const UnicodeString& setName,
2084                                   const UnicodeString& charsIn,
2085                                   const UnicodeString& charsOut) {
2086     UnicodeString bad;
2087     UChar32 c;
2088     int32_t i;
2089 
2090     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2091         c = charsIn.char32At(i);
2092         if (!set.contains(c)) {
2093             bad.append(c);
2094         }
2095     }
2096     if (bad.length() > 0) {
2097         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2098               ", expected containment of " + prettify(charsIn));
2099     } else {
2100         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2101     }
2102 
2103     bad.truncate(0);
2104     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2105         c = charsOut.char32At(i);
2106         if (set.contains(c)) {
2107             bad.append(c);
2108         }
2109     }
2110     if (bad.length() > 0) {
2111         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2112               ", expected non-containment of " + prettify(charsOut));
2113     } else {
2114         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2115     }
2116 }
2117 
2118 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2119 UnicodeSetTest::expectPattern(UnicodeSet& set,
2120                               const UnicodeString& pattern,
2121                               const UnicodeString& expectedPairs){
2122     UErrorCode status = U_ZERO_ERROR;
2123     set.applyPattern(pattern, status);
2124     if (U_FAILURE(status)) {
2125         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2126               "\") failed");
2127         return;
2128     } else {
2129         if (getPairs(set) != expectedPairs ) {
2130             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2131                   "\") => pairs \"" +
2132                   escape(getPairs(set)) + "\", expected \"" +
2133                   escape(expectedPairs) + "\"");
2134         } else {
2135             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2136                   "\") => pairs \"" +
2137                   escape(getPairs(set)) + "\"");
2138         }
2139     }
2140     // the result of calling set.toPattern(), which is the string representation of
2141     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2142     // will produce another set that is equal to this one.
2143     UnicodeString temppattern;
2144     set.toPattern(temppattern);
2145     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2146     if (U_FAILURE(status)) {
2147         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2148         return;
2149     }
2150     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2151         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2152             escape(getPairs(set)) + "\""));
2153     } else{
2154         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2155     }
2156 
2157     delete tempset;
2158 
2159 }
2160 
2161 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2162 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2163     if (getPairs(set) != expectedPairs) {
2164         errln(UnicodeString("FAIL: Expected pair list \"") +
2165               escape(expectedPairs) + "\", got \"" +
2166               escape(getPairs(set)) + "\"");
2167     }
2168 }
2169 
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2170 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2171                                      const UnicodeString& expPat,
2172                                      const char** expStrings) {
2173     UnicodeString pat;
2174     set.toPattern(pat, TRUE);
2175     if (pat == expPat) {
2176         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2177     } else {
2178         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2179         return;
2180     }
2181     if (expStrings == NULL) {
2182         return;
2183     }
2184     UBool in = TRUE;
2185     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2186         if (expStrings[i] == NOT) { // sic; pointer comparison
2187             in = FALSE;
2188             continue;
2189         }
2190         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2191         UBool contained = set.contains(s);
2192         if (contained == in) {
2193             logln((UnicodeString)"Ok: " + expPat +
2194                   (contained ? " contains {" : " does not contain {") +
2195                   escape(expStrings[i]) + "}");
2196         } else {
2197             errln((UnicodeString)"FAIL: " + expPat +
2198                   (contained ? " contains {" : " does not contain {") +
2199                   escape(expStrings[i]) + "}");
2200         }
2201     }
2202 }
2203 
toHexString(int32_t i)2204 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? u'0' : (u'A' - 10))); }
2205 
2206 void
doAssert(UBool condition,const char * message)2207 UnicodeSetTest::doAssert(UBool condition, const char *message)
2208 {
2209     if (!condition) {
2210         errln(UnicodeString("ERROR : ") + message);
2211     }
2212 }
2213 
2214 UnicodeString
escape(const UnicodeString & s)2215 UnicodeSetTest::escape(const UnicodeString& s) {
2216     UnicodeString buf;
2217     for (int32_t i=0; i<s.length(); )
2218     {
2219         UChar32 c = s.char32At(i);
2220         if (0x0020 <= c && c <= 0x007F) {
2221             buf += c;
2222         } else {
2223             if (c <= 0xFFFF) {
2224                 buf += u"\\u";
2225             } else {
2226                 buf += u"\\U";
2227                 buf += toHexString((c & 0xF0000000) >> 28);
2228                 buf += toHexString((c & 0x0F000000) >> 24);
2229                 buf += toHexString((c & 0x00F00000) >> 20);
2230                 buf += toHexString((c & 0x000F0000) >> 16);
2231             }
2232             buf += toHexString((c & 0xF000) >> 12);
2233             buf += toHexString((c & 0x0F00) >> 8);
2234             buf += toHexString((c & 0x00F0) >> 4);
2235             buf += toHexString(c & 0x000F);
2236         }
2237         i += U16_LENGTH(c);
2238     }
2239     return buf;
2240 }
2241 
TestFreezable()2242 void UnicodeSetTest::TestFreezable() {
2243     UErrorCode errorCode=U_ZERO_ERROR;
2244     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2245     UnicodeSet idSet(idPattern, errorCode);
2246     if(U_FAILURE(errorCode)) {
2247         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2248         return;
2249     }
2250 
2251     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2252     UnicodeSet wsSet(wsPattern, errorCode);
2253     if(U_FAILURE(errorCode)) {
2254         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2255         return;
2256     }
2257 
2258     idSet.add(idPattern);
2259     UnicodeSet frozen(idSet);
2260     frozen.freeze();
2261 
2262     if(idSet.isFrozen() || !frozen.isFrozen()) {
2263         errln("FAIL: isFrozen() is wrong");
2264     }
2265     if(frozen!=idSet || !(frozen==idSet)) {
2266         errln("FAIL: a copy-constructed frozen set differs from its original");
2267     }
2268 
2269     frozen=wsSet;
2270     if(frozen!=idSet || !(frozen==idSet)) {
2271         errln("FAIL: a frozen set was modified by operator=");
2272     }
2273 
2274     UnicodeSet frozen2(frozen);
2275     if(frozen2!=frozen || frozen2!=idSet) {
2276         errln("FAIL: a copied frozen set differs from its frozen original");
2277     }
2278     if(!frozen2.isFrozen()) {
2279         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2280     }
2281     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2282     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2283         errln("FAIL: UnicodeSet(5, 55) failed");
2284     }
2285     frozen3=frozen;
2286     if(!frozen3.isFrozen()) {
2287         errln("FAIL: copying a frozen set results in a thawed one");
2288     }
2289 
2290     UnicodeSet *cloned=frozen.clone();
2291     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2292         errln("FAIL: clone() failed");
2293     }
2294     cloned->add(0xd802, 0xd805);
2295     if(cloned->containsSome(0xd802, 0xd805)) {
2296         errln("FAIL: unable to modify clone");
2297     }
2298     delete cloned;
2299 
2300     UnicodeSet *thawed=frozen.cloneAsThawed();
2301     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2302         errln("FAIL: cloneAsThawed() failed");
2303     }
2304     thawed->add(0xd802, 0xd805);
2305     if(!thawed->contains(0xd802, 0xd805)) {
2306         errln("FAIL: unable to modify thawed clone");
2307     }
2308     delete thawed;
2309 
2310     frozen.set(5, 55);
2311     if(frozen!=idSet || !(frozen==idSet)) {
2312         errln("FAIL: UnicodeSet::set() modified a frozen set");
2313     }
2314 
2315     frozen.clear();
2316     if(frozen!=idSet || !(frozen==idSet)) {
2317         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2318     }
2319 
2320     frozen.closeOver(USET_CASE_INSENSITIVE);
2321     if(frozen!=idSet || !(frozen==idSet)) {
2322         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2323     }
2324 
2325     frozen.compact();
2326     if(frozen!=idSet || !(frozen==idSet)) {
2327         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2328     }
2329 
2330     ParsePosition pos;
2331     frozen.
2332         applyPattern(wsPattern, errorCode).
2333         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2334         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2335         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2336         applyPropertyAlias(u"Assigned", UnicodeString(), errorCode);
2337     if(frozen!=idSet || !(frozen==idSet)) {
2338         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2339     }
2340 
2341     frozen.
2342         add(0xd800).
2343         add(0xd802, 0xd805).
2344         add(wsPattern).
2345         addAll(idPattern).
2346         addAll(wsSet);
2347     if(frozen!=idSet || !(frozen==idSet)) {
2348         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2349     }
2350 
2351     frozen.
2352         retain(0x62).
2353         retain(0x64, 0x69).
2354         retainAll(wsPattern).
2355         retainAll(wsSet);
2356     if(frozen!=idSet || !(frozen==idSet)) {
2357         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2358     }
2359 
2360     frozen.
2361         remove(0x62).
2362         remove(0x64, 0x69).
2363         remove(idPattern).
2364         removeAll(idPattern).
2365         removeAll(idSet);
2366     if(frozen!=idSet || !(frozen==idSet)) {
2367         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2368     }
2369 
2370     frozen.
2371         complement().
2372         complement(0x62).
2373         complement(0x64, 0x69).
2374         complement(idPattern).
2375         complementAll(idPattern).
2376         complementAll(idSet);
2377     if(frozen!=idSet || !(frozen==idSet)) {
2378         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2379     }
2380 }
2381 
2382 // Test span() etc. -------------------------------------------------------- ***
2383 
2384 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2385 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2386 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2387     UErrorCode errorCode=U_ZERO_ERROR;
2388     int32_t length8=0;
2389     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2390     if(U_SUCCESS(errorCode)) {
2391         return length8;
2392     } else {
2393         // The string contains an unpaired surrogate.
2394         // Ignore this string.
2395         return 0;
2396     }
2397 }
2398 
2399 class UnicodeSetWithStringsIterator;
2400 
2401 // Make the strings in a UnicodeSet easily accessible.
2402 class UnicodeSetWithStrings {
2403 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2404     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2405             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2406         int32_t size=set.size();
2407         if(size>0 && set.charAt(size-1)<0) {
2408             // If a set's last element is not a code point, then it must contain strings.
2409             // Iterate over the set, skip all code point ranges, and cache the strings.
2410             // Convert them to UTF-8 for spanUTF8().
2411             UnicodeSetIterator iter(set);
2412             const UnicodeString *s;
2413             char *s8=utf8;
2414             int32_t length8, utf8Count=0;
2415             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2416                 if(iter.isString()) {
2417                     // Store the pointer to the set's string element
2418                     // which we happen to know is a stable pointer.
2419                     strings[stringsLength]=s=&iter.getString();
2420                     utf8Count+=
2421                         utf8Lengths[stringsLength]=length8=
2422                         appendUTF8(s->getBuffer(), s->length(),
2423                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2424                     if(length8==0) {
2425                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2426                     }
2427                     s8+=length8;
2428                     ++stringsLength;
2429                 }
2430             }
2431         }
2432     }
2433 
getSet() const2434     const UnicodeSet &getSet() const {
2435         return set;
2436     }
2437 
hasStrings() const2438     UBool hasStrings() const {
2439         return (UBool)(stringsLength>0);
2440     }
2441 
hasStringsWithSurrogates() const2442     UBool hasStringsWithSurrogates() const {
2443         return hasSurrogates;
2444     }
2445 
2446 private:
2447     friend class UnicodeSetWithStringsIterator;
2448 
2449     const UnicodeSet &set;
2450 
2451     const UnicodeString *strings[20];
2452     int32_t stringsLength;
2453     UBool hasSurrogates;
2454 
2455     char utf8[1024];
2456     int32_t utf8Lengths[20];
2457 };
2458 
2459 class UnicodeSetWithStringsIterator {
2460 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2461     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2462             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2463     }
2464 
reset()2465     void reset() {
2466         nextStringIndex=nextUTF8Start=0;
2467     }
2468 
nextString()2469     const UnicodeString *nextString() {
2470         if(nextStringIndex<fSet.stringsLength) {
2471             return fSet.strings[nextStringIndex++];
2472         } else {
2473             return NULL;
2474         }
2475     }
2476 
2477     // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2478     const char *nextUTF8(int32_t &length) {
2479         if(nextStringIndex<fSet.stringsLength) {
2480             const char *s8=fSet.utf8+nextUTF8Start;
2481             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2482             return s8;
2483         } else {
2484             length=0;
2485             return NULL;
2486         }
2487     }
2488 
2489 private:
2490     const UnicodeSetWithStrings &fSet;
2491     int32_t nextStringIndex;
2492     int32_t nextUTF8Start;
2493 };
2494 
2495 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2496 // at code point boundaries.
2497 // That is, each edge of a match must not be in the middle of a surrogate pair.
2498 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2499 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2500     s+=start;
2501     limit-=start;
2502     int32_t length=t.length();
2503     return 0==t.compare(s, length) &&
2504            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2505            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2506 }
2507 
2508 // Implement span() with contains() for comparison.
containsSpanUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2509 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2510                                  USetSpanCondition spanCondition) {
2511     const UnicodeSet &realSet(set.getSet());
2512     if(!set.hasStrings()) {
2513         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2514             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2515         }
2516 
2517         UChar32 c;
2518         int32_t start=0, prev;
2519         while((prev=start)<length) {
2520             U16_NEXT(s, start, length, c);
2521             if(realSet.contains(c)!=spanCondition) {
2522                 break;
2523             }
2524         }
2525         return prev;
2526     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2527         UnicodeSetWithStringsIterator iter(set);
2528         UChar32 c;
2529         int32_t start, next;
2530         for(start=next=0; start<length;) {
2531             U16_NEXT(s, next, length, c);
2532             if(realSet.contains(c)) {
2533                 break;
2534             }
2535             const UnicodeString *str;
2536             iter.reset();
2537             while((str=iter.nextString())!=NULL) {
2538                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2539                     // spanNeedsStrings=TRUE;
2540                     return start;
2541                 }
2542             }
2543             start=next;
2544         }
2545         return start;
2546     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2547         UnicodeSetWithStringsIterator iter(set);
2548         UChar32 c;
2549         int32_t start, next, maxSpanLimit=0;
2550         for(start=next=0; start<length;) {
2551             U16_NEXT(s, next, length, c);
2552             if(!realSet.contains(c)) {
2553                 next=start;  // Do not span this single, not-contained code point.
2554             }
2555             const UnicodeString *str;
2556             iter.reset();
2557             while((str=iter.nextString())!=NULL) {
2558                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2559                     // spanNeedsStrings=TRUE;
2560                     int32_t matchLimit=start+str->length();
2561                     if(matchLimit==length) {
2562                         return length;
2563                     }
2564                     if(spanCondition==USET_SPAN_CONTAINED) {
2565                         // Iterate for the shortest match at each position.
2566                         // Recurse for each but the shortest match.
2567                         if(next==start) {
2568                             next=matchLimit;  // First match from start.
2569                         } else {
2570                             if(matchLimit<next) {
2571                                 // Remember shortest match from start for iteration.
2572                                 int32_t temp=next;
2573                                 next=matchLimit;
2574                                 matchLimit=temp;
2575                             }
2576                             // Recurse for non-shortest match from start.
2577                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2578                                                                  USET_SPAN_CONTAINED);
2579                             if((matchLimit+spanLength)>maxSpanLimit) {
2580                                 maxSpanLimit=matchLimit+spanLength;
2581                                 if(maxSpanLimit==length) {
2582                                     return length;
2583                                 }
2584                             }
2585                         }
2586                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2587                         if(matchLimit>next) {
2588                             // Remember longest match from start.
2589                             next=matchLimit;
2590                         }
2591                     }
2592                 }
2593             }
2594             if(next==start) {
2595                 break;  // No match from start.
2596             }
2597             start=next;
2598         }
2599         if(start>maxSpanLimit) {
2600             return start;
2601         } else {
2602             return maxSpanLimit;
2603         }
2604     }
2605 }
2606 
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2607 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2608                                      USetSpanCondition spanCondition) {
2609     if(length==0) {
2610         return 0;
2611     }
2612     const UnicodeSet &realSet(set.getSet());
2613     if(!set.hasStrings()) {
2614         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2615             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2616         }
2617 
2618         UChar32 c;
2619         int32_t prev=length;
2620         do {
2621             U16_PREV(s, 0, length, c);
2622             if(realSet.contains(c)!=spanCondition) {
2623                 break;
2624             }
2625         } while((prev=length)>0);
2626         return prev;
2627     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2628         UnicodeSetWithStringsIterator iter(set);
2629         UChar32 c;
2630         int32_t prev=length, length0=length;
2631         do {
2632             U16_PREV(s, 0, length, c);
2633             if(realSet.contains(c)) {
2634                 break;
2635             }
2636             const UnicodeString *str;
2637             iter.reset();
2638             while((str=iter.nextString())!=NULL) {
2639                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2640                     // spanNeedsStrings=TRUE;
2641                     return prev;
2642                 }
2643             }
2644         } while((prev=length)>0);
2645         return prev;
2646     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2647         UnicodeSetWithStringsIterator iter(set);
2648         UChar32 c;
2649         int32_t prev=length, minSpanStart=length, length0=length;
2650         do {
2651             U16_PREV(s, 0, length, c);
2652             if(!realSet.contains(c)) {
2653                 length=prev;  // Do not span this single, not-contained code point.
2654             }
2655             const UnicodeString *str;
2656             iter.reset();
2657             while((str=iter.nextString())!=NULL) {
2658                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2659                     // spanNeedsStrings=TRUE;
2660                     int32_t matchStart=prev-str->length();
2661                     if(matchStart==0) {
2662                         return 0;
2663                     }
2664                     if(spanCondition==USET_SPAN_CONTAINED) {
2665                         // Iterate for the shortest match at each position.
2666                         // Recurse for each but the shortest match.
2667                         if(length==prev) {
2668                             length=matchStart;  // First match from prev.
2669                         } else {
2670                             if(matchStart>length) {
2671                                 // Remember shortest match from prev for iteration.
2672                                 int32_t temp=length;
2673                                 length=matchStart;
2674                                 matchStart=temp;
2675                             }
2676                             // Recurse for non-shortest match from prev.
2677                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2678                                                                     USET_SPAN_CONTAINED);
2679                             if(spanStart<minSpanStart) {
2680                                 minSpanStart=spanStart;
2681                                 if(minSpanStart==0) {
2682                                     return 0;
2683                                 }
2684                             }
2685                         }
2686                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2687                         if(matchStart<length) {
2688                             // Remember longest match from prev.
2689                             length=matchStart;
2690                         }
2691                     }
2692                 }
2693             }
2694             if(length==prev) {
2695                 break;  // No match from prev.
2696             }
2697         } while((prev=length)>0);
2698         if(prev<minSpanStart) {
2699             return prev;
2700         } else {
2701             return minSpanStart;
2702         }
2703     }
2704 }
2705 
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2706 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2707                                 USetSpanCondition spanCondition) {
2708     const UnicodeSet &realSet(set.getSet());
2709     if(!set.hasStrings()) {
2710         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2711             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2712         }
2713 
2714         UChar32 c;
2715         int32_t start=0, prev;
2716         while((prev=start)<length) {
2717             U8_NEXT_OR_FFFD(s, start, length, c);
2718             if(realSet.contains(c)!=spanCondition) {
2719                 break;
2720             }
2721         }
2722         return prev;
2723     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2724         UnicodeSetWithStringsIterator iter(set);
2725         UChar32 c;
2726         int32_t start, next;
2727         for(start=next=0; start<length;) {
2728             U8_NEXT_OR_FFFD(s, next, length, c);
2729             if(realSet.contains(c)) {
2730                 break;
2731             }
2732             const char *s8;
2733             int32_t length8;
2734             iter.reset();
2735             while((s8=iter.nextUTF8(length8))!=NULL) {
2736                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2737                     // spanNeedsStrings=TRUE;
2738                     return start;
2739                 }
2740             }
2741             start=next;
2742         }
2743         return start;
2744     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2745         UnicodeSetWithStringsIterator iter(set);
2746         UChar32 c;
2747         int32_t start, next, maxSpanLimit=0;
2748         for(start=next=0; start<length;) {
2749             U8_NEXT_OR_FFFD(s, next, length, c);
2750             if(!realSet.contains(c)) {
2751                 next=start;  // Do not span this single, not-contained code point.
2752             }
2753             const char *s8;
2754             int32_t length8;
2755             iter.reset();
2756             while((s8=iter.nextUTF8(length8))!=NULL) {
2757                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2758                     // spanNeedsStrings=TRUE;
2759                     int32_t matchLimit=start+length8;
2760                     if(matchLimit==length) {
2761                         return length;
2762                     }
2763                     if(spanCondition==USET_SPAN_CONTAINED) {
2764                         // Iterate for the shortest match at each position.
2765                         // Recurse for each but the shortest match.
2766                         if(next==start) {
2767                             next=matchLimit;  // First match from start.
2768                         } else {
2769                             if(matchLimit<next) {
2770                                 // Remember shortest match from start for iteration.
2771                                 int32_t temp=next;
2772                                 next=matchLimit;
2773                                 matchLimit=temp;
2774                             }
2775                             // Recurse for non-shortest match from start.
2776                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2777                                                                 USET_SPAN_CONTAINED);
2778                             if((matchLimit+spanLength)>maxSpanLimit) {
2779                                 maxSpanLimit=matchLimit+spanLength;
2780                                 if(maxSpanLimit==length) {
2781                                     return length;
2782                                 }
2783                             }
2784                         }
2785                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2786                         if(matchLimit>next) {
2787                             // Remember longest match from start.
2788                             next=matchLimit;
2789                         }
2790                     }
2791                 }
2792             }
2793             if(next==start) {
2794                 break;  // No match from start.
2795             }
2796             start=next;
2797         }
2798         if(start>maxSpanLimit) {
2799             return start;
2800         } else {
2801             return maxSpanLimit;
2802         }
2803     }
2804 }
2805 
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2806 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2807                                     USetSpanCondition spanCondition) {
2808     if(length==0) {
2809         return 0;
2810     }
2811     const UnicodeSet &realSet(set.getSet());
2812     if(!set.hasStrings()) {
2813         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2814             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2815         }
2816 
2817         UChar32 c;
2818         int32_t prev=length;
2819         do {
2820             U8_PREV_OR_FFFD(s, 0, length, c);
2821             if(realSet.contains(c)!=spanCondition) {
2822                 break;
2823             }
2824         } while((prev=length)>0);
2825         return prev;
2826     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2827         UnicodeSetWithStringsIterator iter(set);
2828         UChar32 c;
2829         int32_t prev=length;
2830         do {
2831             U8_PREV_OR_FFFD(s, 0, length, c);
2832             if(realSet.contains(c)) {
2833                 break;
2834             }
2835             const char *s8;
2836             int32_t length8;
2837             iter.reset();
2838             while((s8=iter.nextUTF8(length8))!=NULL) {
2839                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2840                     // spanNeedsStrings=TRUE;
2841                     return prev;
2842                 }
2843             }
2844         } while((prev=length)>0);
2845         return prev;
2846     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2847         UnicodeSetWithStringsIterator iter(set);
2848         UChar32 c;
2849         int32_t prev=length, minSpanStart=length;
2850         do {
2851             U8_PREV_OR_FFFD(s, 0, length, c);
2852             if(!realSet.contains(c)) {
2853                 length=prev;  // Do not span this single, not-contained code point.
2854             }
2855             const char *s8;
2856             int32_t length8;
2857             iter.reset();
2858             while((s8=iter.nextUTF8(length8))!=NULL) {
2859                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2860                     // spanNeedsStrings=TRUE;
2861                     int32_t matchStart=prev-length8;
2862                     if(matchStart==0) {
2863                         return 0;
2864                     }
2865                     if(spanCondition==USET_SPAN_CONTAINED) {
2866                         // Iterate for the shortest match at each position.
2867                         // Recurse for each but the shortest match.
2868                         if(length==prev) {
2869                             length=matchStart;  // First match from prev.
2870                         } else {
2871                             if(matchStart>length) {
2872                                 // Remember shortest match from prev for iteration.
2873                                 int32_t temp=length;
2874                                 length=matchStart;
2875                                 matchStart=temp;
2876                             }
2877                             // Recurse for non-shortest match from prev.
2878                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2879                                                                    USET_SPAN_CONTAINED);
2880                             if(spanStart<minSpanStart) {
2881                                 minSpanStart=spanStart;
2882                                 if(minSpanStart==0) {
2883                                     return 0;
2884                                 }
2885                             }
2886                         }
2887                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2888                         if(matchStart<length) {
2889                             // Remember longest match from prev.
2890                             length=matchStart;
2891                         }
2892                     }
2893                 }
2894             }
2895             if(length==prev) {
2896                 break;  // No match from prev.
2897             }
2898         } while((prev=length)>0);
2899         if(prev<minSpanStart) {
2900             return prev;
2901         } else {
2902             return minSpanStart;
2903         }
2904     }
2905 }
2906 
// Bit flags that select which spans are performed and compared.
// Each group is followed by a mask that combines all flags of the group.
enum {
    // Text encodings (presumably which UTF forms of the text get tested).
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Set polarity: the original set and/or its complement.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Directions: forward span() and/or backward spanBack().
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Span conditions: USET_SPAN_CONTAINED and/or USET_SPAN_SIMPLE.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Everything.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2927 
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2928 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2929     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2930 }
2931 
slen(const void * s,UBool isUTF16)2932 static inline int32_t slen(const void *s, UBool isUTF16) {
2933     return isUTF16 ? u_strlen((const UChar *)s) : static_cast<int32_t>(strlen((const char *)s));
2934 }
2935 
2936 /*
2937  * Count spans on a string with the method according to type and set the span limits.
2938  * The set may be the complement of the original.
2939  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2940  * according to the expected number of spans.
2941  * Sets typeName to an empty string if there is no such type.
2942  * Returns -1 if the span option is filtered out.
2943  */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2944 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2945                         const void *s, int32_t length, UBool isUTF16,
2946                         uint32_t whichSpans,
2947                         int type, const char *&typeName,
2948                         int32_t limits[], int32_t limitsCapacity,
2949                         int32_t expectCount) {
2950     const UnicodeSet &realSet(set.getSet());
2951     int32_t start, count;
2952     USetSpanCondition spanCondition, firstSpanCondition, contained;
2953     UBool isForward;
2954 
2955     if(type<0 || 7<type) {
2956         typeName="";
2957         return 0;
2958     }
2959 
2960     static const char *const typeNames16[]={
2961         "contains", "contains(LM)",
2962         "span", "span(LM)",
2963         "containsBack", "containsBack(LM)",
2964         "spanBack", "spanBack(LM)"
2965     };
2966 
2967     static const char *const typeNames8[]={
2968         "containsUTF8", "containsUTF8(LM)",
2969         "spanUTF8", "spanUTF8(LM)",
2970         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2971         "spanBackUTF8", "spanBackUTF8(LM)"
2972     };
2973 
2974     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2975 
2976     // filter span options
2977     if(type<=3) {
2978         // span forward
2979         if((whichSpans&SPAN_FWD)==0) {
2980             return -1;
2981         }
2982         isForward=TRUE;
2983     } else {
2984         // span backward
2985         if((whichSpans&SPAN_BACK)==0) {
2986             return -1;
2987         }
2988         isForward=FALSE;
2989     }
2990     if((type&1)==0) {
2991         // use USET_SPAN_CONTAINED
2992         if((whichSpans&SPAN_CONTAINED)==0) {
2993             return -1;
2994         }
2995         contained=USET_SPAN_CONTAINED;
2996     } else {
2997         // use USET_SPAN_SIMPLE
2998         if((whichSpans&SPAN_SIMPLE)==0) {
2999             return -1;
3000         }
3001         contained=USET_SPAN_SIMPLE;
3002     }
3003 
3004     // Default first span condition for going forward with an uncomplemented set.
3005     spanCondition=USET_SPAN_NOT_CONTAINED;
3006     if(isComplement) {
3007         spanCondition=invertSpanCondition(spanCondition, contained);
3008     }
3009 
3010     // First span condition for span(), used to terminate the spanBack() iteration.
3011     firstSpanCondition=spanCondition;
3012 
3013     // spanBack(): Its initial span condition is span()'s last span condition,
3014     // which is the opposite of span()'s first span condition
3015     // if we expect an even number of spans.
3016     // (The loop inverts spanCondition (expectCount-1) times
3017     // before the expectCount'th span() call.)
3018     // If we do not compare forward and backward directions, then we do not have an
3019     // expectCount and just start with firstSpanCondition.
3020     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
3021         spanCondition=invertSpanCondition(spanCondition, contained);
3022     }
3023 
3024     count=0;
3025     switch(type) {
3026     case 0:
3027     case 1:
3028         start=0;
3029         if(length<0) {
3030             length=slen(s, isUTF16);
3031         }
3032         for(;;) {
3033             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
3034                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
3035             if(count<limitsCapacity) {
3036                 limits[count]=start;
3037             }
3038             ++count;
3039             if(start>=length) {
3040                 break;
3041             }
3042             spanCondition=invertSpanCondition(spanCondition, contained);
3043         }
3044         break;
3045     case 2:
3046     case 3:
3047         start=0;
3048         for(;;) {
3049             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3050                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3051             if(count<limitsCapacity) {
3052                 limits[count]=start;
3053             }
3054             ++count;
3055             if(length>=0 ? start>=length :
3056                            isUTF16 ? ((const UChar *)s)[start]==0 :
3057                                      ((const char *)s)[start]==0
3058             ) {
3059                 break;
3060             }
3061             spanCondition=invertSpanCondition(spanCondition, contained);
3062         }
3063         break;
3064     case 4:
3065     case 5:
3066         if(length<0) {
3067             length=slen(s, isUTF16);
3068         }
3069         for(;;) {
3070             ++count;
3071             if(count<=limitsCapacity) {
3072                 limits[limitsCapacity-count]=length;
3073             }
3074             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3075                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3076             if(length==0 && spanCondition==firstSpanCondition) {
3077                 break;
3078             }
3079             spanCondition=invertSpanCondition(spanCondition, contained);
3080         }
3081         if(count<limitsCapacity) {
3082             memmove(limits, limits+(limitsCapacity-count), count*4);
3083         }
3084         break;
3085     case 6:
3086     case 7:
3087         for(;;) {
3088             ++count;
3089             if(count<=limitsCapacity) {
3090                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3091             }
3092             // Note: Length<0 is tested only for the first spanBack().
3093             // If we wanted to keep length<0 for all spanBack()s, we would have to
3094             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3095             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3096                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3097             if(length==0 && spanCondition==firstSpanCondition) {
3098                 break;
3099             }
3100             spanCondition=invertSpanCondition(spanCondition, contained);
3101         }
3102         if(count<limitsCapacity) {
3103             memmove(limits, limits+(limitsCapacity-count), count*4);
3104         }
3105         break;
3106     default:
3107         typeName="";
3108         return -1;
3109     }
3110 
3111     return count;
3112 }
3113 
// Indexes of the sets to be tested.
// An odd index (lowest bit set) means that the set is a complement.
enum {
    SLOW=0,
    SLOW_NOT,
    FAST,
    FAST_NOT,
    SET_COUNT  // Number of sets; not a set index itself.
};

// Names for error messages, indexed by the set indexes above.
static const char *const setNames[SET_COUNT]={
    "slow", "slow.not",
    "fast", "fast.not"
};
3129 
3130 /*
3131  * Verify that we get the same results whether we look at text with contains(),
3132  * span() or spanBack(), using unfrozen or frozen versions of the set,
3133  * and using the set or its complement (switching the spanConditions accordingly).
3134  * The latter verifies that
3135  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3136  *
3137  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3138  * or returned to the caller (with an input expectCount<0).
3139  */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3140 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3141                               const void *s, int32_t length, UBool isUTF16,
3142                               uint32_t whichSpans,
3143                               int32_t expectLimits[], int32_t &expectCount,
3144                               const char *testName, int32_t index) {
3145     int32_t limits[500];
3146     int32_t limitsCount;
3147     int i, j;
3148 
3149     const char *typeName;
3150     int type;
3151 
3152     for(i=0; i<SET_COUNT; ++i) {
3153         if((i&1)==0) {
3154             // Even-numbered sets are original, uncomplemented sets.
3155             if((whichSpans&SPAN_SET)==0) {
3156                 continue;
3157             }
3158         } else {
3159             // Odd-numbered sets are complemented.
3160             if((whichSpans&SPAN_COMPLEMENT)==0) {
3161                 continue;
3162             }
3163         }
3164         for(type=0;; ++type) {
3165             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3166                                  s, length, isUTF16,
3167                                  whichSpans,
3168                                  type, typeName,
3169                                  limits, UPRV_LENGTHOF(limits), expectCount);
3170             if(typeName[0]==0) {
3171                 break; // All types tried.
3172             }
3173             if(limitsCount<0) {
3174                 continue; // Span option filtered out.
3175             }
3176             if(expectCount<0) {
3177                 expectCount=limitsCount;
3178                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3179                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3180                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3181                     return;
3182                 }
3183                 memcpy(expectLimits, limits, limitsCount*4);
3184             } else if(limitsCount!=expectCount) {
3185                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3186                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3187             } else {
3188                 for(j=0; j<limitsCount; ++j) {
3189                     if(limits[j]!=expectLimits[j]) {
3190                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3191                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3192                               j, (long)limits[j], (long)expectLimits[j]);
3193                         break;
3194                     }
3195                 }
3196             }
3197         }
3198     }
3199 
3200     // Compare span() with containsAll()/containsNone(),
3201     // but only if we have expectLimits[] from the uncomplemented set.
3202     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3203         const UChar *s16=(const UChar *)s;
3204         UnicodeString string;
3205         int32_t prev=0, limit, length;
3206         for(i=0; i<expectCount; ++i) {
3207             limit=expectLimits[i];
3208             length=limit-prev;
3209             if(length>0) {
3210                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3211                 if(i&1) {
3212                     if(!sets[SLOW]->getSet().containsAll(string)) {
3213                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3214                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3215                         return;
3216                     }
3217                     if(!sets[FAST]->getSet().containsAll(string)) {
3218                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3219                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3220                         return;
3221                     }
3222                 } else {
3223                     if(!sets[SLOW]->getSet().containsNone(string)) {
3224                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3225                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3226                         return;
3227                     }
3228                     if(!sets[FAST]->getSet().containsNone(string)) {
3229                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3230                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3231                         return;
3232                     }
3233                 }
3234             }
3235             prev=limit;
3236         }
3237     }
3238 }
3239 
3240 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3241 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3242                               const void *s, int32_t length, UBool isUTF16,
3243                               uint32_t whichSpans,
3244                               const char *testName, int32_t index) {
3245     int32_t expectLimits[500];
3246     int32_t expectCount=-1;
3247     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3248 }
3249 
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3250 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3251     UChar c, c2;
3252 
3253     if(length>=0) {
3254         while(length>0) {
3255             c=*s++;
3256             --length;
3257             if(0xd800<=c && c<0xe000) {
3258                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3259                     return TRUE;
3260                 }
3261                 --length;
3262             }
3263         }
3264     } else {
3265         while((c=*s++)!=0) {
3266             if(0xd800<=c && c<0xe000) {
3267                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3268                     return TRUE;
3269                 }
3270             }
3271         }
3272     }
3273     return FALSE;
3274 }
3275 
3276 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3277 // unless either UTF is turned off in whichSpans.
3278 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3279 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3280 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3281                                       const UChar *s16, int32_t length16,
3282                                       uint32_t whichSpans,
3283                                       const char *testName, int32_t index) {
3284     int32_t expectLimits[500];
3285     int32_t expectCount;
3286 
3287     expectCount=-1;  // Get expectLimits[] from testSpan().
3288 
3289     if((whichSpans&SPAN_UTF16)!=0) {
3290         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3291     }
3292     if((whichSpans&SPAN_UTF8)==0) {
3293         return;
3294     }
3295 
3296     // Convert s16[] and expectLimits[] to UTF-8.
3297     uint8_t s8[3000];
3298     int32_t offsets[3000];
3299 
3300     const UChar *s16Limit=s16+length16;
3301     char *t=(char *)s8;
3302     char *tLimit=t+sizeof(s8);
3303     int32_t *o=offsets;
3304     UErrorCode errorCode=U_ZERO_ERROR;
3305 
3306     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3307     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3308     if(U_FAILURE(errorCode)) {
3309         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3310               testName, (long)index, u_errorName(errorCode));
3311         ucnv_resetFromUnicode(utf8Cnv);
3312         return;
3313     }
3314     int32_t length8=(int32_t)(t-(char *)s8);
3315 
3316     // Convert expectLimits[].
3317     int32_t i, j, expect;
3318     for(i=j=0; i<expectCount; ++i) {
3319         expect=expectLimits[i];
3320         if(expect==length16) {
3321             expectLimits[i]=length8;
3322         } else {
3323             while(offsets[j]<expect) {
3324                 ++j;
3325             }
3326             expectLimits[i]=j;
3327         }
3328     }
3329 
3330     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3331 }
3332 
nextCodePoint(UChar32 c)3333 static UChar32 nextCodePoint(UChar32 c) {
3334     // Skip some large and boring ranges.
3335     switch(c) {
3336     case 0x3441:
3337         return 0x4d7f;
3338     case 0x5100:
3339         return 0x9f00;
3340     case 0xb040:
3341         return 0xd780;
3342     case 0xe041:
3343         return 0xf8fe;
3344     case 0x10100:
3345         return 0x20000;
3346     case 0x20041:
3347         return 0xe0000;
3348     case 0xe0101:
3349         return 0x10fffd;
3350     default:
3351         return c+1;
3352     }
3353 }
3354 
3355 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3356 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3357     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3358     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3359     // Skip the UTF-8 part of the test - if the string contains surrogates -
3360     // because it is likely to produce a different result.
3361     UBool inconsistentSurrogates=
3362             (!(sets[0]->getSet().contains(0xfffd) ?
3363                sets[0]->getSet().contains(0xd800, 0xdfff) :
3364                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3365              sets[0]->hasStringsWithSurrogates());
3366 
3367     UChar s[1000];
3368     int32_t length=0;
3369     uint32_t localWhichSpans;
3370 
3371     UChar32 c, first;
3372     for(first=c=0;; c=nextCodePoint(c)) {
3373         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3374             localWhichSpans=whichSpans;
3375             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3376                 localWhichSpans&=~SPAN_UTF8;
3377             }
3378             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3379             if(c>0x10ffff) {
3380                 break;
3381             }
3382             length=0;
3383             first=c;
3384         }
3385         U16_APPEND_UNSAFE(s, length, c);
3386     }
3387 }
3388 
3389 // Test with a particular, interesting string.
3390 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3391 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3392     static const UChar s[]={
3393         0x61, 0x62, 0x20,                       // Latin, space
3394         0x3b1, 0x3b2, 0x3b3,                    // Greek
3395         0xd900,                                 // lead surrogate
3396         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3397         0xdc05,                                 // trail surrogate
3398         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3399         0xd900, 0xdc05,                         // unassigned supplementary
3400         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3401         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3402         0                                       // NUL
3403     };
3404 
3405     if((whichSpans&SPAN_UTF16)==0) {
3406         return;
3407     }
3408     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3409     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3410 }
3411 
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3412 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3413     static const char s[]={
3414         "abc"                                   // Latin
3415 
3416         /* trail byte in lead position */
3417         "\x80"
3418 
3419         " "                                     // space
3420 
3421         /* truncated multi-byte sequences */
3422         "\xd0"
3423         "\xe0"
3424         "\xe1"
3425         "\xed"
3426         "\xee"
3427         "\xf0"
3428         "\xf1"
3429         "\xf4"
3430         "\xf8"
3431         "\xfc"
3432 
3433         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3434 
3435         /* trail byte in lead position */
3436         "\x80"
3437 
3438         "\xe0\x80"
3439         "\xe0\xa0"
3440         "\xe1\x80"
3441         "\xed\x80"
3442         "\xed\xa0"
3443         "\xee\x80"
3444         "\xf0\x80"
3445         "\xf0\x90"
3446         "\xf1\x80"
3447         "\xf4\x80"
3448         "\xf4\x90"
3449         "\xf8\x80"
3450         "\xfc\x80"
3451 
3452         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3453 
3454         /* trail byte in lead position */
3455         "\x80"
3456 
3457         "\xf0\x80\x80"
3458         "\xf0\x90\x80"
3459         "\xf1\x80\x80"
3460         "\xf4\x80\x80"
3461         "\xf4\x90\x80"
3462         "\xf8\x80\x80"
3463         "\xfc\x80\x80"
3464 
3465         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3466 
3467         /* trail byte in lead position */
3468         "\x80"
3469 
3470         "\xf8\x80\x80\x80"
3471         "\xfc\x80\x80\x80"
3472 
3473         "\xF1\x90\x80\x85"                      // unassigned supplementary
3474 
3475         /* trail byte in lead position */
3476         "\x80"
3477 
3478         "\xfc\x80\x80\x80\x80"
3479 
3480         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3481 
3482         /* trail byte in lead position */
3483         "\x80"
3484 
3485         /* complete sequences but non-shortest forms or out of range etc. */
3486         "\xc0\x80"
3487         "\xe0\x80\x80"
3488         "\xed\xa0\x80"
3489         "\xf0\x80\x80\x80"
3490         "\xf4\x90\x80\x80"
3491         "\xf8\x80\x80\x80\x80"
3492         "\xfc\x80\x80\x80\x80\x80"
3493         "\xfe"
3494         "\xff"
3495 
3496         /* trail byte in lead position */
3497         "\x80"
3498 
3499         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3500     };
3501 
3502     if((whichSpans&SPAN_UTF8)==0) {
3503         return;
3504     }
3505     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3506     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3507 }
3508 
// Expands a list of span option words in place so that each resulting word
// carries exactly one of the alternatives a, b and c.
// Every input word is first reduced with mask; the first whichSpansCount
// entries then get a, the next whichSpansCount entries get b (if b!=0),
// and the following whichSpansCount entries get c (if b!=0 and c!=0).
// The caller must provide room for up to 3*whichSpansCount entries.
// Returns the new number of entries: one, two or three times whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many variants of each input word are produced.
    int32_t variants=1;
    if(b!=0) {
        variants= (c!=0) ? 3 : 2;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(variants>=2) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(variants==3) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return variants*whichSpansCount;
}
3531 
// Runs of 63 or 64 copies of 'a' or 'b'.
// They are assembled from counted 8- and 7-character pieces through
// adjacent string literal concatenation (16+16+16+15 and 16+16+16+16).
#define _15_a "aaaaaaaa" "aaaaaaa"
#define _16_a "aaaaaaaa" "aaaaaaaa"
#define _15_b "bbbbbbbb" "bbbbbbb"
#define _16_b "bbbbbbbb" "bbbbbbbb"
#define _63_a _16_a _16_a _16_a _15_a
#define _64_a _16_a _16_a _16_a _16_a
#define _63_b _16_b _16_b _16_b _15_b
#define _64_b _16_b _16_b _16_b _16_b
3536 
TestSpan()3537 void UnicodeSetTest::TestSpan() {
3538     // "[...]" is a UnicodeSet pattern.
3539     // "*" performs tests on all Unicode code points and on a selection of
3540     //   malformed UTF-8/16 strings.
3541     // "-options" limits the scope of testing for the current set.
3542     //   By default, the test verifies that equivalent boundaries are found
3543     //   for UTF-16 and UTF-8, going forward and backward,
3544     //   alternating USET_SPAN_NOT_CONTAINED with
3545     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3546     //   Single-character options:
3547     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3548     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3549     //          or the set contains strings with unpaired surrogates
3550     //          which do not translate to valid UTF-8.
3551     //     c -- set.span() and set.complement().span() boundaries may differ.
3552     //          Cause: Set strings are not complemented.
3553     //     b -- span() and spanBack() boundaries may differ.
3554     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3555     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3556     //          match with non-overlapping substrings.
3557     //          For example, with a set containing "ab" and "ba",
3558     //          span() of "aba" yields boundaries { 0, 2, 3 }
3559     //          because the initial "ab" matches from 0 to 2,
3560     //          while spanBack() yields boundaries { 0, 1, 3 }
3561     //          because the final "ba" matches from 1 to 3.
3562     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3563     //          Cause: Strings in the set overlap, and a longer match may
3564     //          require a sequence including non-longest substrings.
3565     //          For example, with a set containing "ab", "abc" and "cd",
3566     //          span(contained) of "abcd" spans the entire string
3567     //          but span(longest match) only spans the first 3 characters.
3568     //   Each "-options" first resets all options and then applies the specified options.
3569     //   A "-" without options resets the options.
3570     //   The options are also reset for each new set.
3571     // Other strings will be spanned.
3572     static const char *const testdata[]={
3573         "[:ID_Continue:]",
3574         "*",
3575         "[:White_Space:]",
3576         "*",
3577         "[]",
3578         "*",
3579         "[\\u0000-\\U0010FFFF]",
3580         "*",
3581         "[\\u0000\\u0080\\u0800\\U00010000]",
3582         "*",
3583         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3584         "*",
3585         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3586         "-c",
3587         "*",
3588         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3589         "-c",
3590         "*",
3591 
3592         // Overlapping strings cause overlapping attempts to match.
3593         "[x{xy}{xya}{axy}{ax}]",
3594         "-cl",
3595 
3596         // More repetitions of "xya" would take too long with the recursive
3597         // reference implementation.
3598         // containsAll()=FALSE
3599         // test_string 0x14
3600         "xx"
3601         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3602         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3603         "xyaxyaxyaxya"
3604         "xx"
3605         "xyaxyaxyaxya"  // span() ends here.
3606         "aaa",
3607 
3608         // containsAll()=TRUE
3609         // test_string 0x15
3610         "xx"
3611         "xyaxyaxyaxya"
3612         "xx"
3613         "xyaxyaxyaxya"
3614         "xx"
3615         "xyaxyaxyaxy",
3616 
3617         "-bc",
3618         // test_string 0x17
3619         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3620         "-c",
3621         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3622         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3623         "-",
3624         "byaya",     // span() -> { 5 }
3625         "byay",      // span() -> { 4 }
3626         "bya",       // span() -> { 3 }
3627 
3628         // span(longest match) will not span the whole string.
3629         "[a{ab}{bc}]",
3630         "-cl",
3631         // test_string 0x21
3632         "abc",
3633 
3634         "[a{ab}{abc}{cd}]",
3635         "-cl",
3636         "acdabcdabccd",
3637 
3638         // spanBack(longest match) will not span the whole string.
3639         "[c{ab}{bc}]",
3640         "-cl",
3641         "abc",
3642 
3643         "[d{cd}{bcd}{ab}]",
3644         "-cl",
3645         "abbcdabcdabd",
3646 
3647         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3648         // and UTF-8 trail bytes.
3649         // Copies of above test sets and strings, but transliterated to have
3650         // different code points with similar trail units.
3651         // Previous: a      b         c            d
3652         // Unicode:  042B   30AB      200AB        204AB
3653         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3654         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3655         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3656         "-cl",
3657         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3658 
3659         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3660         "-cl",
3661         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3662 
3663         // Stress bookkeeping and recursion.
3664         // The following strings are barely doable with the recursive
3665         // reference implementation.
3666         // The not-contained character at the end prevents an early exit from the span().
3667         "[b{bb}]",
3668         "-c",
3669         // test_string 0x33
3670         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3671         // On complement sets, span() and spanBack() get different results
3672         // because b is not in the complement set and there is an odd number of b's
3673         // in the test string.
3674         "-bc",
3675         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3676 
3677         // Test with set strings with an initial or final code point span
3678         // longer than 254.
3679         "[a{" _64_a _64_a _64_a _64_a "b}"
3680           "{a" _64_b _64_b _64_b _64_b "}]",
3681         "-c",
3682         _64_a _64_a _64_a _63_a "b",
3683         _64_a _64_a _64_a _64_a "b",
3684         _64_a _64_a _64_a _64_a "aaaabbbb",
3685         "a" _64_b _64_b _64_b _63_b,
3686         "a" _64_b _64_b _64_b _64_b,
3687         "aaaabbbb" _64_b _64_b _64_b _64_b,
3688 
3689         // Test with strings containing unpaired surrogates.
3690         // They are not representable in UTF-8, and a leading trail surrogate
3691         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3692         // U+20001 == \\uD840\\uDC01
3693         // U+20400 == \\uD841\\uDC00
3694         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3695         "-8cl",
3696         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3697     };
3698     uint32_t whichSpans[96]={ SPAN_ALL };
3699     int32_t whichSpansCount=1;
3700 
3701     UnicodeSet *sets[SET_COUNT]={ NULL };
3702     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3703 
3704     char testName[1024];
3705     char *testNameLimit=testName;
3706 
3707     int32_t i, j;
3708     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3709         const char *s=testdata[i];
3710         if(s[0]=='[') {
3711             // Create new test sets from this pattern.
3712             for(j=0; j<SET_COUNT; ++j) {
3713                 delete sets_with_str[j];
3714                 delete sets[j];
3715             }
3716             UErrorCode errorCode=U_ZERO_ERROR;
3717             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3718             if(U_FAILURE(errorCode)) {
3719                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3720                 break;
3721             }
3722             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3723             sets[SLOW_NOT]->complement();
3724             // Intermediate set: Test cloning of a frozen set.
3725             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3726             fast->freeze();
3727             sets[FAST]=fast->clone();
3728             delete fast;
3729             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3730             fastNot->freeze();
3731             sets[FAST_NOT]=fastNot->clone();
3732             delete fastNot;
3733 
3734             for(j=0; j<SET_COUNT; ++j) {
3735                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3736             }
3737 
3738             strcpy(testName, s);
3739             testNameLimit=strchr(testName, 0);
3740             *testNameLimit++=':';
3741             *testNameLimit=0;
3742 
3743             whichSpans[0]=SPAN_ALL;
3744             whichSpansCount=1;
3745         } else if(s[0]=='-') {
3746             whichSpans[0]=SPAN_ALL;
3747             whichSpansCount=1;
3748 
3749             while(*++s!=0) {
3750                 switch(*s) {
3751                 case 'c':
3752                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3753                                                    ~SPAN_POLARITY,
3754                                                    SPAN_SET,
3755                                                    SPAN_COMPLEMENT,
3756                                                    0);
3757                     break;
3758                 case 'b':
3759                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3760                                                    ~SPAN_DIRS,
3761                                                    SPAN_FWD,
3762                                                    SPAN_BACK,
3763                                                    0);
3764                     break;
3765                 case 'l':
3766                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3767                     // USET_SPAN_SIMPLE only FWD, and separately
3768                     // USET_SPAN_SIMPLE only BACK
3769                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3770                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3771                                                    SPAN_DIRS|SPAN_CONTAINED,
3772                                                    SPAN_FWD|SPAN_SIMPLE,
3773                                                    SPAN_BACK|SPAN_SIMPLE);
3774                     break;
3775                 case '8':
3776                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3777                                                    ~SPAN_UTFS,
3778                                                    SPAN_UTF16,
3779                                                    SPAN_UTF8,
3780                                                    0);
3781                     break;
3782                 default:
3783                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3784                     break;
3785                 }
3786             }
3787         } else if(0==strcmp(s, "*")) {
3788             strcpy(testNameLimit, "bad_string");
3789             for(j=0; j<whichSpansCount; ++j) {
3790                 if(whichSpansCount>1) {
3791                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3792                             "%%0x%3x",
3793                             whichSpans[j]);
3794                 }
3795                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3796                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3797             }
3798 
3799             strcpy(testNameLimit, "contents");
3800             for(j=0; j<whichSpansCount; ++j) {
3801                 if(whichSpansCount>1) {
3802                     sprintf(testNameLimit+8 /* strlen("contents") */,
3803                             "%%0x%3x",
3804                             whichSpans[j]);
3805                 }
3806                 testSpanContents(sets_with_str, whichSpans[j], testName);
3807             }
3808         } else {
3809             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3810             strcpy(testNameLimit, "test_string");
3811             for(j=0; j<whichSpansCount; ++j) {
3812                 if(whichSpansCount>1) {
3813                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3814                             "%%0x%3x",
3815                             whichSpans[j]);
3816                 }
3817                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3818             }
3819         }
3820     }
3821     for(j=0; j<SET_COUNT; ++j) {
3822         delete sets_with_str[j];
3823         delete sets[j];
3824     }
3825 }
3826 
3827 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3828 void UnicodeSetTest::TestStringSpan() {
3829     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3830     static const char *const string=
3831         "xx"
3832         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3833         "xx"
3834         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3835         "xx"
3836         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3837         "aaaa";
3838 
3839     UErrorCode errorCode=U_ZERO_ERROR;
3840     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3841     UnicodeSet set(pattern16, errorCode);
3842     if(U_FAILURE(errorCode)) {
3843         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3844         return;
3845     }
3846 
3847     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3848 
3849     if(set.containsAll(string16)) {
3850         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3851     }
3852 
3853     // Remove trailing "aaaa".
3854     string16.truncate(string16.length()-4);
3855     if(!set.containsAll(string16)) {
3856         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3857     }
3858 
3859     string16=u"byayaxya";
3860     const UChar *s16=string16.getBuffer();
3861     int32_t length16=string16.length();
3862     (void)length16;   // Suppress set but not used warning.
3863     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3864         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3865         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3866         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3867         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3868         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3869     ) {
3870         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3871     }
3872 
3873     pattern="[a{ab}{abc}{cd}]";
3874     pattern16=UnicodeString(pattern, -1, US_INV);
3875     set.applyPattern(pattern16, errorCode);
3876     if(U_FAILURE(errorCode)) {
3877         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3878         return;
3879     }
3880     string16=u"acdabcdabccd";
3881     s16=string16.getBuffer();
3882     length16=string16.length();
3883     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3884         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3885         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3886     ) {
3887         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3888     }
3889 
3890     pattern="[d{cd}{bcd}{ab}]";
3891     pattern16=UnicodeString(pattern, -1, US_INV);
3892     set.applyPattern(pattern16, errorCode).freeze();
3893     if(U_FAILURE(errorCode)) {
3894         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3895         return;
3896     }
3897     string16=u"abbcdabcdabd";
3898     s16=string16.getBuffer();
3899     length16=string16.length();
3900     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3901         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3902         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3903     ) {
3904         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3905     }
3906 }
3907 
/**
 * Including collationroot.h fails here with
 *   1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
 * so we skip this test on Windows.
 *
 * The cause is that intltest builds with /Za, which disables language extensions;
 * as a result, the Windows header files cannot be used.
 */
3916 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3917 #include "collationroot.h"
3918 #include "collationtailoring.h"
3919 #endif
3920 
TestUCAUnsafeBackwards()3921 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3922 #if U_PLATFORM_HAS_WIN32_API
3923     infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3924 #elif !UCONFIG_NO_COLLATION
3925     UErrorCode errorCode = U_ZERO_ERROR;
3926 
3927     // Get the unsafeBackwardsSet
3928     const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3929     if(U_FAILURE(errorCode)) {
3930       dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3931       return;
3932     }
3933     //const UVersionInfo &version = rootEntry->tailoring->version;
3934     const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3935 
3936     checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3937 
3938     if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3939         // simple test case
3940         // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3941         // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3942         UnicodeSet surrogates;
3943         surrogates.add(0xd83a);  // a lead surrogate
3944         surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
3945         UnicodeString pat;
3946         surrogates.toPattern(pat, FALSE);  // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3947         // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3948         // so that at least one type of surrogate code points are escaped,
3949         // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3950         errorCode = U_ZERO_ERROR;
3951         UnicodeSet s2;
3952         s2.applyPattern(pat, errorCode);  // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3953         if(U_FAILURE(errorCode)) {
3954             errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3955         } else {
3956             checkEqual(surrogates, s2, "surrogates to/from pattern");
3957         }
3958         // This occurs in the UCA unsafe-backwards set.
3959         checkRoundTrip(*unsafeBackwardSet);
3960     }
3961 #endif
3962 }
3963 
TestIntOverflow()3964 void UnicodeSetTest::TestIntOverflow() {
3965     // This test triggers undefined double->int conversion behavior
3966     // if the implementation is not careful.
3967     IcuTestErrorCode errorCode(*this, "TestIntOverflow");
3968     UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
3969     assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
3970     assertEquals("[:ccc=int_overflow:] -> illegal argument",
3971                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3972 }
3973 
TestUnusedCcc()3974 void UnicodeSetTest::TestUnusedCcc() {
3975 #if !UCONFIG_NO_NORMALIZATION
3976     // All numeric ccc values 0..255 are valid, but many are unused.
3977     IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
3978     UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
3979     assertSuccess("[:ccc=2:]", errorCode);
3980     assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
3981 
3982     UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
3983     assertSuccess("[:ccc=255:]", errorCode);
3984     assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
3985 
3986     // Non-integer values and values outside 0..255 are invalid.
3987     UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
3988     assertEquals("[:ccc=-1:] -> illegal argument",
3989                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3990     assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
3991 
3992     UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
3993     assertEquals("[:ccc=256:] -> illegal argument",
3994                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
3995     assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
3996 
3997     UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
3998     assertEquals("[:ccc=1.1:] -> illegal argument",
3999                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4000     assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
4001 #endif
4002 }
4003 
TestDeepPattern()4004 void UnicodeSetTest::TestDeepPattern() {
4005     IcuTestErrorCode errorCode(*this, "TestDeepPattern");
4006     // Nested ranges are parsed via recursion which can use a lot of stack space.
4007     // After a reasonable limit, we should get an error.
4008     constexpr int32_t DEPTH = 20000;
4009     UnicodeString pattern, suffix;
4010     for (int32_t i = 0; i < DEPTH; ++i) {
4011         pattern.append(u"[a", 2);
4012         suffix.append(']');
4013     }
4014     pattern.append(suffix);
4015     UnicodeSet set(pattern, errorCode);
4016     assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
4017     errorCode.reset();
4018 }
4019 
TestEmptyString()4020 void UnicodeSetTest::TestEmptyString() {
4021     IcuTestErrorCode errorCode(*this, "TestEmptyString");
4022     // Starting with ICU 69, the empty string is allowed in UnicodeSet. ICU-13702
4023     UnicodeSet set(u"[{}]", errorCode);
4024     if (!assertSuccess("set from pattern with {}", errorCode)) { return; }
4025     assertTrue("set from pattern with {}", set.contains(u""));
4026     assertEquals("set from pattern with {}: size", 1, set.size());
4027     assertFalse("set from pattern with {}: isEmpty", set.isEmpty());
4028 
4029     // Remove, add back, ...
4030     assertFalse("remove empty string", set.remove(u"").contains(u""));
4031     assertEquals("remove empty string: size", 0, set.size());
4032     assertTrue("remove empty string: isEmpty", set.isEmpty());
4033     assertTrue("add empty string", set.add(u"").contains(u""));
4034     // missing API -- assertTrue("retain empty string", set.retain(u"").contains(u""));
4035     assertFalse("complement-remove empty string", set.complement(u"").contains(u""));
4036     assertTrue("complement-add empty string", set.complement(u"").contains(u""));
4037 
4038     assertFalse("clear", set.clear().contains(u""));
4039     assertTrue("add empty string 2", set.add(u"").contains(u""));
4040     assertFalse("removeAllStrings", set.removeAllStrings().contains(u""));
4041     assertTrue("add empty string 3", set.add(u"").contains(u""));
4042     // Note that this leaves the set containing exactly the empty string.
4043 
4044     // strings() access and iteration
4045     // no C++ equivalent for Java strings() -- assertTrue("strings()", set.strings().contains(u""));
4046     UnicodeSetIterator sit(set);
4047     assertTrue("set iterator.next()", sit.next());
4048     assertTrue("set iterator has empty string", sit.isString() && sit.getString().isEmpty());
4049 
4050     // The empty string is ignored in matching.
4051     set.add(u'a').add(u'c');
4052     assertEquals("span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4053     assertEquals("spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4054     assertTrue("containsNone", set.containsNone(u"def"));
4055     assertFalse("containsSome", set.containsSome(u"def"));
4056     set.freeze();
4057     assertEquals("frozen span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4058     assertEquals("frozen spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4059     assertTrue("frozen containsNone", set.containsNone(u"def"));
4060     assertFalse("frozen containsSome", set.containsSome(u"def"));
4061 }
4062