• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
********************************************************************************
*   Copyright (C) 1999-2016 International Business Machines Corporation and
*   others. All Rights Reserved.
********************************************************************************
*   Date        Name        Description
*   10/20/99    alan        Creation.
*   03/22/2000  Madhu       Added additional tests
********************************************************************************
*/
13 
14 #include <stdio.h>
15 
16 #include <string.h>
17 #include <unordered_map>
18 #include "unicode/utypes.h"
19 #include "usettest.h"
20 #include "unicode/ucnv.h"
21 #include "unicode/uniset.h"
22 #include "unicode/uchar.h"
23 #include "unicode/usetiter.h"
24 #include "unicode/ustring.h"
25 #include "unicode/parsepos.h"
26 #include "unicode/symtable.h"
27 #include "unicode/utf8.h"
28 #include "unicode/utf16.h"
29 #include "unicode/uversion.h"
30 #include "cmemory.h"
31 #include "hash.h"
32 
/**
 * Reports a data error, tagged with the current file and line, when `status`
 * holds a failure code.
 *
 * The argument is captured in a local first so that it is evaluated exactly
 * once, even if the caller passes an expression with side effects.
 * Must be used inside a test method, because it calls dataerrln().
 */
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    const UErrorCode testAssertStatus_ = (status); \
    if (U_FAILURE(testAssertStatus_)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
                  u_errorName(testAssertStatus_)); \
    } \
} UPRV_BLOCK_MACRO_END
39 
/**
 * Reports a data error, tagged with the current file and line, when `expr`
 * evaluates to false. Execution continues after the report.
 * Must be used inside a test method, because it calls dataerrln().
 */
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (expr) { \
        /* assertion holds: nothing to report */ \
    } else { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
45 
/**
 * Appends the escaped pattern of `set` to `left`, so that a UnicodeSet can be
 * concatenated directly into a log or error message.
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and returns a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
51 
UnicodeSetTest()52 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(nullptr) {
53 }
54 
openUTF8Converter()55 UConverter *UnicodeSetTest::openUTF8Converter() {
56     if(utf8Cnv==nullptr) {
57         UErrorCode errorCode=U_ZERO_ERROR;
58         utf8Cnv=ucnv_open("UTF-8", &errorCode);
59     }
60     return utf8Cnv;
61 }
62 
~UnicodeSetTest()63 UnicodeSetTest::~UnicodeSetTest() {
64     ucnv_close(utf8Cnv);
65 }
66 
67 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)68 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
69                                const char* &name, char* /*par*/) {
70     if (exec) {
71         logln(u"TestSuite UnicodeSetTest");
72     }
73     TESTCASE_AUTO_BEGIN;
74     TESTCASE_AUTO(TestPatterns);
75     TESTCASE_AUTO(TestAddRemove);
76     TESTCASE_AUTO(TestCategories);
77     TESTCASE_AUTO(TestCloneEqualHash);
78     TESTCASE_AUTO(TestMinimalRep);
79     TESTCASE_AUTO(TestAPI);
80     TESTCASE_AUTO(TestScriptSet);
81     TESTCASE_AUTO(TestPropertySet);
82     TESTCASE_AUTO(TestClone);
83     TESTCASE_AUTO(TestExhaustive);
84     TESTCASE_AUTO(TestToPattern);
85     TESTCASE_AUTO(TestIndexOf);
86     TESTCASE_AUTO(TestStrings);
87     TESTCASE_AUTO(Testj2268);
88     TESTCASE_AUTO(TestCloseOver);
89     TESTCASE_AUTO(TestCloseOverSimpleCaseFolding);
90     TESTCASE_AUTO(TestCloseOverLargeSets);
91     TESTCASE_AUTO(TestEscapePattern);
92     TESTCASE_AUTO(TestInvalidCodePoint);
93     TESTCASE_AUTO(TestSymbolTable);
94     TESTCASE_AUTO(TestSurrogate);
95     TESTCASE_AUTO(TestPosixClasses);
96     TESTCASE_AUTO(TestIteration);
97     TESTCASE_AUTO(TestFreezable);
98     TESTCASE_AUTO(TestSpan);
99     TESTCASE_AUTO(TestStringSpan);
100     TESTCASE_AUTO(TestPatternWithSurrogates);
101     TESTCASE_AUTO(TestIntOverflow);
102     TESTCASE_AUTO(TestUnusedCcc);
103     TESTCASE_AUTO(TestDeepPattern);
104     TESTCASE_AUTO(TestEmptyString);
105     TESTCASE_AUTO(TestSkipToStrings);
106     TESTCASE_AUTO(TestPatternCodePointComplement);
107     TESTCASE_AUTO_END;
108 }
109 
// Marker entry used inside the expected-string arrays handed to
// expectToPattern() (see TestToPattern).
// NOTE(review): presumably the strings listed after this marker are expected
// NOT to be in the set -- confirm against expectToPattern().
static constexpr char NOT[] = "%%%%";
111 
112 /**
113  * UVector was improperly copying contents
114  * This code will crash this is still true
115  */
Testj2268()116 void UnicodeSetTest::Testj2268() {
117   UnicodeSet t;
118   t.add(UnicodeString("abc"));
119   UnicodeSet test(t);
120   UnicodeString ustrPat;
121   test.toPattern(ustrPat, true);
122 }
123 
124 /**
125  * Test toPattern().
126  */
TestToPattern()127 void UnicodeSetTest::TestToPattern() {
128     UErrorCode ec = U_ZERO_ERROR;
129 
130     // Test that toPattern() round trips with syntax characters and
131     // whitespace.
132     {
133         static const char* OTHER_TOPATTERN_TESTS[] = {
134             "[[:latin:]&[:greek:]]",
135             "[[:latin:]-[:greek:]]",
136             "[:nonspacing mark:]",
137             nullptr
138         };
139 
140         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=nullptr; ++j) {
141             ec = U_ZERO_ERROR;
142             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
143             if (U_FAILURE(ec)) {
144                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
145                 continue;
146             }
147             checkPat(OTHER_TOPATTERN_TESTS[j], s);
148         }
149 
150         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
151             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
152 
153                 // check various combinations to make sure they all work.
154                 if (i != 0 && !toPatternAux(i, i)){
155                     continue;
156                 }
157                 if (!toPatternAux(0, i)){
158                     continue;
159                 }
160                 if (!toPatternAux(i, 0xFFFF)){
161                     continue;
162                 }
163             }
164         }
165     }
166 
167     // Test pattern behavior of multicharacter strings.
168     {
169         ec = U_ZERO_ERROR;
170         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
171 
172         // This loop isn't a loop.  It's here to make the compiler happy.
173         // If you're curious, try removing it and changing the 'break'
174         // statements (except for the last) to goto's.
175         for (;;) {
176             if (U_FAILURE(ec)) break;
177             const char* exp1[] = {"aa", "ab", NOT, "ac", nullptr};
178             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
179 
180             s->add("ac");
181             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", nullptr};
182             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
183 
184             s->applyPattern(u"[a-z {\\{l} {r\\}}]", ec);
185             if (U_FAILURE(ec)) break;
186             const char* exp3[] = {"{l", "r}", NOT, "xy", nullptr};
187             expectToPattern(*s, u"[a-z{r\\}}{\\{l}]", exp3);
188 
189             s->add("[]");
190             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", nullptr};
191             expectToPattern(*s, u"[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
192 
193             s->applyPattern(u"[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
194             if (U_FAILURE(ec)) break;
195             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", nullptr};
196             expectToPattern(*s, u"[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
197 
198             // j2189
199             s->clear();
200             s->add(UnicodeString("abc", ""));
201             s->add(UnicodeString("abc", ""));
202             const char* exp6[] = {"abc", NOT, "ab", nullptr};
203             expectToPattern(*s, "[{abc}]", exp6);
204 
205             break;
206         }
207 
208         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
209         delete s;
210     }
211 
212     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
213     UnicodeSet s;
214     s.add(u'a', u'b');
215     expectToPattern(s, "[ab]", nullptr);
216 }
217 
toPatternAux(UChar32 start,UChar32 end)218 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
219 
220     // use Integer.toString because Utility.hex doesn't handle ints
221     UnicodeString pat = "";
222     // TODO do these in hex
223     //String source = "0x" + Integer.toString(start,16).toUpperCase();
224     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
225     UnicodeString source;
226     source = source + (uint32_t)start;
227     if (start != end)
228         source = source + ".." + (uint32_t)end;
229     UnicodeSet testSet;
230     testSet.add(start, end);
231     return checkPat(source, testSet);
232 }
233 
checkPat(const UnicodeString & source,const UnicodeSet & testSet)234 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
235                                const UnicodeSet& testSet) {
236     // What we want to make sure of is that a pattern generated
237     // by toPattern(), with or without escaped unprintables, can
238     // be passed back into the UnicodeSet constructor.
239     UnicodeString pat0;
240 
241     testSet.toPattern(pat0, true);
242 
243     if (!checkPat(source + " (escaped)", testSet, pat0)) return false;
244 
245     //String pat1 = unescapeLeniently(pat0);
246     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
247 
248     UnicodeString pat2;
249     testSet.toPattern(pat2, false);
250     if (!checkPat(source, testSet, pat2)) return false;
251 
252     //String pat3 = unescapeLeniently(pat2);
253     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
254 
255     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
256     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
257     return true;
258 }
259 
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)260 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
261                                const UnicodeSet& testSet,
262                                const UnicodeString& pat) {
263     UErrorCode ec = U_ZERO_ERROR;
264     UnicodeSet testSet2(pat, ec);
265     if (testSet2 != testSet) {
266         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
267         return false;
268     }
269     return true;
270 }
271 
272 void
TestPatterns()273 UnicodeSetTest::TestPatterns() {
274     UnicodeSet set;
275     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
276     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
277     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
278     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
279     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
280     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
281 
282     // Throw in a test of complement
283     set.complement();
284     UnicodeString exp;
285     exp.append((char16_t)0x0000).append("aeeoouu").append((char16_t)(u'z'+1)).append((char16_t)0xFFFF);
286     expectPairs(set, exp);
287 }
288 
289 void
TestCategories()290 UnicodeSetTest::TestCategories() {
291     UErrorCode status = U_ZERO_ERROR;
292     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
293     UnicodeSet set(pat, status);
294     if (U_FAILURE(status)) {
295         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
296         return;
297     } else {
298         expectContainment(set, pat, "ABC", "abc");
299     }
300 
301     UChar32 i;
302     int32_t failures = 0;
303     // Make sure generation of L doesn't pollute cached Lu set
304     // First generate L, then Lu
305     set.applyPattern("[:L:]", status);
306     if (U_FAILURE(status)) { errln("FAIL"); return; }
307     for (i=0; i<0x200; ++i) {
308         UBool l = u_isalpha((char16_t)i);
309         if (l != set.contains(i)) {
310             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
311                   set.contains(i));
312             if (++failures == 10) break;
313         }
314     }
315 
316     set.applyPattern("[:Lu:]", status);
317     if (U_FAILURE(status)) { errln("FAIL"); return; }
318     for (i=0; i<0x200; ++i) {
319         UBool lu = (u_charType((char16_t)i) == U_UPPERCASE_LETTER);
320         if (lu != set.contains(i)) {
321             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
322                   set.contains(i));
323             if (++failures == 20) break;
324         }
325     }
326 }
327 void
TestCloneEqualHash()328 UnicodeSetTest::TestCloneEqualHash() {
329     UErrorCode status = U_ZERO_ERROR;
330     // set1 and set2 used to be built with the obsolete constructor taking
331     // UCharCategory values; replaced with pattern constructors
332     // markus 20030502
333     UnicodeSet *set1=new UnicodeSet(u"\\p{Lowercase Letter}", status); //  :Ll: Letter, lowercase
334     UnicodeSet *set1a=new UnicodeSet(u"[:Ll:]", status); //  Letter, lowercase
335     if (U_FAILURE(status)){
336         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
337         return;
338     }
339     UnicodeSet *set2=new UnicodeSet(u"\\p{Decimal Number}", status);   //Number, Decimal digit
340     UnicodeSet *set2a=new UnicodeSet(u"[:Nd:]", status);   //Number, Decimal digit
341     if (U_FAILURE(status)){
342         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
343         return;
344     }
345 
346     if (*set1 != *set1a) {
347         errln("FAIL: category constructor for Ll broken");
348     }
349     if (*set2 != *set2a) {
350         errln("FAIL: category constructor for Nd broken");
351     }
352     delete set1a;
353     delete set2a;
354 
355     logln("Testing copy construction");
356     UnicodeSet *set1copy=new UnicodeSet(*set1);
357     if(*set1 != *set1copy || *set1 == *set2 ||
358         getPairs(*set1) != getPairs(*set1copy) ||
359         set1->hashCode() != set1copy->hashCode()){
360         errln("FAIL : Error in copy construction");
361         return;
362     }
363 
364     logln("Testing =operator");
365     UnicodeSet set1equal=*set1;
366     UnicodeSet set2equal=*set2;
367     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
368         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
369         errln("FAIL: Error in =operator");
370     }
371 
372     logln("Testing clone()");
373     UnicodeSet *set1clone=set1->clone();
374     UnicodeSet *set2clone=set2->clone();
375     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
376         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
377         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
378         errln("FAIL: Error in clone");
379     }
380 
381     logln("Testing hashcode");
382     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
383         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
384         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
385         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
386         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
387         errln("FAIL: Error in hashCode()");
388     }
389 
390     delete set1;
391     delete set1copy;
392     delete set2;
393     delete set1clone;
394     delete set2clone;
395 
396 
397 }
398 void
TestAddRemove()399 UnicodeSetTest::TestAddRemove() {
400     UnicodeSet set; // Construct empty set
401     doAssert(set.isEmpty() == true, "set should be empty");
402     doAssert(set.size() == 0, "size should be 0");
403     set.complement();
404     doAssert(set.size() == 0x110000, "size should be 0x110000");
405     set.clear();
406     set.add(0x0061, 0x007a);
407     expectPairs(set, "az");
408     doAssert(set.isEmpty() == false, "set should not be empty");
409     doAssert(set.size() != 0, "size should not be equal to 0");
410     doAssert(set.size() == 26, "size should be equal to 26");
411     set.remove(0x006d, 0x0070);
412     expectPairs(set, "alqz");
413     doAssert(set.size() == 22, "size should be equal to 22");
414     set.remove(0x0065, 0x0067);
415     expectPairs(set, "adhlqz");
416     doAssert(set.size() == 19, "size should be equal to 19");
417     set.remove(0x0064, 0x0069);
418     expectPairs(set, "acjlqz");
419     doAssert(set.size() == 16, "size should be equal to 16");
420     set.remove(0x0063, 0x0072);
421     expectPairs(set, "absz");
422     doAssert(set.size() == 10, "size should be equal to 10");
423     set.add(0x0066, 0x0071);
424     expectPairs(set, "abfqsz");
425     doAssert(set.size() == 22, "size should be equal to 22");
426     set.remove(0x0061, 0x0067);
427     expectPairs(set, "hqsz");
428     set.remove(0x0061, 0x007a);
429     expectPairs(set, "");
430     doAssert(set.isEmpty() == true, "set should be empty");
431     doAssert(set.size() == 0, "size should be 0");
432     set.add(0x0061);
433     doAssert(set.isEmpty() == false, "set should not be empty");
434     doAssert(set.size() == 1, "size should not be equal to 1");
435     set.add(0x0062);
436     set.add(0x0063);
437     expectPairs(set, "ac");
438     doAssert(set.size() == 3, "size should not be equal to 3");
439     set.add(0x0070);
440     set.add(0x0071);
441     expectPairs(set, "acpq");
442     doAssert(set.size() == 5, "size should not be equal to 5");
443     set.clear();
444     expectPairs(set, "");
445     doAssert(set.isEmpty() == true, "set should be empty");
446     doAssert(set.size() == 0, "size should be 0");
447 
448     // Try removing an entire set from another set
449     expectPattern(set, "[c-x]", "cx");
450     UnicodeSet set2;
451     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
452     set.removeAll(set2);
453     expectPairs(set, "deluxx");
454 
455     // Try adding an entire set to another set
456     expectPattern(set, "[jackiemclean]", "aacceein");
457     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
458     set.addAll(set2);
459     expectPairs(set, "aacehort");
460     doAssert(set.containsAll(set2) == true, "set should contain all the elements in set2");
461 
462     // Try retaining an set of elements contained in another set (intersection)
463     UnicodeSet set3;
464     expectPattern(set3, "[a-c]", "ac");
465     doAssert(set.containsAll(set3) == false, "set doesn't contain all the elements in set3");
466     set3.remove(0x0062);
467     expectPairs(set3, "aacc");
468     doAssert(set.containsAll(set3) == true, "set should contain all the elements in set3");
469     set.retainAll(set3);
470     expectPairs(set, "aacc");
471     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
472     doAssert(set.containsAll(set3) == true, "set should contain all the elements in set3");
473     set.clear();
474     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
475 
476     // Test commutativity
477     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
478     expectPattern(set2, "[jackiemclean]", "aacceein");
479     set.addAll(set2);
480     expectPairs(set, "aacehort");
481     doAssert(set.containsAll(set2) == true, "set should contain all the elements in set2");
482 
483 
484 
485 
486 }
487 
488 /**
489  * Make sure minimal representation is maintained.
490  */
TestMinimalRep()491 void UnicodeSetTest::TestMinimalRep() {
492     UErrorCode status = U_ZERO_ERROR;
493     // This is pretty thoroughly tested by checkCanonicalRep()
494     // run against the exhaustive operation results.  Use the code
495     // here for debugging specific spot problems.
496 
497     // 1 overlap against 2
498     UnicodeSet set("[h-km-q]", status);
499     if (U_FAILURE(status)) { errln("FAIL"); return; }
500     UnicodeSet set2("[i-o]", status);
501     if (U_FAILURE(status)) { errln("FAIL"); return; }
502     set.addAll(set2);
503     expectPairs(set, "hq");
504     // right
505     set.applyPattern("[a-m]", status);
506     if (U_FAILURE(status)) { errln("FAIL"); return; }
507     set2.applyPattern("[e-o]", status);
508     if (U_FAILURE(status)) { errln("FAIL"); return; }
509     set.addAll(set2);
510     expectPairs(set, "ao");
511     // left
512     set.applyPattern("[e-o]", status);
513     if (U_FAILURE(status)) { errln("FAIL"); return; }
514     set2.applyPattern("[a-m]", status);
515     if (U_FAILURE(status)) { errln("FAIL"); return; }
516     set.addAll(set2);
517     expectPairs(set, "ao");
518     // 1 overlap against 3
519     set.applyPattern("[a-eg-mo-w]", status);
520     if (U_FAILURE(status)) { errln("FAIL"); return; }
521     set2.applyPattern("[d-q]", status);
522     if (U_FAILURE(status)) { errln("FAIL"); return; }
523     set.addAll(set2);
524     expectPairs(set, "aw");
525 }
526 
TestAPI()527 void UnicodeSetTest::TestAPI() {
528     UErrorCode status = U_ZERO_ERROR;
529     // default ct
530     UnicodeSet set;
531     if (!set.isEmpty() || set.getRangeCount() != 0) {
532         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
533               set);
534     }
535 
536     // clear(), isEmpty()
537     set.add(0x0061);
538     if (set.isEmpty()) {
539         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
540               set);
541     }
542     set.clear();
543     if (!set.isEmpty()) {
544         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
545               set);
546     }
547 
548     // size()
549     set.clear();
550     if (set.size() != 0) {
551         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
552               ": " + set);
553     }
554     set.add(0x0061);
555     if (set.size() != 1) {
556         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
557               ": " + set);
558     }
559     set.add(0x0031, 0x0039);
560     if (set.size() != 10) {
561         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
562               ": " + set);
563     }
564 
565     // contains(first, last)
566     set.clear();
567     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
568     if (U_FAILURE(status)) { errln("FAIL"); return; }
569     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
570         UChar32 a = set.getRangeStart(i);
571         UChar32 b = set.getRangeEnd(i);
572         if (!set.contains(a, b)) {
573             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
574                   " but doesn't: " + set);
575         }
576         if (set.contains((UChar32)(a-1), b)) {
577             errln((UnicodeString)"FAIL, shouldn't contain " +
578                   (unsigned short)(a-1) + '-' + (unsigned short)b +
579                   " but does: " + set);
580         }
581         if (set.contains(a, (UChar32)(b+1))) {
582             errln((UnicodeString)"FAIL, shouldn't contain " +
583                   (unsigned short)a + '-' + (unsigned short)(b+1) +
584                   " but does: " + set);
585         }
586     }
587 
588     // Ported InversionList test.
589     UnicodeSet a((UChar32)3,(UChar32)10);
590     UnicodeSet b((UChar32)7,(UChar32)15);
591     UnicodeSet c;
592 
593     logln((UnicodeString)"a [3-10]: " + a);
594     logln((UnicodeString)"b [7-15]: " + b);
595     c = a;
596     c.addAll(b);
597     UnicodeSet exp((UChar32)3,(UChar32)15);
598     if (c == exp) {
599         logln((UnicodeString)"c.set(a).add(b): " + c);
600     } else {
601         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
602     }
603     c.complement();
604     exp.set((UChar32)0, (UChar32)2);
605     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
606     if (c == exp) {
607         logln((UnicodeString)"c.complement(): " + c);
608     } else {
609         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
610     }
611     c.complement();
612     exp.set((UChar32)3, (UChar32)15);
613     if (c == exp) {
614         logln((UnicodeString)"c.complement(): " + c);
615     } else {
616         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
617     }
618     c = a;
619     c.complementAll(b);
620     exp.set((UChar32)3,(UChar32)6);
621     exp.add((UChar32)11,(UChar32) 15);
622     if (c == exp) {
623         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
624     } else {
625         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
626     }
627 
628     exp = c;
629     bitsToSet(setToBits(c), c);
630     if (c == exp) {
631         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
632     } else {
633         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
634     }
635 
636     // Additional tests for coverage JB#2118
637     //UnicodeSet::complement(class UnicodeString const &)
638     //UnicodeSet::complementAll(class UnicodeString const &)
639     //UnicodeSet::containsNone(class UnicodeSet const &)
640     //UnicodeSet::containsNone(long,long)
641     //UnicodeSet::containsSome(class UnicodeSet const &)
642     //UnicodeSet::containsSome(long,long)
643     //UnicodeSet::removeAll(class UnicodeString const &)
644     //UnicodeSet::retain(long)
645     //UnicodeSet::retainAll(class UnicodeString const &)
646     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
647     //UnicodeSetIterator::getString()
648     set.clear();
649     set.complement("ab");
650     exp.applyPattern("[{ab}]", status);
651     if (U_FAILURE(status)) { errln("FAIL"); return; }
652     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
653 
654     UnicodeSetIterator iset(set);
655     if (!iset.next() || !iset.isString()) {
656         errln("FAIL: UnicodeSetIterator::next/isString");
657     } else if (iset.getString() != "ab") {
658         errln("FAIL: UnicodeSetIterator::getString");
659     }
660 
661     set.add(u'a', u'z');
662     set.complementAll("alan");
663     exp.applyPattern("[{ab}b-kmo-z]", status);
664     if (U_FAILURE(status)) { errln("FAIL"); return; }
665     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
666 
667     exp.applyPattern("[a-z]", status);
668     if (U_FAILURE(status)) { errln("FAIL"); return; }
669     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
670     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
671     exp.applyPattern("[aln]", status);
672     if (U_FAILURE(status)) { errln("FAIL"); return; }
673     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
674     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
675 
676     if (set.containsNone(u'a', u'z')) {
677         errln("FAIL: containsNone(UChar32, UChar32)");
678     }
679     if (!set.containsSome(u'a', u'z')) {
680         errln("FAIL: containsSome(UChar32, UChar32)");
681     }
682     if (!set.containsNone(u'A', u'Z')) {
683         errln("FAIL: containsNone(UChar32, UChar32)");
684     }
685     if (set.containsSome(u'A', u'Z')) {
686         errln("FAIL: containsSome(UChar32, UChar32)");
687     }
688 
689     set.removeAll("liu");
690     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
691     if (U_FAILURE(status)) { errln("FAIL"); return; }
692     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
693 
694     set.retainAll("star");
695     exp.applyPattern("[rst]", status);
696     if (U_FAILURE(status)) { errln("FAIL"); return; }
697     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
698 
699     set.retain(u's');
700     exp.applyPattern("[s]", status);
701     if (U_FAILURE(status)) { errln("FAIL"); return; }
702     if (set != exp) { errln("FAIL: retain('s')"); return; }
703 
704     // ICU 2.6 coverage tests
705     // public final UnicodeSet retain(String s);
706     // public final UnicodeSet remove(int c);
707     // public final UnicodeSet remove(String s);
708     // public int hashCode();
709     set.applyPattern(u"[a-z{ab}{cd}]", status);
710     if (U_FAILURE(status)) { errln("FAIL"); return; }
711     set.retain(u"cd");
712     exp.applyPattern(u"[{cd}]", status);
713     if (U_FAILURE(status)) { errln("FAIL"); return; }
714     if (set != exp) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
715 
716     set.applyPattern(u"[a-z{ab}{yz}]", status);
717     if (U_FAILURE(status)) { errln("FAIL"); return; }
718     set.retain(u"cd");
719     exp.clear();
720     if (set != exp) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
721 
722     set.applyPattern(u"[a-z{ab}{cd}]", status);
723     if (U_FAILURE(status)) { errln("FAIL"); return; }
724     set.remove(u'c');
725     exp.applyPattern(u"[abd-z{ab}{cd}]", status);
726     if (set != exp) { errln("FAIL: remove('c')"); return; }
727 
728     set.remove(u"cd");
729     exp.applyPattern(u"[abd-z{ab}]", status);
730     if (U_FAILURE(status)) { errln("FAIL"); return; }
731     if (set != exp) { errln("FAIL: remove(\"cd\")"); return; }
732 
733     set.applyPattern("[s]", status);
734     if (U_FAILURE(status)) { errln("FAIL"); return; }
735     uint16_t buf[32];
736     int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
737     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
738     if (slen != 3 || buf[0] != 2 || buf[1] != u's' || buf[2] != u't') {
739         errln("FAIL: serialize");
740         return;
741     }
742 
743     // Conversions to and from USet
744     UnicodeSet *uniset = &set;
745     USet *uset = uniset->toUSet();
746     TEST_ASSERT((void *)uset == (void *)uniset);
747     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
748     TEST_ASSERT((void *)setx == (void *)uset);
749     const UnicodeSet *constSet = uniset;
750     const USet *constUSet = constSet->toUSet();
751     TEST_ASSERT((void *)constUSet == (void *)constSet);
752     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
753     TEST_ASSERT((void *)constSetx == (void *)constUSet);
754 
755     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
756     UnicodeString longString=u"aaaaaaaaaabbbbbbbbbbcccccccccc";
757     UnicodeSet ac(0x61, 0x63);
758     ac.remove(0x62).freeze();
759     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
760         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
761         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
762         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
763         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
764         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
765         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
766         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
767         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
768         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
769     ) {
770         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
771     }
772     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
773         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
774         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
775         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
776         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
777         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
778         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
779         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
780         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
781         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
782     ) {
783         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
784     }
785 }
786 
TestIteration()787 void UnicodeSetTest::TestIteration() {
788     UErrorCode ec = U_ZERO_ERROR;
789     int i = 0;
790     int outerLoop;
791 
792     // 6 code points, 3 ranges, 2 strings, 8 total elements
793     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
794     UnicodeSet set(u"[zabyc\\U0001abcd{str1}{str2}]", ec);
795     TEST_ASSERT_SUCCESS(ec);
796     UnicodeSetIterator it(set);
797 
798     for (outerLoop=0; outerLoop<3; outerLoop++) {
799         // Run the test multiple times, to check that iterator.reset() is working.
800         for (i=0; i<10; i++) {
801             UBool         nextv        = it.next();
802             UBool         isString     = it.isString();
803             int32_t       codePoint    = it.getCodepoint();
804             //int32_t       codePointEnd = it.getCodepointEnd();
805             UnicodeString s   = it.getString();
806             switch (i) {
807             case 0:
808                 TEST_ASSERT(nextv == true);
809                 TEST_ASSERT(isString == false);
810                 TEST_ASSERT(codePoint==0x61);
811                 TEST_ASSERT(s == "a");
812                 break;
813             case 1:
814                 TEST_ASSERT(nextv == true);
815                 TEST_ASSERT(isString == false);
816                 TEST_ASSERT(codePoint==0x62);
817                 TEST_ASSERT(s == "b");
818                 break;
819             case 2:
820                 TEST_ASSERT(nextv == true);
821                 TEST_ASSERT(isString == false);
822                 TEST_ASSERT(codePoint==0x63);
823                 TEST_ASSERT(s == "c");
824                 break;
825             case 3:
826                 TEST_ASSERT(nextv == true);
827                 TEST_ASSERT(isString == false);
828                 TEST_ASSERT(codePoint==0x79);
829                 TEST_ASSERT(s == "y");
830                 break;
831             case 4:
832                 TEST_ASSERT(nextv == true);
833                 TEST_ASSERT(isString == false);
834                 TEST_ASSERT(codePoint==0x7a);
835                 TEST_ASSERT(s == "z");
836                 break;
837             case 5:
838                 TEST_ASSERT(nextv == true);
839                 TEST_ASSERT(isString == false);
840                 TEST_ASSERT(codePoint==0x1abcd);
841                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
842                 break;
843             case 6:
844                 TEST_ASSERT(nextv == true);
845                 TEST_ASSERT(isString == true);
846                 TEST_ASSERT(s == "str1");
847                 break;
848             case 7:
849                 TEST_ASSERT(nextv == true);
850                 TEST_ASSERT(isString == true);
851                 TEST_ASSERT(s == "str2");
852                 break;
853             case 8:
854                 TEST_ASSERT(nextv == false);
855                 break;
856             case 9:
857                 TEST_ASSERT(nextv == false);
858                 break;
859             }
860         }
861         it.reset();  // prepare to run the iteration again.
862     }
863 }
864 
865 
866 
867 
TestStrings()868 void UnicodeSetTest::TestStrings() {
869     UErrorCode ec = U_ZERO_ERROR;
870 
871     UnicodeSet* testList[] = {
872         UnicodeSet::createFromAll("abc"),
873         new UnicodeSet("[a-c]", ec),
874 
875         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
876         new UnicodeSet("[{ll}{ch}a-z]", ec),
877 
878         UnicodeSet::createFrom("ab}c"),
879         new UnicodeSet("[{ab\\}c}]", ec),
880 
881         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
882         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
883 
884         nullptr
885     };
886 
887     if (U_FAILURE(ec)) {
888         errln("FAIL: couldn't construct test sets");
889     }
890     assertFalse("[a-c].hasStrings()", testList[0]->hasStrings());
891     assertTrue("[{ll}{ch}a-z].hasStrings()", testList[2]->hasStrings());
892 
893     for (int32_t i = 0; testList[i] != nullptr; i+=2) {
894         if (U_SUCCESS(ec)) {
895             UnicodeString pat0, pat1;
896             testList[i]->toPattern(pat0, true);
897             testList[i+1]->toPattern(pat1, true);
898             if (*testList[i] == *testList[i+1]) {
899                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
900             } else {
901                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
902             }
903         }
904         delete testList[i];
905         delete testList[i+1];
906     }
907 }
908 
909 /**
910  * Test the [:Latin:] syntax.
911  */
TestScriptSet()912 void UnicodeSetTest::TestScriptSet() {
913     expectContainment(u"[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
914 
915     expectContainment(u"[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
916 
917     /* Jitterbug 1423 */
918     expectContainment(u"[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
919 
920 }
921 
922 /**
923  * Test the [:Latin:] syntax.
924  */
TestPropertySet()925 void UnicodeSetTest::TestPropertySet() {
926     static const char* const DATA[] = {
927         // Pattern, Chars IN, Chars NOT in
928 
929         "[:Latin:]",
930         "aA",
931         "\\u0391\\u03B1",
932 
933         "[\\p{Greek}]",
934         "\\u0391\\u03B1",
935         "aA",
936 
937         "\\P{ GENERAL Category = upper case letter }",
938         "abc",
939         "ABC",
940 
941 #if !UCONFIG_NO_NORMALIZATION
942         // Combining class: @since ICU 2.2
943         // Check both symbolic and numeric
944         "\\p{ccc=Nukta}",
945         "\\u0ABC",
946         "abc",
947 
948         "\\p{Canonical Combining Class = 11}",
949         "\\u05B1",
950         "\\u05B2",
951 
952         "[:c c c = iota subscript :]",
953         "\\u0345",
954         "xyz",
955 #endif
956 
957         // Bidi class: @since ICU 2.2
958         "\\p{bidiclass=lefttoright}",
959         "abc",
960         "\\u0671\\u0672",
961 
962         // Binary properties: @since ICU 2.2
963         "\\p{ideographic}",
964         "\\u4E0A",
965         "x",
966 
967         "[:math=false:]",
968         "q)*(",
969         // weiv: )(and * were removed from math in Unicode 4.0.1
970         //"(*+)",
971         "+<>^",
972 
973         // JB#1767 \N{}, \p{ASCII}
974         "[:Ascii:]",
975         "abc\\u0000\\u007F",
976         "\\u0080\\u4E00",
977 
978         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
979         "az",
980         "qrs",
981 
982         // JB#2015
983         "[:any:]",
984         "a\\U0010FFFF",
985         "",
986 
987         "[:nv=0.5:]",
988         "\\u00BD\\u0F2A",
989         "\\u00BC",
990 
991         // JB#2653: Age
992         "[:Age=1.1:]",
993         "\\u03D6", // 1.1
994         "\\u03D8\\u03D9", // 3.2
995 
996         "[:Age=3.1:]",
997         "\\u1800\\u3400\\U0002f800",
998         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
999 
1000         // JB#2350: Case_Sensitive
1001         "[:Case Sensitive:]",
1002         "A\\u1FFC\\U00010410",
1003         ";\\u00B4\\U00010500",
1004 
1005         // JB#2832: C99-compatibility props
1006         "[:blank:]",
1007         " \\u0009",
1008         "1-9A-Z",
1009 
1010         "[:graph:]",
1011         "19AZ",
1012         " \\u0003\\u0007\\u0009\\u000A\\u000D",
1013 
1014         "[:punct:]",
1015         "!@#%&*()[]{}-_\\/;:,.?'\"",
1016         "09azAZ",
1017 
1018         "[:xdigit:]",
1019         "09afAF",
1020         "gG!",
1021 
1022         // Regex compatibility test
1023         "[-b]", // leading '-' is literal
1024         "-b",
1025         "ac",
1026 
1027         "[^-b]", // leading '-' is literal
1028         "ac",
1029         "-b",
1030 
1031         "[b-]", // trailing '-' is literal
1032         "-b",
1033         "ac",
1034 
1035         "[^b-]", // trailing '-' is literal
1036         "ac",
1037         "-b",
1038 
1039         "[a-b-]", // trailing '-' is literal
1040         "ab-",
1041         "c=",
1042 
1043         "[[a-q]&[p-z]-]", // trailing '-' is literal
1044         "pq-",
1045         "or=",
1046 
1047         "[\\s|\\)|:|$|\\>]", // from regex tests
1048         "s|):$>",
1049         "abc",
1050 
1051         "[\\uDC00cd]", // JB#2906: isolated trail at start
1052         "cd\\uDC00",
1053         "ab\\uD800\\U00010000",
1054 
1055         "[ab\\uD800]", // JB#2906: isolated trail at start
1056         "ab\\uD800",
1057         "cd\\uDC00\\U00010000",
1058 
1059         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1060         "abcd\\uD800",
1061         "ef\\uDC00\\U00010000",
1062 
1063         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1064         "abcd\\uDC00",
1065         "ef\\uD800\\U00010000",
1066 
1067 #if !UCONFIG_NO_NORMALIZATION
1068         "[:^lccc=0:]", // Lead canonical class
1069         "\\u0300\\u0301",
1070         "abcd\\u00c0\\u00c5",
1071 
1072         "[:^tccc=0:]", // Trail canonical class
1073         "\\u0300\\u0301\\u00c0\\u00c5",
1074         "abcd",
1075 
1076         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1077         "\\u0300\\u0301\\u00c0\\u00c5",
1078         "abcd",
1079 
1080         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1081         "",
1082         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1083 
1084         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1085         "\\u0F73\\u0F75\\u0F81",
1086         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1087 #endif /* !UCONFIG_NO_NORMALIZATION */
1088 
1089         "[:Assigned:]",
1090         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1091         "\\u0558\\uFDD3\\uFFFE\\U00050005",
1092 
1093         // Script_Extensions, new in Unicode 6.0
1094         "[:scx=Arab:]",
1095         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1096         "\\u088F\\uFDEF\\uFEFE",
1097 
1098         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1099         // so scx-sc is missing U+FDF2.
1100         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1101         "\\u0640\\u064B\\u0650\\u0655",
1102         "\\uFDF2"
1103     };
1104 
1105     static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1106 
1107     for (int32_t i=0; i<DATA_LEN; i+=3) {
1108         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1109                           CharsToUnicodeString(DATA[i+2]));
1110     }
1111 }
1112 
1113 /**
1114   * Test that Posix style character classes [:digit:], etc.
1115   *   have the Unicode definitions from TR 18.
1116   */
TestPosixClasses()1117 void UnicodeSetTest::TestPosixClasses() {
1118     {
1119         UErrorCode status = U_ZERO_ERROR;
1120         UnicodeSet s1("[:alpha:]", status);
1121         UnicodeSet s2(u"\\p{Alphabetic}", status);
1122         TEST_ASSERT_SUCCESS(status);
1123         TEST_ASSERT(s1==s2);
1124     }
1125     {
1126         UErrorCode status = U_ZERO_ERROR;
1127         UnicodeSet s1("[:lower:]", status);
1128         UnicodeSet s2(u"\\p{lowercase}", status);
1129         TEST_ASSERT_SUCCESS(status);
1130         TEST_ASSERT(s1==s2);
1131     }
1132     {
1133         UErrorCode status = U_ZERO_ERROR;
1134         UnicodeSet s1("[:upper:]", status);
1135         UnicodeSet s2(u"\\p{Uppercase}", status);
1136         TEST_ASSERT_SUCCESS(status);
1137         TEST_ASSERT(s1==s2);
1138     }
1139     {
1140         UErrorCode status = U_ZERO_ERROR;
1141         UnicodeSet s1("[:punct:]", status);
1142         UnicodeSet s2(u"\\p{gc=Punctuation}", status);
1143         TEST_ASSERT_SUCCESS(status);
1144         TEST_ASSERT(s1==s2);
1145     }
1146     {
1147         UErrorCode status = U_ZERO_ERROR;
1148         UnicodeSet s1("[:digit:]", status);
1149         UnicodeSet s2(u"\\p{gc=DecimalNumber}", status);
1150         TEST_ASSERT_SUCCESS(status);
1151         TEST_ASSERT(s1==s2);
1152     }
1153     {
1154         UErrorCode status = U_ZERO_ERROR;
1155         UnicodeSet s1("[:xdigit:]", status);
1156         UnicodeSet s2(u"[\\p{DecimalNumber}\\p{HexDigit}]", status);
1157         TEST_ASSERT_SUCCESS(status);
1158         TEST_ASSERT(s1==s2);
1159     }
1160     {
1161         UErrorCode status = U_ZERO_ERROR;
1162         UnicodeSet s1("[:alnum:]", status);
1163         UnicodeSet s2(u"[\\p{Alphabetic}\\p{DecimalNumber}]", status);
1164         TEST_ASSERT_SUCCESS(status);
1165         TEST_ASSERT(s1==s2);
1166     }
1167     {
1168         UErrorCode status = U_ZERO_ERROR;
1169         UnicodeSet s1("[:space:]", status);
1170         UnicodeSet s2(u"\\p{Whitespace}", status);
1171         TEST_ASSERT_SUCCESS(status);
1172         TEST_ASSERT(s1==s2);
1173     }
1174     {
1175         UErrorCode status = U_ZERO_ERROR;
1176         UnicodeSet s1("[:blank:]", status);
1177         TEST_ASSERT_SUCCESS(status);
1178         UnicodeSet s2(u"[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1179             status);
1180         TEST_ASSERT_SUCCESS(status);
1181         TEST_ASSERT(s1==s2);
1182     }
1183     {
1184         UErrorCode status = U_ZERO_ERROR;
1185         UnicodeSet s1("[:cntrl:]", status);
1186         TEST_ASSERT_SUCCESS(status);
1187         UnicodeSet s2(u"\\p{Control}", status);
1188         TEST_ASSERT_SUCCESS(status);
1189         TEST_ASSERT(s1==s2);
1190     }
1191     {
1192         UErrorCode status = U_ZERO_ERROR;
1193         UnicodeSet s1("[:graph:]", status);
1194         TEST_ASSERT_SUCCESS(status);
1195         UnicodeSet s2(u"[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
1196         TEST_ASSERT_SUCCESS(status);
1197         TEST_ASSERT(s1==s2);
1198     }
1199     {
1200         UErrorCode status = U_ZERO_ERROR;
1201         UnicodeSet s1("[:print:]", status);
1202         TEST_ASSERT_SUCCESS(status);
1203         UnicodeSet s2(u"[[:graph:][:blank:]-[\\p{Control}]]", status);
1204         TEST_ASSERT_SUCCESS(status);
1205         TEST_ASSERT(s1==s2);
1206     }
1207 }
1208 /**
1209  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1210  */
TestClone()1211 void UnicodeSetTest::TestClone() {
1212     UErrorCode ec = U_ZERO_ERROR;
1213     UnicodeSet s("[abcxyz]", ec);
1214     UnicodeSet t(s);
1215     expectContainment(t, "abc", "def");
1216 }
1217 
1218 /**
1219  * Test the indexOf() and charAt() methods.
1220  */
TestIndexOf()1221 void UnicodeSetTest::TestIndexOf() {
1222     UErrorCode ec = U_ZERO_ERROR;
1223     UnicodeSet set("[a-cx-y3578]", ec);
1224     if (U_FAILURE(ec)) {
1225         errln("FAIL: UnicodeSet constructor");
1226         return;
1227     }
1228     for (int32_t i=0; i<set.size(); ++i) {
1229         UChar32 c = set.charAt(i);
1230         if (set.indexOf(c) != i) {
1231             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1232                 i, c, set.indexOf(c));
1233         }
1234     }
1235     UChar32 c = set.charAt(set.size());
1236     if (c != -1) {
1237         errln("FAIL: charAt(<out of range>) = %X", c);
1238     }
1239     int32_t j = set.indexOf(u'q');
1240     if (j != -1) {
1241         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1242     }
1243 }
1244 
1245 /**
1246  * Test closure API.
1247  */
TestCloseOver()1248 void UnicodeSetTest::TestCloseOver() {
1249     static constexpr char CASE[] = {(char)USET_CASE_INSENSITIVE};
1250     static constexpr char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1251     static constexpr char SIMPLE_CASE_INSENSITIVE[] = {(char)USET_SIMPLE_CASE_INSENSITIVE};
1252     static const char* DATA[] = {
1253         // selector, input, output
1254         CASE,
1255         "[aq\\u00DF{Bc}{bC}{Fi}]",
1256         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1257 
1258         SIMPLE_CASE_INSENSITIVE,
1259         "[aq\\u00DF{Bc}{bC}{Fi}]",
1260         "[aAqQ\\u00DF\\u1E9E{bc}{fi}]",
1261 
1262         CASE,
1263         "[\\u01F1]", // 'DZ'
1264         "[\\u01F1\\u01F2\\u01F3]",
1265 
1266         SIMPLE_CASE_INSENSITIVE,
1267         "[\\u01F1]", // 'DZ'
1268         "[\\u01F1\\u01F2\\u01F3]",
1269 
1270         CASE,
1271         "[\\u1FB4]",
1272         "[\\u1FB4{\\u03AC\\u03B9}]",
1273 
1274         SIMPLE_CASE_INSENSITIVE,
1275         "[\\u1FB4]",
1276         "[\\u1FB4]",
1277 
1278         CASE,
1279         "[{F\\uFB01}]",
1280         "[\\uFB03{ffi}]",
1281 
1282         CASE, // make sure binary search finds limits
1283         "[a\\uFF3A]",
1284         "[aA\\uFF3A\\uFF5A]",
1285 
1286         CASE,
1287         "[a-z]","[A-Za-z\\u017F\\u212A]",
1288 
1289         SIMPLE_CASE_INSENSITIVE,
1290         "[a-z]","[A-Za-z\\u017F\\u212A]",
1291 
1292         CASE,
1293         "[abc]","[A-Ca-c]",
1294         CASE,
1295         "[ABC]","[A-Ca-c]",
1296 
1297         CASE, "[i]", "[iI]",
1298 
1299         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1300         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1301 
1302         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1303 
1304         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1305 
1306         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1307 
1308         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1309 
1310         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1311 
1312         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1313 
1314         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1315         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1316 
1317         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1318 
1319         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1320 
1321         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1322 
1323 #if !UCONFIG_NO_FILE_IO
1324         CASE_MAPPINGS,
1325         "[aq\\u00DF{Bc}{bC}{Fi}]",
1326         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1327 #endif
1328 
1329         CASE_MAPPINGS,
1330         "[\\u01F1]", // 'DZ'
1331         "[\\u01F1\\u01F2\\u01F3]",
1332 
1333         CASE_MAPPINGS,
1334         "[a-z]",
1335         "[A-Za-z]",
1336 
1337         nullptr
1338     };
1339 
1340     UnicodeSet s;
1341     UnicodeSet t;
1342     UnicodeString buf;
1343     for (int32_t i=0; DATA[i]!=nullptr; i+=3) {
1344         int32_t selector = DATA[i][0];
1345         UnicodeString pat(DATA[i+1], -1, US_INV);
1346         UnicodeString exp(DATA[i+2], -1, US_INV);
1347 
1348         UErrorCode ec = U_ZERO_ERROR;
1349         s.applyPattern(pat, ec);
1350         s.closeOver(selector);
1351         t.applyPattern(exp, ec);
1352         if (U_FAILURE(ec)) {
1353             errln("FAIL: applyPattern failed");
1354             continue;
1355         }
1356         if (s == t) {
1357             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1358         } else {
1359             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1360                   s.toPattern(buf, true) + ", expected " + exp);
1361         }
1362     }
1363 
1364     // Test the pattern API
1365     UErrorCode ec = U_ZERO_ERROR;
1366     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, nullptr, ec);
1367     if (U_FAILURE(ec)) {
1368         errln("FAIL: applyPattern failed");
1369     } else {
1370         expectContainment(s, "abcABC", "defDEF");
1371     }
1372     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, nullptr, ec);
1373     if (U_FAILURE(ec)) {
1374         errln("FAIL: constructor failed");
1375     } else {
1376         expectContainment(v, "defDEF", "abcABC");
1377     }
1378     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, nullptr, ec);
1379     if (U_FAILURE(ec)) {
1380         errln("FAIL: construct w/case mappings failed");
1381     } else {
1382         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1383     }
1384 }
1385 
namespace {

/**
 * Records the mapping c->t in `additions` unless `closure` already holds it.
 *
 * @param closure   existing mappings; only read
 * @param c         source code point of the mapping
 * @param t         target code point of the mapping
 * @param additions receives {c, t} when closure has no such entry. It is not
 *                  checked for duplicates, so the same pair may be added more
 *                  than once across calls.
 */
void addIfAbsent(const std::unordered_multimap<UChar32, UChar32> &closure, UChar32 c, UChar32 t,
                 std::unordered_multimap<UChar32, UChar32> &additions) {
    // equal_range() is guaranteed to span every entry for c, whereas find()
    // may return any one of several equivalent entries.
    const auto range = closure.equal_range(c);
    for (auto it = range.first; it != range.second; ++it) {
        if (it->second == t) {
            return;  // present
        }
    }
    additions.insert({c, t});  // absent
}

}  // namespace
1403 
TestCloseOverSimpleCaseFolding()1404 void UnicodeSetTest::TestCloseOverSimpleCaseFolding() {
1405     IcuTestErrorCode errorCode(*this, "TestCloseOverSimpleCaseFolding");
1406     const UnicodeSet *sensitive =
1407         UnicodeSet::fromUSet(u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode));
1408     if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE) failed")) {
1409         return;
1410     }
1411     // Compute the scf=Simple_Case_Folding closure:
1412     // For each scf(c)=t, start with mappings c->t and t->c.
1413     std::unordered_multimap<UChar32, UChar32> closure;
1414     UnicodeSetIterator iter(*sensitive);
1415     while (iter.next()) {
1416         UChar32 c = iter.getCodepoint();
1417         UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
1418         if (scfChar != c) {
1419             closure.insert({c, scfChar});
1420             closure.insert({scfChar, c});
1421         }
1422     }
1423     // Complete the closure: Add mappings of mappings.
1424     for (;;) {
1425         std::unordered_multimap<UChar32, UChar32> additions;
1426         // for each mapping c->t
1427         for (auto mapping : closure) {
1428             UChar32 c = mapping.first;
1429             UChar32 t = mapping.second;
1430             // enumerate each t->u
1431             for (auto it = closure.find(t); it != closure.end() && it->first == t; ++it) {
1432                 UChar32 u = it->second;
1433                 if (u != c) {
1434                     addIfAbsent(closure, c, u, additions);
1435                     addIfAbsent(closure, u, c, additions);
1436                 }
1437             }
1438         }
1439         if (additions.empty()) {
1440             break;  // The closure is complete.
1441         }
1442         closure.insert(additions.begin(), additions.end());
1443     }
1444     // Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation.
1445     // Here we focus on single code points as input.
1446     // Other examples, including strings, are tested in TestCloseOver().
1447     int32_t errors = 0;
1448     iter.reset();
1449     UnicodeSet set, expected;
1450     while (iter.next()) {
1451         UChar32 c = iter.getCodepoint();
1452         // closeOver()
1453         set.clear().add(c);
1454         set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
1455         // From-first-principles implementation.
1456         expected.clear().add(c);
1457         for (auto it = closure.find(c); it != closure.end() && it->first == c; ++it) {
1458             expected.add(it->second);
1459         }
1460         // compare
1461         if (!checkEqual(expected, set, "closeOver() vs. test impl")) {
1462             errln("    c=U+%04X", c);
1463             if (++errors == 10) {
1464                 break;
1465             }
1466         }
1467     }
1468 }
1469 
TestCloseOverLargeSets()1470 void UnicodeSetTest::TestCloseOverLargeSets() {
1471     IcuTestErrorCode errorCode(*this, "TestCloseOverLargeSets");
1472     // Check that an optimization for large sets does not change the result.
1473 
1474     // Most code points except ones that are boring for case mappings.
1475     UnicodeSet manyCp(u"[^[:C:][:Ideographic:][:Hang:]]", errorCode);
1476     // Main Unihan block.
1477     constexpr UChar32 LARGE_START = 0x4E00;
1478     constexpr UChar32 LARGE_END = 0x9FFF;
1479 
1480     static constexpr int32_t OPTIONS[] = {
1481         USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE
1482     };
1483     UnicodeSet input, small, large;
1484     for (int32_t option : OPTIONS) {
1485         UnicodeSetIterator iter(manyCp);
1486         while (iter.next()) {
1487             UChar32 c = iter.getCodepoint();
1488             input.clear().add(c);
1489             small = input;
1490             small.closeOver(option);
1491             large = input;
1492             large.add(LARGE_START, LARGE_END);
1493             large.closeOver(option);
1494             large.remove(LARGE_START, LARGE_END);
1495             if (!checkEqual(small, large, "small != large")) {
1496                 errln("    option=%d c=U+%04X", option, c);
1497                 break;
1498             }
1499         }
1500     }
1501 }
1502 
TestEscapePattern()1503 void UnicodeSetTest::TestEscapePattern() {
1504     const char pattern[] =
1505         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1506     const char exp[] =
1507         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1508     // We test this with two passes; in the second pass we
1509     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1510     // this fails -- which is what we expect.
1511     for (int32_t pass=1; pass<=2; ++pass) {
1512         UErrorCode ec = U_ZERO_ERROR;
1513         UnicodeString pat(pattern, -1, US_INV);
1514         if (pass==2) {
1515             pat = pat.unescape();
1516         }
1517         // Pattern is only good for pass 1
1518         UBool isPatternValid = (pass==1);
1519 
1520         UnicodeSet set(pat, ec);
1521         if (U_SUCCESS(ec) != isPatternValid){
1522             errln((UnicodeString)"FAIL: applyPattern(" +
1523                   escape(pat) + ") => " +
1524                   u_errorName(ec));
1525             continue;
1526         }
1527         if (U_FAILURE(ec)) {
1528             continue;
1529         }
1530         if (set.contains(u'\u0644')){
1531             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1532         }
1533 
1534         UnicodeString newpat;
1535         set.toPattern(newpat, true);
1536         if (newpat == UnicodeString(exp, -1, US_INV)) {
1537             logln(escape(pat) + " => " + newpat);
1538         } else {
1539             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1540         }
1541 
1542         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1543             UnicodeString str("Range ");
1544             str.append((char16_t)(u'0' + i))
1545                 .append(": ")
1546                 .append((UChar32)set.getRangeStart(i))
1547                 .append(" - ")
1548                 .append((UChar32)set.getRangeEnd(i));
1549             str = str + " (" + set.getRangeStart(i) + " - " +
1550                 set.getRangeEnd(i) + ")";
1551             if (set.getRangeStart(i) < 0) {
1552                 errln((UnicodeString)"FAIL: " + escape(str));
1553             } else {
1554                 logln(escape(str));
1555             }
1556         }
1557     }
1558 }
1559 
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1560 void UnicodeSetTest::expectRange(const UnicodeString& label,
1561                                  const UnicodeSet& set,
1562                                  UChar32 start, UChar32 end) {
1563     UnicodeSet exp(start, end);
1564     UnicodeString pat;
1565     if (set == exp) {
1566         logln(label + " => " + set.toPattern(pat, true));
1567     } else {
1568         UnicodeString xpat;
1569         errln((UnicodeString)"FAIL: " + label + " => " +
1570               set.toPattern(pat, true) +
1571               ", expected " + exp.toPattern(xpat, true));
1572     }
1573 }
1574 
TestInvalidCodePoint()1575 void UnicodeSetTest::TestInvalidCodePoint() {
1576 
1577     const UChar32 DATA[] = {
1578         // Test range             Expected range
1579         0, 0x10FFFF,              0, 0x10FFFF,
1580         (UChar32)-1, 8,           0, 8,
1581         8, 0x110000,              8, 0x10FFFF
1582     };
1583     const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1584 
1585     UnicodeString pat;
1586     int32_t i;
1587 
1588     for (i=0; i<DATA_LENGTH; i+=4) {
1589         UChar32 start  = DATA[i];
1590         UChar32 end    = DATA[i+1];
1591         UChar32 xstart = DATA[i+2];
1592         UChar32 xend   = DATA[i+3];
1593 
1594         // Try various API using the test code points
1595 
1596         UnicodeSet set(start, end);
1597         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1598                     set, xstart, xend);
1599 
1600         set.clear();
1601         set.set(start, end);
1602         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1603                     set, xstart, xend);
1604 
1605         UBool b = set.contains(start);
1606         b = set.contains(start, end);
1607         b = set.containsNone(start, end);
1608         b = set.containsSome(start, end);
1609         (void)b;   // Suppress set but not used warning.
1610 
1611         /*int32_t index = set.indexOf(start);*/
1612 
1613         set.clear();
1614         set.add(start);
1615         set.add(start, end);
1616         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1617                     set, xstart, xend);
1618 
1619         set.set(0, 0x10FFFF);
1620         set.retain(start, end);
1621         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1622                     set, xstart, xend);
1623         set.retain(start);
1624 
1625         set.set(0, 0x10FFFF);
1626         set.remove(start);
1627         set.remove(start, end);
1628         set.complement();
1629         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1630                     set, xstart, xend);
1631 
1632         set.set(0, 0x10FFFF);
1633         set.complement(start, end);
1634         set.complement();
1635         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1636                     set, xstart, xend);
1637         set.complement(start);
1638     }
1639 
1640     const UChar32 DATA2[] = {
1641         0,
1642         0x10FFFF,
1643         (UChar32)-1,
1644         0x110000
1645     };
1646     const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1647 
1648     for (i=0; i<DATA2_LENGTH; ++i) {
1649         UChar32 c = DATA2[i], end = 0x10FFFF;
1650         UBool valid = (c >= 0 && c <= 0x10FFFF);
1651 
1652         UnicodeSet set(0, 0x10FFFF);
1653 
1654         // For single-codepoint contains, invalid codepoints are NOT contained
1655         UBool b = set.contains(c);
1656         if (b == valid) {
1657             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1658                   ") = " + b);
1659         } else {
1660             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1661                   ") = " + b);
1662         }
1663 
1664         // For codepoint range contains, containsNone, and containsSome,
1665         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1666         b = set.contains(c, end);
1667         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1668               "," + end + ") = " + b);
1669 
1670         b = set.containsNone(c, end);
1671         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1672               "," + end + ") = " + b);
1673 
1674         b = set.containsSome(c, end);
1675         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1676               "," + end + ") = " + b);
1677 
1678         int32_t index = set.indexOf(c);
1679         if ((index >= 0) == valid) {
1680             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1681                   ") = " + index);
1682         } else {
1683             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1684                   ") = " + index);
1685         }
1686     }
1687 }
1688 
1689 // Used by TestSymbolTable
1690 class TokenSymbolTable : public SymbolTable {
1691 public:
1692     Hashtable contents;
1693 
TokenSymbolTable(UErrorCode & ec)1694     TokenSymbolTable(UErrorCode& ec) : contents(false, ec) {
1695         contents.setValueDeleter(uprv_deleteUObject);
1696     }
1697 
~TokenSymbolTable()1698     ~TokenSymbolTable() {}
1699 
1700     /**
1701      * (Non-SymbolTable API) Add the given variable and value to
1702      * the table.  Variable should NOT contain leading '$'.
1703      */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1704     void add(const UnicodeString& var, const UnicodeString& value,
1705              UErrorCode& ec) {
1706         if (U_SUCCESS(ec)) {
1707             contents.put(var, new UnicodeString(value), ec);
1708         }
1709     }
1710 
1711     /**
1712      * SymbolTable API
1713      */
lookup(const UnicodeString & s) const1714     virtual const UnicodeString* lookup(const UnicodeString& s) const override {
1715         return static_cast<const UnicodeString*>(contents.get(s));
1716     }
1717 
1718     /**
1719      * SymbolTable API
1720      */
lookupMatcher(UChar32) const1721     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const override {
1722         return nullptr;
1723     }
1724 
1725     /**
1726      * SymbolTable API
1727      */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1728     virtual UnicodeString parseReference(const UnicodeString& text,
1729                                          ParsePosition& pos, int32_t limit) const override {
1730         int32_t start = pos.getIndex();
1731         int32_t i = start;
1732         UnicodeString result;
1733         while (i < limit) {
1734             char16_t c = text.charAt(i);
1735             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1736                 break;
1737             }
1738             ++i;
1739         }
1740         if (i == start) { // No valid name chars
1741             return result; // Indicate failure with empty string
1742         }
1743         pos.setIndex(i);
1744         text.extractBetween(start, i, result);
1745         return result;
1746     }
1747 };
1748 
TestSymbolTable()1749 void UnicodeSetTest::TestSymbolTable() {
1750     // Multiple test cases can be set up here.  Each test case
1751     // is terminated by null:
1752     // var, value, var, value,..., input pat., exp. output pat., null
1753     const char* DATA[] = {
1754         "us", "a-z", "[0-1$us]", "[0-1a-z]", nullptr,
1755         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", nullptr,
1756         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", nullptr,
1757         nullptr
1758     };
1759 
1760     for (int32_t i=0; DATA[i]!=nullptr; ++i) {
1761         UErrorCode ec = U_ZERO_ERROR;
1762         TokenSymbolTable sym(ec);
1763         if (U_FAILURE(ec)) {
1764             errln("FAIL: couldn't construct TokenSymbolTable");
1765             continue;
1766         }
1767 
1768         // Set up variables
1769         while (DATA[i+2] != nullptr) {
1770             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1771             if (U_FAILURE(ec)) {
1772                 errln("FAIL: couldn't add to TokenSymbolTable");
1773                 continue;
1774             }
1775             i += 2;
1776         }
1777 
1778         // Input pattern and expected output pattern
1779         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1780         i += 2;
1781 
1782         ParsePosition pos(0);
1783         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1784         if (U_FAILURE(ec)) {
1785             errln("FAIL: couldn't construct UnicodeSet");
1786             continue;
1787         }
1788 
1789         // results
1790         if (pos.getIndex() != inpat.length()) {
1791             errln((UnicodeString)"Failed to read to end of string \""
1792                   + inpat + "\": read to "
1793                   + pos.getIndex() + ", length is "
1794                   + inpat.length());
1795         }
1796 
1797         UnicodeSet us2(exppat, ec);
1798         if (U_FAILURE(ec)) {
1799             errln("FAIL: couldn't construct expected UnicodeSet");
1800             continue;
1801         }
1802 
1803         UnicodeString a, b;
1804         if (us != us2) {
1805             errln((UnicodeString)"Failed, got " + us.toPattern(a, true) +
1806                   ", expected " + us2.toPattern(b, true));
1807         } else {
1808             logln((UnicodeString)"Ok, got " + us.toPattern(a, true));
1809         }
1810     }
1811 }
1812 
TestSurrogate()1813 void UnicodeSetTest::TestSurrogate() {
1814     const char* DATA[] = {
1815         // These should all behave identically
1816         "[abc\\uD800\\uDC00]",
1817         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1818         "[abc\\U00010000]",
1819         nullptr
1820     };
1821     for (int i = 0; DATA[i] != nullptr; ++i) {
1822         UErrorCode ec = U_ZERO_ERROR;
1823         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1824         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1825         UnicodeSet set(str, ec);
1826         if (U_FAILURE(ec)) {
1827             errln("FAIL: UnicodeSet constructor");
1828             continue;
1829         }
1830         expectContainment(set,
1831                           CharsToUnicodeString("abc\\U00010000"),
1832                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1833         if (set.size() != 4) {
1834             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1835                   set.size() + ", expected 4");
1836         }
1837 
1838         {
1839           UErrorCode subErr = U_ZERO_ERROR;
1840           checkRoundTrip(set);
1841           checkSerializeRoundTrip(set, subErr);
1842         }
1843     }
1844 }
1845 
TestExhaustive()1846 void UnicodeSetTest::TestExhaustive() {
1847     // exhaustive tests. Simulate UnicodeSets with integers.
1848     // That gives us very solid tests (except for large memory tests).
1849 
1850     int32_t limit = 128;
1851 
1852     UnicodeSet x, y, z, aa;
1853 
1854     for (int32_t i = 0; i < limit; ++i) {
1855         bitsToSet(i, x);
1856         logln((UnicodeString)"Testing " + i + ", " + x);
1857         _testComplement(i, x, y);
1858 
1859         UnicodeSet &toTest = bitsToSet(i, aa);
1860 
1861         // AS LONG AS WE ARE HERE, check roundtrip
1862         checkRoundTrip(toTest);
1863         UErrorCode ec = U_ZERO_ERROR;
1864         checkSerializeRoundTrip(toTest, ec);
1865 
1866         for (int32_t j = 0; j < limit; ++j) {
1867             _testAdd(i,j,  x,y,z);
1868             _testXor(i,j,  x,y,z);
1869             _testRetain(i,j,  x,y,z);
1870             _testRemove(i,j,  x,y,z);
1871         }
1872     }
1873 }
1874 
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1875 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1876     bitsToSet(a, x);
1877     z = x;
1878     z.complement();
1879     int32_t c = setToBits(z);
1880     if (c != (~a)) {
1881         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1882         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1883     }
1884     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1885 }
1886 
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1887 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1888     bitsToSet(a, x);
1889     bitsToSet(b, y);
1890     z = x;
1891     z.addAll(y);
1892     int32_t c = setToBits(z);
1893     if (c != (a | b)) {
1894         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1895         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1896     }
1897     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1898 }
1899 
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1900 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1901     bitsToSet(a, x);
1902     bitsToSet(b, y);
1903     z = x;
1904     z.retainAll(y);
1905     int32_t c = setToBits(z);
1906     if (c != (a & b)) {
1907         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1908         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1909     }
1910     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1911 }
1912 
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1913 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1914     bitsToSet(a, x);
1915     bitsToSet(b, y);
1916     z = x;
1917     z.removeAll(y);
1918     int32_t c = setToBits(z);
1919     if (c != (a &~ b)) {
1920         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1921         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1922     }
1923     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1924 }
1925 
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1926 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1927     bitsToSet(a, x);
1928     bitsToSet(b, y);
1929     z = x;
1930     z.complementAll(y);
1931     int32_t c = setToBits(z);
1932     if (c != (a ^ b)) {
1933         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1934         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1935     }
1936     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1937 }
1938 
1939 /**
1940  * Check that ranges are monotonically increasing and non-
1941  * overlapping.
1942  */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1943 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1944     int32_t n = set.getRangeCount();
1945     if (n < 0) {
1946         errln((UnicodeString)"FAIL result of " + msg +
1947               ": range count should be >= 0 but is " +
1948               n /*+ " for " + set.toPattern())*/);
1949         return;
1950     }
1951     UChar32 last = 0;
1952     for (int32_t i=0; i<n; ++i) {
1953         UChar32 start = set.getRangeStart(i);
1954         UChar32 end = set.getRangeEnd(i);
1955         if (start > end) {
1956             errln((UnicodeString)"FAIL result of " + msg +
1957                   ": range " + (i+1) +
1958                   " start > end: " + (int)start + ", " + (int)end +
1959                   " for " + set);
1960         }
1961         if (i > 0 && start <= last) {
1962             errln((UnicodeString)"FAIL result of " + msg +
1963                   ": range " + (i+1) +
1964                   " overlaps previous range: " + (int)start + ", " + (int)end +
1965                   " for " + set);
1966         }
1967         last = end;
1968     }
1969 }
1970 
1971 /**
1972  * Convert a bitmask to a UnicodeSet.
1973  */
bitsToSet(int32_t a,UnicodeSet & result)1974 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1975     result.clear();
1976     for (UChar32 i = 0; i < 32; ++i) {
1977         if ((a & (1<<i)) != 0) {
1978             result.add(i);
1979         }
1980     }
1981     return result;
1982 }
1983 
1984 /**
1985  * Convert a UnicodeSet to a bitmask.  Only the characters
1986  * U+0000 to U+0020 are represented in the bitmask.
1987  */
setToBits(const UnicodeSet & x)1988 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1989     int32_t result = 0;
1990     for (int32_t i = 0; i < 32; ++i) {
1991         if (x.contains((UChar32)i)) {
1992             result |= (1<<i);
1993         }
1994     }
1995     return result;
1996 }
1997 
1998 /**
1999  * Return the representation of an inversion list based UnicodeSet
2000  * as a pairs list.  Ranges are listed in ascending Unicode order.
2001  * For example, the set [a-zA-M3] is represented as "33AMaz".
2002  */
getPairs(const UnicodeSet & set)2003 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
2004     UnicodeString pairs;
2005     for (int32_t i=0; i<set.getRangeCount(); ++i) {
2006         UChar32 start = set.getRangeStart(i);
2007         UChar32 end = set.getRangeEnd(i);
2008         if (end > 0xFFFF) {
2009             end = 0xFFFF;
2010             i = set.getRangeCount(); // Should be unnecessary
2011         }
2012         pairs.append((char16_t)start).append((char16_t)end);
2013     }
2014     return pairs;
2015 }
2016 
2017 /**
2018  * Basic consistency check for a few items.
2019  * That the iterator works, and that we can create a pattern and
2020  * get the same thing back
2021  */
checkRoundTrip(const UnicodeSet & s)2022 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
2023     {
2024         UnicodeSet t(s);
2025         checkEqual(s, t, "copy ct");
2026     }
2027 
2028     {
2029         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
2030         t = s;
2031         checkEqual(s, t, "operator=");
2032     }
2033 
2034     {
2035         UnicodeSet t;
2036         copyWithIterator(t, s, false);
2037         checkEqual(s, t, "iterator roundtrip");
2038     }
2039 
2040     {
2041         UnicodeSet t;
2042         copyWithIterator(t, s, true); // try range
2043         checkEqual(s, t, "iterator roundtrip");
2044     }
2045 
2046     {
2047         UnicodeSet t;
2048         UnicodeString pat;
2049         UErrorCode ec = U_ZERO_ERROR;
2050         s.toPattern(pat, false);
2051         t.applyPattern(pat, ec);
2052         if (U_FAILURE(ec)) {
2053             errln("FAIL: toPattern(escapeUnprintable=false), applyPattern - %s", u_errorName(ec));
2054             return;
2055         } else {
2056             checkEqual(s, t, "toPattern(false)");
2057         }
2058     }
2059 
2060     {
2061         UnicodeSet t;
2062         UnicodeString pat;
2063         UErrorCode ec = U_ZERO_ERROR;
2064         s.toPattern(pat, true);
2065         t.applyPattern(pat, ec);
2066         if (U_FAILURE(ec)) {
2067             errln("FAIL: toPattern(escapeUnprintable=true), applyPattern - %s", u_errorName(ec));
2068             return;
2069         } else {
2070             checkEqual(s, t, "toPattern(true)");
2071         }
2072     }
2073 }
2074 
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)2075 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
2076   if(U_FAILURE(status)) return;
2077   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2078   if(status == U_BUFFER_OVERFLOW_ERROR) {
2079     status = U_ZERO_ERROR;
2080     serializeBuffer.resize(len);
2081     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2082     // let 2nd error stand
2083   }
2084   if(U_FAILURE(status)) {
2085     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
2086     return;
2087   }
2088   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
2089   if(U_FAILURE(status)) {
2090     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
2091     return;
2092   }
2093 
2094   checkEqual(t, deserialized, "Set was unequal when deserialized");
2095 }
2096 
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)2097 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
2098     t.clear();
2099     UnicodeSetIterator it(s);
2100     if (withRange) {
2101         while (it.nextRange()) {
2102             if (it.isString()) {
2103                 t.add(it.getString());
2104             } else {
2105                 t.add(it.getCodepoint(), it.getCodepointEnd());
2106             }
2107         }
2108     } else {
2109         while (it.next()) {
2110             if (it.isString()) {
2111                 t.add(it.getString());
2112             } else {
2113                 t.add(it.getCodepoint());
2114             }
2115         }
2116     }
2117 }
2118 
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2119 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2120   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2121   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2122     UnicodeString source; s.toPattern(source, true);
2123     UnicodeString result; t.toPattern(result, true);
2124     if (s != t) {
2125         errln((UnicodeString)"FAIL: " + message
2126               + "; source = " + source
2127               + "; result = " + result
2128               );
2129         return false;
2130     } else {
2131         logln((UnicodeString)"Ok: " + message
2132               + "; source = " + source
2133               + "; result = " + result
2134               );
2135     }
2136     return true;
2137 }
2138 
2139 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2140 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2141                                   const UnicodeString& charsIn,
2142                                   const UnicodeString& charsOut) {
2143     UErrorCode ec = U_ZERO_ERROR;
2144     UnicodeSet set(pat, ec);
2145     if (U_FAILURE(ec)) {
2146         dataerrln((UnicodeString)"FAIL: pattern \"" +
2147               pat + "\" => " + u_errorName(ec));
2148         return;
2149     }
2150     expectContainment(set, pat, charsIn, charsOut);
2151 }
2152 
2153 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2154 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2155                                   const UnicodeString& charsIn,
2156                                   const UnicodeString& charsOut) {
2157     UnicodeString pat;
2158     set.toPattern(pat);
2159     expectContainment(set, pat, charsIn, charsOut);
2160 }
2161 
2162 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2163 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2164                                   const UnicodeString& setName,
2165                                   const UnicodeString& charsIn,
2166                                   const UnicodeString& charsOut) {
2167     UnicodeString bad;
2168     UChar32 c;
2169     int32_t i;
2170 
2171     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2172         c = charsIn.char32At(i);
2173         if (!set.contains(c)) {
2174             bad.append(c);
2175         }
2176     }
2177     if (bad.length() > 0) {
2178         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2179               ", expected containment of " + prettify(charsIn));
2180     } else {
2181         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2182     }
2183 
2184     bad.truncate(0);
2185     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2186         c = charsOut.char32At(i);
2187         if (set.contains(c)) {
2188             bad.append(c);
2189         }
2190     }
2191     if (bad.length() > 0) {
2192         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2193               ", expected non-containment of " + prettify(charsOut));
2194     } else {
2195         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2196     }
2197 }
2198 
2199 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2200 UnicodeSetTest::expectPattern(UnicodeSet& set,
2201                               const UnicodeString& pattern,
2202                               const UnicodeString& expectedPairs){
2203     UErrorCode status = U_ZERO_ERROR;
2204     set.applyPattern(pattern, status);
2205     if (U_FAILURE(status)) {
2206         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2207               "\") failed");
2208         return;
2209     } else {
2210         if (getPairs(set) != expectedPairs ) {
2211             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2212                   "\") => pairs \"" +
2213                   escape(getPairs(set)) + "\", expected \"" +
2214                   escape(expectedPairs) + "\"");
2215         } else {
2216             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2217                   "\") => pairs \"" +
2218                   escape(getPairs(set)) + "\"");
2219         }
2220     }
2221     // the result of calling set.toPattern(), which is the string representation of
2222     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2223     // will produce another set that is equal to this one.
2224     UnicodeString temppattern;
2225     set.toPattern(temppattern);
2226     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2227     if (U_FAILURE(status)) {
2228         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2229         return;
2230     }
2231     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2232         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2233             escape(getPairs(set)) + "\""));
2234     } else{
2235         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2236     }
2237 
2238     delete tempset;
2239 
2240 }
2241 
2242 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2243 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2244     if (getPairs(set) != expectedPairs) {
2245         errln(UnicodeString("FAIL: Expected pair list \"") +
2246               escape(expectedPairs) + "\", got \"" +
2247               escape(getPairs(set)) + "\"");
2248     }
2249 }
2250 
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2251 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2252                                      const UnicodeString& expPat,
2253                                      const char** expStrings) {
2254     UnicodeString pat;
2255     set.toPattern(pat, true);
2256     if (pat == expPat) {
2257         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2258     } else {
2259         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2260         return;
2261     }
2262     if (expStrings == nullptr) {
2263         return;
2264     }
2265     UBool in = true;
2266     for (int32_t i=0; expStrings[i] != nullptr; ++i) {
2267         if (expStrings[i] == NOT) { // sic; pointer comparison
2268             in = false;
2269             continue;
2270         }
2271         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2272         UBool contained = set.contains(s);
2273         if (contained == in) {
2274             logln((UnicodeString)"Ok: " + expPat +
2275                   (contained ? " contains {" : " does not contain {") +
2276                   escape(expStrings[i]) + "}");
2277         } else {
2278             errln((UnicodeString)"FAIL: " + expPat +
2279                   (contained ? " contains {" : " does not contain {") +
2280                   escape(expStrings[i]) + "}");
2281         }
2282     }
2283 }
2284 
/**
 * Maps a value to a hexadecimal digit as a UTF-16 code unit: values below
 * 10 are offset from u'0', all others from u'A' - 10 (so 10 gives u'A').
 * No range checking is done.
 */
static char16_t toHexString(int32_t i) {
    const int32_t base = (i < 10) ? u'0' : (u'A' - 10);
    return static_cast<char16_t>(i + base);
}
2286 
2287 void
doAssert(UBool condition,const char * message)2288 UnicodeSetTest::doAssert(UBool condition, const char *message)
2289 {
2290     if (!condition) {
2291         errln(UnicodeString("ERROR : ") + message);
2292     }
2293 }
2294 
2295 UnicodeString
escape(const UnicodeString & s)2296 UnicodeSetTest::escape(const UnicodeString& s) {
2297     UnicodeString buf;
2298     for (int32_t i=0; i<s.length(); )
2299     {
2300         UChar32 c = s.char32At(i);
2301         if (0x0020 <= c && c <= 0x007F) {
2302             buf += c;
2303         } else {
2304             if (c <= 0xFFFF) {
2305                 buf += u"\\u";
2306             } else {
2307                 buf += u"\\U";
2308                 buf += toHexString((c & 0xF0000000) >> 28);
2309                 buf += toHexString((c & 0x0F000000) >> 24);
2310                 buf += toHexString((c & 0x00F00000) >> 20);
2311                 buf += toHexString((c & 0x000F0000) >> 16);
2312             }
2313             buf += toHexString((c & 0xF000) >> 12);
2314             buf += toHexString((c & 0x0F00) >> 8);
2315             buf += toHexString((c & 0x00F0) >> 4);
2316             buf += toHexString(c & 0x000F);
2317         }
2318         i += U16_LENGTH(c);
2319     }
2320     return buf;
2321 }
2322 
TestFreezable()2323 void UnicodeSetTest::TestFreezable() {
2324     UErrorCode errorCode=U_ZERO_ERROR;
2325     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2326     UnicodeSet idSet(idPattern, errorCode);
2327     if(U_FAILURE(errorCode)) {
2328         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2329         return;
2330     }
2331 
2332     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2333     UnicodeSet wsSet(wsPattern, errorCode);
2334     if(U_FAILURE(errorCode)) {
2335         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2336         return;
2337     }
2338 
2339     idSet.add(idPattern);
2340     UnicodeSet frozen(idSet);
2341     frozen.freeze();
2342 
2343     if(idSet.isFrozen() || !frozen.isFrozen()) {
2344         errln("FAIL: isFrozen() is wrong");
2345     }
2346     if(frozen!=idSet || !(frozen==idSet)) {
2347         errln("FAIL: a copy-constructed frozen set differs from its original");
2348     }
2349 
2350     frozen=wsSet;
2351     if(frozen!=idSet || !(frozen==idSet)) {
2352         errln("FAIL: a frozen set was modified by operator=");
2353     }
2354 
2355     UnicodeSet frozen2(frozen);
2356     if(frozen2!=frozen || frozen2!=idSet) {
2357         errln("FAIL: a copied frozen set differs from its frozen original");
2358     }
2359     if(!frozen2.isFrozen()) {
2360         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2361     }
2362     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2363     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2364         errln("FAIL: UnicodeSet(5, 55) failed");
2365     }
2366     frozen3=frozen;
2367     if(!frozen3.isFrozen()) {
2368         errln("FAIL: copying a frozen set results in a thawed one");
2369     }
2370 
2371     UnicodeSet *cloned=frozen.clone();
2372     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2373         errln("FAIL: clone() failed");
2374     }
2375     cloned->add(0xd802, 0xd805);
2376     if(cloned->containsSome(0xd802, 0xd805)) {
2377         errln("FAIL: unable to modify clone");
2378     }
2379     delete cloned;
2380 
2381     UnicodeSet *thawed=frozen.cloneAsThawed();
2382     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2383         errln("FAIL: cloneAsThawed() failed");
2384     }
2385     thawed->add(0xd802, 0xd805);
2386     if(!thawed->contains(0xd802, 0xd805)) {
2387         errln("FAIL: unable to modify thawed clone");
2388     }
2389     delete thawed;
2390 
2391     frozen.set(5, 55);
2392     if(frozen!=idSet || !(frozen==idSet)) {
2393         errln("FAIL: UnicodeSet::set() modified a frozen set");
2394     }
2395 
2396     frozen.clear();
2397     if(frozen!=idSet || !(frozen==idSet)) {
2398         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2399     }
2400 
2401     frozen.closeOver(USET_CASE_INSENSITIVE);
2402     if(frozen!=idSet || !(frozen==idSet)) {
2403         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2404     }
2405 
2406     frozen.compact();
2407     if(frozen!=idSet || !(frozen==idSet)) {
2408         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2409     }
2410 
2411     ParsePosition pos;
2412     frozen.
2413         applyPattern(wsPattern, errorCode).
2414         applyPattern(wsPattern, USET_IGNORE_SPACE, nullptr, errorCode).
2415         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, nullptr, errorCode).
2416         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2417         applyPropertyAlias(u"Assigned", UnicodeString(), errorCode);
2418     if(frozen!=idSet || !(frozen==idSet)) {
2419         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2420     }
2421 
2422     frozen.
2423         add(0xd800).
2424         add(0xd802, 0xd805).
2425         add(wsPattern).
2426         addAll(idPattern).
2427         addAll(wsSet);
2428     if(frozen!=idSet || !(frozen==idSet)) {
2429         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2430     }
2431 
2432     frozen.
2433         retain(0x62).
2434         retain(0x64, 0x69).
2435         retainAll(wsPattern).
2436         retainAll(wsSet);
2437     if(frozen!=idSet || !(frozen==idSet)) {
2438         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2439     }
2440 
2441     frozen.
2442         remove(0x62).
2443         remove(0x64, 0x69).
2444         remove(idPattern).
2445         removeAll(idPattern).
2446         removeAll(idSet);
2447     if(frozen!=idSet || !(frozen==idSet)) {
2448         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2449     }
2450 
2451     frozen.
2452         complement().
2453         complement(0x62).
2454         complement(0x64, 0x69).
2455         complement(idPattern).
2456         complementAll(idPattern).
2457         complementAll(idSet);
2458     if(frozen!=idSet || !(frozen==idSet)) {
2459         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2460     }
2461 }
2462 
2463 // Test span() etc. -------------------------------------------------------- ***
2464 
2465 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2466 static int32_t
appendUTF8(const char16_t * s,int32_t length,char * t,int32_t capacity)2467 appendUTF8(const char16_t *s, int32_t length, char *t, int32_t capacity) {
2468     UErrorCode errorCode=U_ZERO_ERROR;
2469     int32_t length8=0;
2470     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2471     if(U_SUCCESS(errorCode)) {
2472         return length8;
2473     } else {
2474         // The string contains an unpaired surrogate.
2475         // Ignore this string.
2476         return 0;
2477     }
2478 }
2479 
2480 class UnicodeSetWithStringsIterator;
2481 
2482 // Make the strings in a UnicodeSet easily accessible.
2483 class UnicodeSetWithStrings {
2484 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2485     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2486             set(normalSet), stringsLength(0), hasSurrogates(false) {
2487         int32_t size=set.size();
2488         if(size>0 && set.charAt(size-1)<0) {
2489             // If a set's last element is not a code point, then it must contain strings.
2490             // Iterate over the set, skip all code point ranges, and cache the strings.
2491             // Convert them to UTF-8 for spanUTF8().
2492             UnicodeSetIterator iter(set);
2493             const UnicodeString *s;
2494             char *s8=utf8;
2495             int32_t length8, utf8Count=0;
2496             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2497                 if(iter.isString()) {
2498                     // Store the pointer to the set's string element
2499                     // which we happen to know is a stable pointer.
2500                     strings[stringsLength]=s=&iter.getString();
2501                     utf8Count+=
2502                         utf8Lengths[stringsLength]=length8=
2503                         appendUTF8(s->getBuffer(), s->length(),
2504                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2505                     if(length8==0) {
2506                         hasSurrogates=true;  // Contains unpaired surrogates.
2507                     }
2508                     s8+=length8;
2509                     ++stringsLength;
2510                 }
2511             }
2512         }
2513     }
2514 
getSet() const2515     const UnicodeSet &getSet() const {
2516         return set;
2517     }
2518 
hasStrings() const2519     UBool hasStrings() const {
2520         return (UBool)(stringsLength>0);
2521     }
2522 
hasStringsWithSurrogates() const2523     UBool hasStringsWithSurrogates() const {
2524         return hasSurrogates;
2525     }
2526 
2527 private:
2528     friend class UnicodeSetWithStringsIterator;
2529 
2530     const UnicodeSet &set;
2531 
2532     const UnicodeString *strings[20];
2533     int32_t stringsLength;
2534     UBool hasSurrogates;
2535 
2536     char utf8[1024];
2537     int32_t utf8Lengths[20];
2538 };
2539 
2540 class UnicodeSetWithStringsIterator {
2541 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2542     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2543             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2544     }
2545 
reset()2546     void reset() {
2547         nextStringIndex=nextUTF8Start=0;
2548     }
2549 
nextString()2550     const UnicodeString *nextString() {
2551         if(nextStringIndex<fSet.stringsLength) {
2552             return fSet.strings[nextStringIndex++];
2553         } else {
2554             return nullptr;
2555         }
2556     }
2557 
2558     // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2559     const char *nextUTF8(int32_t &length) {
2560         if(nextStringIndex<fSet.stringsLength) {
2561             const char *s8=fSet.utf8+nextUTF8Start;
2562             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2563             return s8;
2564         } else {
2565             length=0;
2566             return nullptr;
2567         }
2568     }
2569 
2570 private:
2571     const UnicodeSetWithStrings &fSet;
2572     int32_t nextStringIndex;
2573     int32_t nextUTF8Start;
2574 };
2575 
2576 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2577 // at code point boundaries.
2578 // That is, each edge of a match must not be in the middle of a surrogate pair.
2579 static inline UBool
matches16CPB(const char16_t * s,int32_t start,int32_t limit,const UnicodeString & t)2580 matches16CPB(const char16_t *s, int32_t start, int32_t limit, const UnicodeString &t) {
2581     s+=start;
2582     limit-=start;
2583     int32_t length=t.length();
2584     return 0==t.compare(s, length) &&
2585            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2586            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2587 }
2588 
2589 // Implement span() with contains() for comparison.
containsSpanUTF16(const UnicodeSetWithStrings & set,const char16_t * s,int32_t length,USetSpanCondition spanCondition)2590 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const char16_t *s, int32_t length,
2591                                  USetSpanCondition spanCondition) {
2592     const UnicodeSet &realSet(set.getSet());
2593     if(!set.hasStrings()) {
2594         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2595             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2596         }
2597 
2598         UChar32 c;
2599         int32_t start=0, prev;
2600         while((prev=start)<length) {
2601             U16_NEXT(s, start, length, c);
2602             if(realSet.contains(c)!=spanCondition) {
2603                 break;
2604             }
2605         }
2606         return prev;
2607     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2608         UnicodeSetWithStringsIterator iter(set);
2609         UChar32 c;
2610         int32_t start, next;
2611         for(start=next=0; start<length;) {
2612             U16_NEXT(s, next, length, c);
2613             if(realSet.contains(c)) {
2614                 break;
2615             }
2616             const UnicodeString *str;
2617             iter.reset();
2618             while((str=iter.nextString())!=nullptr) {
2619                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2620                     // spanNeedsStrings=true;
2621                     return start;
2622                 }
2623             }
2624             start=next;
2625         }
2626         return start;
2627     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2628         UnicodeSetWithStringsIterator iter(set);
2629         UChar32 c;
2630         int32_t start, next, maxSpanLimit=0;
2631         for(start=next=0; start<length;) {
2632             U16_NEXT(s, next, length, c);
2633             if(!realSet.contains(c)) {
2634                 next=start;  // Do not span this single, not-contained code point.
2635             }
2636             const UnicodeString *str;
2637             iter.reset();
2638             while((str=iter.nextString())!=nullptr) {
2639                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2640                     // spanNeedsStrings=true;
2641                     int32_t matchLimit=start+str->length();
2642                     if(matchLimit==length) {
2643                         return length;
2644                     }
2645                     if(spanCondition==USET_SPAN_CONTAINED) {
2646                         // Iterate for the shortest match at each position.
2647                         // Recurse for each but the shortest match.
2648                         if(next==start) {
2649                             next=matchLimit;  // First match from start.
2650                         } else {
2651                             if(matchLimit<next) {
2652                                 // Remember shortest match from start for iteration.
2653                                 int32_t temp=next;
2654                                 next=matchLimit;
2655                                 matchLimit=temp;
2656                             }
2657                             // Recurse for non-shortest match from start.
2658                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2659                                                                  USET_SPAN_CONTAINED);
2660                             if((matchLimit+spanLength)>maxSpanLimit) {
2661                                 maxSpanLimit=matchLimit+spanLength;
2662                                 if(maxSpanLimit==length) {
2663                                     return length;
2664                                 }
2665                             }
2666                         }
2667                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2668                         if(matchLimit>next) {
2669                             // Remember longest match from start.
2670                             next=matchLimit;
2671                         }
2672                     }
2673                 }
2674             }
2675             if(next==start) {
2676                 break;  // No match from start.
2677             }
2678             start=next;
2679         }
2680         if(start>maxSpanLimit) {
2681             return start;
2682         } else {
2683             return maxSpanLimit;
2684         }
2685     }
2686 }
2687 
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const char16_t * s,int32_t length,USetSpanCondition spanCondition)2688 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const char16_t *s, int32_t length,
2689                                      USetSpanCondition spanCondition) {
2690     if(length==0) {
2691         return 0;
2692     }
2693     const UnicodeSet &realSet(set.getSet());
2694     if(!set.hasStrings()) {
2695         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2696             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2697         }
2698 
2699         UChar32 c;
2700         int32_t prev=length;
2701         do {
2702             U16_PREV(s, 0, length, c);
2703             if(realSet.contains(c)!=spanCondition) {
2704                 break;
2705             }
2706         } while((prev=length)>0);
2707         return prev;
2708     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2709         UnicodeSetWithStringsIterator iter(set);
2710         UChar32 c;
2711         int32_t prev=length, length0=length;
2712         do {
2713             U16_PREV(s, 0, length, c);
2714             if(realSet.contains(c)) {
2715                 break;
2716             }
2717             const UnicodeString *str;
2718             iter.reset();
2719             while((str=iter.nextString())!=nullptr) {
2720                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2721                     // spanNeedsStrings=true;
2722                     return prev;
2723                 }
2724             }
2725         } while((prev=length)>0);
2726         return prev;
2727     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2728         UnicodeSetWithStringsIterator iter(set);
2729         UChar32 c;
2730         int32_t prev=length, minSpanStart=length, length0=length;
2731         do {
2732             U16_PREV(s, 0, length, c);
2733             if(!realSet.contains(c)) {
2734                 length=prev;  // Do not span this single, not-contained code point.
2735             }
2736             const UnicodeString *str;
2737             iter.reset();
2738             while((str=iter.nextString())!=nullptr) {
2739                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2740                     // spanNeedsStrings=true;
2741                     int32_t matchStart=prev-str->length();
2742                     if(matchStart==0) {
2743                         return 0;
2744                     }
2745                     if(spanCondition==USET_SPAN_CONTAINED) {
2746                         // Iterate for the shortest match at each position.
2747                         // Recurse for each but the shortest match.
2748                         if(length==prev) {
2749                             length=matchStart;  // First match from prev.
2750                         } else {
2751                             if(matchStart>length) {
2752                                 // Remember shortest match from prev for iteration.
2753                                 int32_t temp=length;
2754                                 length=matchStart;
2755                                 matchStart=temp;
2756                             }
2757                             // Recurse for non-shortest match from prev.
2758                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2759                                                                     USET_SPAN_CONTAINED);
2760                             if(spanStart<minSpanStart) {
2761                                 minSpanStart=spanStart;
2762                                 if(minSpanStart==0) {
2763                                     return 0;
2764                                 }
2765                             }
2766                         }
2767                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2768                         if(matchStart<length) {
2769                             // Remember longest match from prev.
2770                             length=matchStart;
2771                         }
2772                     }
2773                 }
2774             }
2775             if(length==prev) {
2776                 break;  // No match from prev.
2777             }
2778         } while((prev=length)>0);
2779         if(prev<minSpanStart) {
2780             return prev;
2781         } else {
2782             return minSpanStart;
2783         }
2784     }
2785 }
2786 
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2787 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2788                                 USetSpanCondition spanCondition) {
2789     const UnicodeSet &realSet(set.getSet());
2790     if(!set.hasStrings()) {
2791         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2792             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2793         }
2794 
2795         UChar32 c;
2796         int32_t start=0, prev;
2797         while((prev=start)<length) {
2798             U8_NEXT_OR_FFFD(s, start, length, c);
2799             if(realSet.contains(c)!=spanCondition) {
2800                 break;
2801             }
2802         }
2803         return prev;
2804     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2805         UnicodeSetWithStringsIterator iter(set);
2806         UChar32 c;
2807         int32_t start, next;
2808         for(start=next=0; start<length;) {
2809             U8_NEXT_OR_FFFD(s, next, length, c);
2810             if(realSet.contains(c)) {
2811                 break;
2812             }
2813             const char *s8;
2814             int32_t length8;
2815             iter.reset();
2816             while((s8=iter.nextUTF8(length8))!=nullptr) {
2817                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2818                     // spanNeedsStrings=true;
2819                     return start;
2820                 }
2821             }
2822             start=next;
2823         }
2824         return start;
2825     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2826         UnicodeSetWithStringsIterator iter(set);
2827         UChar32 c;
2828         int32_t start, next, maxSpanLimit=0;
2829         for(start=next=0; start<length;) {
2830             U8_NEXT_OR_FFFD(s, next, length, c);
2831             if(!realSet.contains(c)) {
2832                 next=start;  // Do not span this single, not-contained code point.
2833             }
2834             const char *s8;
2835             int32_t length8;
2836             iter.reset();
2837             while((s8=iter.nextUTF8(length8))!=nullptr) {
2838                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2839                     // spanNeedsStrings=true;
2840                     int32_t matchLimit=start+length8;
2841                     if(matchLimit==length) {
2842                         return length;
2843                     }
2844                     if(spanCondition==USET_SPAN_CONTAINED) {
2845                         // Iterate for the shortest match at each position.
2846                         // Recurse for each but the shortest match.
2847                         if(next==start) {
2848                             next=matchLimit;  // First match from start.
2849                         } else {
2850                             if(matchLimit<next) {
2851                                 // Remember shortest match from start for iteration.
2852                                 int32_t temp=next;
2853                                 next=matchLimit;
2854                                 matchLimit=temp;
2855                             }
2856                             // Recurse for non-shortest match from start.
2857                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2858                                                                 USET_SPAN_CONTAINED);
2859                             if((matchLimit+spanLength)>maxSpanLimit) {
2860                                 maxSpanLimit=matchLimit+spanLength;
2861                                 if(maxSpanLimit==length) {
2862                                     return length;
2863                                 }
2864                             }
2865                         }
2866                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2867                         if(matchLimit>next) {
2868                             // Remember longest match from start.
2869                             next=matchLimit;
2870                         }
2871                     }
2872                 }
2873             }
2874             if(next==start) {
2875                 break;  // No match from start.
2876             }
2877             start=next;
2878         }
2879         if(start>maxSpanLimit) {
2880             return start;
2881         } else {
2882             return maxSpanLimit;
2883         }
2884     }
2885 }
2886 
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2887 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2888                                     USetSpanCondition spanCondition) {
2889     if(length==0) {
2890         return 0;
2891     }
2892     const UnicodeSet &realSet(set.getSet());
2893     if(!set.hasStrings()) {
2894         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2895             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2896         }
2897 
2898         UChar32 c;
2899         int32_t prev=length;
2900         do {
2901             U8_PREV_OR_FFFD(s, 0, length, c);
2902             if(realSet.contains(c)!=spanCondition) {
2903                 break;
2904             }
2905         } while((prev=length)>0);
2906         return prev;
2907     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2908         UnicodeSetWithStringsIterator iter(set);
2909         UChar32 c;
2910         int32_t prev=length;
2911         do {
2912             U8_PREV_OR_FFFD(s, 0, length, c);
2913             if(realSet.contains(c)) {
2914                 break;
2915             }
2916             const char *s8;
2917             int32_t length8;
2918             iter.reset();
2919             while((s8=iter.nextUTF8(length8))!=nullptr) {
2920                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2921                     // spanNeedsStrings=true;
2922                     return prev;
2923                 }
2924             }
2925         } while((prev=length)>0);
2926         return prev;
2927     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2928         UnicodeSetWithStringsIterator iter(set);
2929         UChar32 c;
2930         int32_t prev=length, minSpanStart=length;
2931         do {
2932             U8_PREV_OR_FFFD(s, 0, length, c);
2933             if(!realSet.contains(c)) {
2934                 length=prev;  // Do not span this single, not-contained code point.
2935             }
2936             const char *s8;
2937             int32_t length8;
2938             iter.reset();
2939             while((s8=iter.nextUTF8(length8))!=nullptr) {
2940                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2941                     // spanNeedsStrings=true;
2942                     int32_t matchStart=prev-length8;
2943                     if(matchStart==0) {
2944                         return 0;
2945                     }
2946                     if(spanCondition==USET_SPAN_CONTAINED) {
2947                         // Iterate for the shortest match at each position.
2948                         // Recurse for each but the shortest match.
2949                         if(length==prev) {
2950                             length=matchStart;  // First match from prev.
2951                         } else {
2952                             if(matchStart>length) {
2953                                 // Remember shortest match from prev for iteration.
2954                                 int32_t temp=length;
2955                                 length=matchStart;
2956                                 matchStart=temp;
2957                             }
2958                             // Recurse for non-shortest match from prev.
2959                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2960                                                                    USET_SPAN_CONTAINED);
2961                             if(spanStart<minSpanStart) {
2962                                 minSpanStart=spanStart;
2963                                 if(minSpanStart==0) {
2964                                     return 0;
2965                                 }
2966                             }
2967                         }
2968                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2969                         if(matchStart<length) {
2970                             // Remember longest match from prev.
2971                             length=matchStart;
2972                         }
2973                     }
2974                 }
2975             }
2976             if(length==prev) {
2977                 break;  // No match from prev.
2978             }
2979         } while((prev=length)>0);
2980         if(prev<minSpanStart) {
2981             return prev;
2982         } else {
2983             return minSpanStart;
2984         }
2985     }
2986 }
2987 
// Bit flags selecting which spans are performed and compared.
// Each group ends with a mask that combines the flags of that group.
enum {
    // UTF forms
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // original set and/or its complement
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // span directions
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // span conditions used for "contained" spans
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // everything
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
3008 
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)3009 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
3010     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
3011 }
3012 
slen(const void * s,UBool isUTF16)3013 static inline int32_t slen(const void *s, UBool isUTF16) {
3014     return isUTF16 ? u_strlen((const char16_t *)s) : static_cast<int32_t>(strlen((const char *)s));
3015 }
3016 
3017 /*
3018  * Count spans on a string with the method according to type and set the span limits.
3019  * The set may be the complement of the original.
3020  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
3021  * according to the expected number of spans.
3022  * Sets typeName to an empty string if there is no such type.
3023  * Returns -1 if the span option is filtered out.
3024  */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)3025 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
3026                         const void *s, int32_t length, UBool isUTF16,
3027                         uint32_t whichSpans,
3028                         int type, const char *&typeName,
3029                         int32_t limits[], int32_t limitsCapacity,
3030                         int32_t expectCount) {
3031     const UnicodeSet &realSet(set.getSet());
3032     int32_t start, count;
3033     USetSpanCondition spanCondition, firstSpanCondition, contained;
3034     UBool isForward;
3035 
3036     if(type<0 || 7<type) {
3037         typeName="";
3038         return 0;
3039     }
3040 
3041     static const char *const typeNames16[]={
3042         "contains", "contains(LM)",
3043         "span", "span(LM)",
3044         "containsBack", "containsBack(LM)",
3045         "spanBack", "spanBack(LM)"
3046     };
3047 
3048     static const char *const typeNames8[]={
3049         "containsUTF8", "containsUTF8(LM)",
3050         "spanUTF8", "spanUTF8(LM)",
3051         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
3052         "spanBackUTF8", "spanBackUTF8(LM)"
3053     };
3054 
3055     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
3056 
3057     // filter span options
3058     if(type<=3) {
3059         // span forward
3060         if((whichSpans&SPAN_FWD)==0) {
3061             return -1;
3062         }
3063         isForward=true;
3064     } else {
3065         // span backward
3066         if((whichSpans&SPAN_BACK)==0) {
3067             return -1;
3068         }
3069         isForward=false;
3070     }
3071     if((type&1)==0) {
3072         // use USET_SPAN_CONTAINED
3073         if((whichSpans&SPAN_CONTAINED)==0) {
3074             return -1;
3075         }
3076         contained=USET_SPAN_CONTAINED;
3077     } else {
3078         // use USET_SPAN_SIMPLE
3079         if((whichSpans&SPAN_SIMPLE)==0) {
3080             return -1;
3081         }
3082         contained=USET_SPAN_SIMPLE;
3083     }
3084 
3085     // Default first span condition for going forward with an uncomplemented set.
3086     spanCondition=USET_SPAN_NOT_CONTAINED;
3087     if(isComplement) {
3088         spanCondition=invertSpanCondition(spanCondition, contained);
3089     }
3090 
3091     // First span condition for span(), used to terminate the spanBack() iteration.
3092     firstSpanCondition=spanCondition;
3093 
3094     // spanBack(): Its initial span condition is span()'s last span condition,
3095     // which is the opposite of span()'s first span condition
3096     // if we expect an even number of spans.
3097     // (The loop inverts spanCondition (expectCount-1) times
3098     // before the expectCount'th span() call.)
3099     // If we do not compare forward and backward directions, then we do not have an
3100     // expectCount and just start with firstSpanCondition.
3101     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
3102         spanCondition=invertSpanCondition(spanCondition, contained);
3103     }
3104 
3105     count=0;
3106     switch(type) {
3107     case 0:
3108     case 1:
3109         start=0;
3110         if(length<0) {
3111             length=slen(s, isUTF16);
3112         }
3113         for(;;) {
3114             start+= isUTF16 ? containsSpanUTF16(set, (const char16_t *)s+start, length-start, spanCondition) :
3115                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
3116             if(count<limitsCapacity) {
3117                 limits[count]=start;
3118             }
3119             ++count;
3120             if(start>=length) {
3121                 break;
3122             }
3123             spanCondition=invertSpanCondition(spanCondition, contained);
3124         }
3125         break;
3126     case 2:
3127     case 3:
3128         start=0;
3129         for(;;) {
3130             start+= isUTF16 ? realSet.span((const char16_t *)s+start, length>=0 ? length-start : length, spanCondition) :
3131                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3132             if(count<limitsCapacity) {
3133                 limits[count]=start;
3134             }
3135             ++count;
3136             if(length>=0 ? start>=length :
3137                            isUTF16 ? ((const char16_t *)s)[start]==0 :
3138                                      ((const char *)s)[start]==0
3139             ) {
3140                 break;
3141             }
3142             spanCondition=invertSpanCondition(spanCondition, contained);
3143         }
3144         break;
3145     case 4:
3146     case 5:
3147         if(length<0) {
3148             length=slen(s, isUTF16);
3149         }
3150         for(;;) {
3151             ++count;
3152             if(count<=limitsCapacity) {
3153                 limits[limitsCapacity-count]=length;
3154             }
3155             length= isUTF16 ? containsSpanBackUTF16(set, (const char16_t *)s, length, spanCondition) :
3156                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3157             if(length==0 && spanCondition==firstSpanCondition) {
3158                 break;
3159             }
3160             spanCondition=invertSpanCondition(spanCondition, contained);
3161         }
3162         if(count<limitsCapacity) {
3163             memmove(limits, limits+(limitsCapacity-count), count*4);
3164         }
3165         break;
3166     case 6:
3167     case 7:
3168         for(;;) {
3169             ++count;
3170             if(count<=limitsCapacity) {
3171                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3172             }
3173             // Note: Length<0 is tested only for the first spanBack().
3174             // If we wanted to keep length<0 for all spanBack()s, we would have to
3175             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3176             length= isUTF16 ? realSet.spanBack((const char16_t *)s, length, spanCondition) :
3177                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3178             if(length==0 && spanCondition==firstSpanCondition) {
3179                 break;
3180             }
3181             spanCondition=invertSpanCondition(spanCondition, contained);
3182         }
3183         if(count<limitsCapacity) {
3184             memmove(limits, limits+(limitsCapacity-count), count*4);
3185         }
3186         break;
3187     default:
3188         typeName="";
3189         return -1;
3190     }
3191 
3192     return count;
3193 }
3194 
// Indexes of the sets to be tested.
// An odd index means that the set is the complement (isComplement).
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};

// Display names of the tested sets, indexed by the enum constants above.
static const char *const setNames[SET_COUNT]={
    "slow",      // SLOW
    "slow.not",  // SLOW_NOT
    "fast",      // FAST
    "fast.not"   // FAST_NOT
};
3210 
3211 /*
3212  * Verify that we get the same results whether we look at text with contains(),
3213  * span() or spanBack(), using unfrozen or frozen versions of the set,
3214  * and using the set or its complement (switching the spanConditions accordingly).
3215  * The latter verifies that
3216  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3217  *
3218  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3219  * or returned to the caller (with an input expectCount<0).
3220  */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3221 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3222                               const void *s, int32_t length, UBool isUTF16,
3223                               uint32_t whichSpans,
3224                               int32_t expectLimits[], int32_t &expectCount,
3225                               const char *testName, int32_t index) {
3226     int32_t limits[500];
3227     int32_t limitsCount;
3228     int i, j;
3229 
3230     const char *typeName;
3231     int type;
3232 
3233     for(i=0; i<SET_COUNT; ++i) {
3234         if((i&1)==0) {
3235             // Even-numbered sets are original, uncomplemented sets.
3236             if((whichSpans&SPAN_SET)==0) {
3237                 continue;
3238             }
3239         } else {
3240             // Odd-numbered sets are complemented.
3241             if((whichSpans&SPAN_COMPLEMENT)==0) {
3242                 continue;
3243             }
3244         }
3245         for(type=0;; ++type) {
3246             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3247                                  s, length, isUTF16,
3248                                  whichSpans,
3249                                  type, typeName,
3250                                  limits, UPRV_LENGTHOF(limits), expectCount);
3251             if(typeName[0]==0) {
3252                 break; // All types tried.
3253             }
3254             if(limitsCount<0) {
3255                 continue; // Span option filtered out.
3256             }
3257             if(expectCount<0) {
3258                 expectCount=limitsCount;
3259                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3260                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3261                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3262                     return;
3263                 }
3264                 memcpy(expectLimits, limits, limitsCount*4);
3265             } else if(limitsCount!=expectCount) {
3266                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3267                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3268             } else {
3269                 for(j=0; j<limitsCount; ++j) {
3270                     if(limits[j]!=expectLimits[j]) {
3271                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3272                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3273                               j, (long)limits[j], (long)expectLimits[j]);
3274                         break;
3275                     }
3276                 }
3277             }
3278         }
3279     }
3280 
3281     // Compare span() with containsAll()/containsNone(),
3282     // but only if we have expectLimits[] from the uncomplemented set.
3283     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3284         const char16_t *s16=(const char16_t *)s;
3285         UnicodeString string;
3286         int32_t prev=0, limit, length;
3287         for(i=0; i<expectCount; ++i) {
3288             limit=expectLimits[i];
3289             length=limit-prev;
3290             if(length>0) {
3291                 string.setTo(false, s16+prev, length);  // read-only alias
3292                 if(i&1) {
3293                     if(!sets[SLOW]->getSet().containsAll(string)) {
3294                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==false contradicts span()",
3295                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3296                         return;
3297                     }
3298                     if(!sets[FAST]->getSet().containsAll(string)) {
3299                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==false contradicts span()",
3300                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3301                         return;
3302                     }
3303                 } else {
3304                     if(!sets[SLOW]->getSet().containsNone(string)) {
3305                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==false contradicts span()",
3306                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3307                         return;
3308                     }
3309                     if(!sets[FAST]->getSet().containsNone(string)) {
3310                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==false contradicts span()",
3311                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3312                         return;
3313                     }
3314                 }
3315             }
3316             prev=limit;
3317         }
3318     }
3319 }
3320 
3321 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3322 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3323                               const void *s, int32_t length, UBool isUTF16,
3324                               uint32_t whichSpans,
3325                               const char *testName, int32_t index) {
3326     int32_t expectLimits[500];
3327     int32_t expectCount=-1;
3328     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3329 }
3330 
stringContainsUnpairedSurrogate(const char16_t * s,int32_t length)3331 UBool stringContainsUnpairedSurrogate(const char16_t *s, int32_t length) {
3332     char16_t c, c2;
3333 
3334     if(length>=0) {
3335         while(length>0) {
3336             c=*s++;
3337             --length;
3338             if(0xd800<=c && c<0xe000) {
3339                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3340                     return true;
3341                 }
3342                 --length;
3343             }
3344         }
3345     } else {
3346         while((c=*s++)!=0) {
3347             if(0xd800<=c && c<0xe000) {
3348                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3349                     return true;
3350                 }
3351             }
3352         }
3353     }
3354     return false;
3355 }
3356 
3357 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3358 // unless either UTF is turned off in whichSpans.
3359 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3360 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const char16_t * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3361 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3362                                       const char16_t *s16, int32_t length16,
3363                                       uint32_t whichSpans,
3364                                       const char *testName, int32_t index) {
3365     int32_t expectLimits[500];
3366     int32_t expectCount;
3367 
3368     expectCount=-1;  // Get expectLimits[] from testSpan().
3369 
3370     if((whichSpans&SPAN_UTF16)!=0) {
3371         testSpan(sets, s16, length16, true, whichSpans, expectLimits, expectCount, testName, index);
3372     }
3373     if((whichSpans&SPAN_UTF8)==0) {
3374         return;
3375     }
3376 
3377     // Convert s16[] and expectLimits[] to UTF-8.
3378     uint8_t s8[3000];
3379     int32_t offsets[3000];
3380 
3381     const char16_t *s16Limit=s16+length16;
3382     char *t=(char *)s8;
3383     char *tLimit=t+sizeof(s8);
3384     int32_t *o=offsets;
3385     UErrorCode errorCode=U_ZERO_ERROR;
3386 
3387     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3388     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, true, &errorCode);
3389     if(U_FAILURE(errorCode)) {
3390         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3391               testName, (long)index, u_errorName(errorCode));
3392         ucnv_resetFromUnicode(utf8Cnv);
3393         return;
3394     }
3395     int32_t length8=(int32_t)(t-(char *)s8);
3396 
3397     // Convert expectLimits[].
3398     int32_t i, j, expect;
3399     for(i=j=0; i<expectCount; ++i) {
3400         expect=expectLimits[i];
3401         if(expect==length16) {
3402             expectLimits[i]=length8;
3403         } else {
3404             while(offsets[j]<expect) {
3405                 ++j;
3406             }
3407             expectLimits[i]=j;
3408         }
3409     }
3410 
3411     testSpan(sets, s8, length8, false, whichSpans, expectLimits, expectCount, testName, index);
3412 }
3413 
nextCodePoint(UChar32 c)3414 static UChar32 nextCodePoint(UChar32 c) {
3415     // Skip some large and boring ranges.
3416     switch(c) {
3417     case 0x3441:
3418         return 0x4d7f;
3419     case 0x5100:
3420         return 0x9f00;
3421     case 0xb040:
3422         return 0xd780;
3423     case 0xe041:
3424         return 0xf8fe;
3425     case 0x10100:
3426         return 0x20000;
3427     case 0x20041:
3428         return 0xe0000;
3429     case 0xe0101:
3430         return 0x10fffd;
3431     default:
3432         return c+1;
3433     }
3434 }
3435 
3436 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3437 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3438     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3439     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3440     // Skip the UTF-8 part of the test - if the string contains surrogates -
3441     // because it is likely to produce a different result.
3442     UBool inconsistentSurrogates=
3443             (!(sets[0]->getSet().contains(0xfffd) ?
3444                sets[0]->getSet().contains(0xd800, 0xdfff) :
3445                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3446              sets[0]->hasStringsWithSurrogates());
3447 
3448     char16_t s[1000];
3449     int32_t length=0;
3450     uint32_t localWhichSpans;
3451 
3452     UChar32 c, first;
3453     for(first=c=0;; c=nextCodePoint(c)) {
3454         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3455             localWhichSpans=whichSpans;
3456             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3457                 localWhichSpans&=~SPAN_UTF8;
3458             }
3459             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3460             if(c>0x10ffff) {
3461                 break;
3462             }
3463             length=0;
3464             first=c;
3465         }
3466         U16_APPEND_UNSAFE(s, length, c);
3467     }
3468 }
3469 
3470 // Test with a particular, interesting string.
3471 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3472 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3473     static const char16_t s[]={
3474         0x61, 0x62, 0x20,                       // Latin, space
3475         0x3b1, 0x3b2, 0x3b3,                    // Greek
3476         0xd900,                                 // lead surrogate
3477         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3478         0xdc05,                                 // trail surrogate
3479         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3480         0xd900, 0xdc05,                         // unassigned supplementary
3481         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3482         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3483         0                                       // NUL
3484     };
3485 
3486     if((whichSpans&SPAN_UTF16)==0) {
3487         return;
3488     }
3489     testSpan(sets, s, -1, true, (whichSpans&~SPAN_UTF8), testName, 0);
3490     testSpan(sets, s, UPRV_LENGTHOF(s)-1, true, (whichSpans&~SPAN_UTF8), testName, 1);
3491 }
3492 
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3493 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3494     static const char s[]={
3495         "abc"                                   // Latin
3496 
3497         /* trail byte in lead position */
3498         "\x80"
3499 
3500         " "                                     // space
3501 
3502         /* truncated multi-byte sequences */
3503         "\xd0"
3504         "\xe0"
3505         "\xe1"
3506         "\xed"
3507         "\xee"
3508         "\xf0"
3509         "\xf1"
3510         "\xf4"
3511         "\xf8"
3512         "\xfc"
3513 
3514         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3515 
3516         /* trail byte in lead position */
3517         "\x80"
3518 
3519         "\xe0\x80"
3520         "\xe0\xa0"
3521         "\xe1\x80"
3522         "\xed\x80"
3523         "\xed\xa0"
3524         "\xee\x80"
3525         "\xf0\x80"
3526         "\xf0\x90"
3527         "\xf1\x80"
3528         "\xf4\x80"
3529         "\xf4\x90"
3530         "\xf8\x80"
3531         "\xfc\x80"
3532 
3533         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3534 
3535         /* trail byte in lead position */
3536         "\x80"
3537 
3538         "\xf0\x80\x80"
3539         "\xf0\x90\x80"
3540         "\xf1\x80\x80"
3541         "\xf4\x80\x80"
3542         "\xf4\x90\x80"
3543         "\xf8\x80\x80"
3544         "\xfc\x80\x80"
3545 
3546         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3547 
3548         /* trail byte in lead position */
3549         "\x80"
3550 
3551         "\xf8\x80\x80\x80"
3552         "\xfc\x80\x80\x80"
3553 
3554         "\xF1\x90\x80\x85"                      // unassigned supplementary
3555 
3556         /* trail byte in lead position */
3557         "\x80"
3558 
3559         "\xfc\x80\x80\x80\x80"
3560 
3561         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3562 
3563         /* trail byte in lead position */
3564         "\x80"
3565 
3566         /* complete sequences but non-shortest forms or out of range etc. */
3567         "\xc0\x80"
3568         "\xe0\x80\x80"
3569         "\xed\xa0\x80"
3570         "\xf0\x80\x80\x80"
3571         "\xf4\x90\x80\x80"
3572         "\xf8\x80\x80\x80\x80"
3573         "\xfc\x80\x80\x80\x80\x80"
3574         "\xfe"
3575         "\xff"
3576 
3577         /* trail byte in lead position */
3578         "\x80"
3579 
3580         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3581     };
3582 
3583     if((whichSpans&SPAN_UTF8)==0) {
3584         return;
3585     }
3586     testSpan(sets, s, -1, false, (whichSpans&~SPAN_UTF16), testName, 0);
3587     testSpan(sets, s, UPRV_LENGTHOF(s)-1, false, (whichSpans&~SPAN_UTF16), testName, 1);
3588 }
3589 
// Split a list of span option sets into alternatives:
// Each of the first whichSpansCount entries is reduced with mask and then
// stored once per non-zero alternative a, b, c, each time with just that
// alternative's bits added.
// - b==0: Only a is applied, in place; c is ignored.
// - b!=0 and c==0: The list doubles; the second part gets b.
// - b!=0 and c!=0: The list triples; the third part gets c.
// The array must have room for the resulting number of entries.
// Returns the new number of entries.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // c only counts as an alternative if b is one, too.
    const int32_t alternatives= (b==0) ? 1 : (c==0) ? 2 : 3;

    for(int32_t idx=0; idx<whichSpansCount; ++idx) {
        const uint32_t kept=whichSpans[idx]&mask;
        whichSpans[idx]=kept|a;
        if(alternatives>=2) {
            whichSpans[whichSpansCount+idx]=kept|b;
        }
        if(alternatives==3) {
            whichSpans[2*whichSpansCount+idx]=kept|c;
        }
    }
    return alternatives*whichSpansCount;
}
3612 
// Runs of 'a' or 'b' whose length is given by the number in the macro name.
// They are assembled from 8-character pieces so that the lengths are easy to verify.
// Adjacent string literals are concatenated by the compiler.
#define _8_a "aaaaaaaa"
#define _8_b "bbbbbbbb"
#define _63_a _8_a _8_a _8_a _8_a _8_a _8_a _8_a "aaaaaaa"
#define _64_a _8_a _8_a _8_a _8_a _8_a _8_a _8_a _8_a
#define _63_b _8_b _8_b _8_b _8_b _8_b _8_b _8_b "bbbbbbb"
#define _64_b _8_b _8_b _8_b _8_b _8_b _8_b _8_b _8_b
3617 
TestSpan()3618 void UnicodeSetTest::TestSpan() {
3619     // "[...]" is a UnicodeSet pattern.
3620     // "*" performs tests on all Unicode code points and on a selection of
3621     //   malformed UTF-8/16 strings.
3622     // "-options" limits the scope of testing for the current set.
3623     //   By default, the test verifies that equivalent boundaries are found
3624     //   for UTF-16 and UTF-8, going forward and backward,
3625     //   alternating USET_SPAN_NOT_CONTAINED with
3626     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3627     //   Single-character options:
3628     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3629     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3630     //          or the set contains strings with unpaired surrogates
3631     //          which do not translate to valid UTF-8.
3632     //     c -- set.span() and set.complement().span() boundaries may differ.
3633     //          Cause: Set strings are not complemented.
3634     //     b -- span() and spanBack() boundaries may differ.
3635     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3636     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3637     //          match with non-overlapping substrings.
3638     //          For example, with a set containing "ab" and "ba",
3639     //          span() of "aba" yields boundaries { 0, 2, 3 }
3640     //          because the initial "ab" matches from 0 to 2,
3641     //          while spanBack() yields boundaries { 0, 1, 3 }
3642     //          because the final "ba" matches from 1 to 3.
3643     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3644     //          Cause: Strings in the set overlap, and a longer match may
3645     //          require a sequence including non-longest substrings.
3646     //          For example, with a set containing "ab", "abc" and "cd",
3647     //          span(contained) of "abcd" spans the entire string
3648     //          but span(longest match) only spans the first 3 characters.
3649     //   Each "-options" first resets all options and then applies the specified options.
3650     //   A "-" without options resets the options.
3651     //   The options are also reset for each new set.
3652     // Other strings will be spanned.
3653     static const char *const testdata[]={
3654         "[:ID_Continue:]",
3655         "*",
3656         "[:White_Space:]",
3657         "*",
3658         "[]",
3659         "*",
3660         "[\\u0000-\\U0010FFFF]",
3661         "*",
3662         "[\\u0000\\u0080\\u0800\\U00010000]",
3663         "*",
3664         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3665         "*",
3666         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3667         "-c",
3668         "*",
3669         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3670         "-c",
3671         "*",
3672 
3673         // Overlapping strings cause overlapping attempts to match.
3674         "[x{xy}{xya}{axy}{ax}]",
3675         "-cl",
3676 
3677         // More repetitions of "xya" would take too long with the recursive
3678         // reference implementation.
3679         // containsAll()=false
3680         // test_string 0x14
3681         "xx"
3682         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3683         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3684         "xyaxyaxyaxya"
3685         "xx"
3686         "xyaxyaxyaxya"  // span() ends here.
3687         "aaa",
3688 
3689         // containsAll()=true
3690         // test_string 0x15
3691         "xx"
3692         "xyaxyaxyaxya"
3693         "xx"
3694         "xyaxyaxyaxya"
3695         "xx"
3696         "xyaxyaxyaxy",
3697 
3698         "-bc",
3699         // test_string 0x17
3700         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3701         "-c",
3702         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3703         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3704         "-",
3705         "byaya",     // span() -> { 5 }
3706         "byay",      // span() -> { 4 }
3707         "bya",       // span() -> { 3 }
3708 
3709         // span(longest match) will not span the whole string.
3710         "[a{ab}{bc}]",
3711         "-cl",
3712         // test_string 0x21
3713         "abc",
3714 
3715         "[a{ab}{abc}{cd}]",
3716         "-cl",
3717         "acdabcdabccd",
3718 
3719         // spanBack(longest match) will not span the whole string.
3720         "[c{ab}{bc}]",
3721         "-cl",
3722         "abc",
3723 
3724         "[d{cd}{bcd}{ab}]",
3725         "-cl",
3726         "abbcdabcdabd",
3727 
3728         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3729         // and UTF-8 trail bytes.
3730         // Copies of above test sets and strings, but transliterated to have
3731         // different code points with similar trail units.
3732         // Previous: a      b         c            d
3733         // Unicode:  042B   30AB      200AB        204AB
3734         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3735         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3736         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3737         "-cl",
3738         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3739 
3740         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3741         "-cl",
3742         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3743 
3744         // Stress bookkeeping and recursion.
3745         // The following strings are barely doable with the recursive
3746         // reference implementation.
3747         // The not-contained character at the end prevents an early exit from the span().
3748         "[b{bb}]",
3749         "-c",
3750         // test_string 0x33
3751         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3752         // On complement sets, span() and spanBack() get different results
3753         // because b is not in the complement set and there is an odd number of b's
3754         // in the test string.
3755         "-bc",
3756         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3757 
3758         // Test with set strings with an initial or final code point span
3759         // longer than 254.
3760         "[a{" _64_a _64_a _64_a _64_a "b}"
3761           "{a" _64_b _64_b _64_b _64_b "}]",
3762         "-c",
3763         _64_a _64_a _64_a _63_a "b",
3764         _64_a _64_a _64_a _64_a "b",
3765         _64_a _64_a _64_a _64_a "aaaabbbb",
3766         "a" _64_b _64_b _64_b _63_b,
3767         "a" _64_b _64_b _64_b _64_b,
3768         "aaaabbbb" _64_b _64_b _64_b _64_b,
3769 
3770         // Test with strings containing unpaired surrogates.
3771         // They are not representable in UTF-8, and a leading trail surrogate
3772         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3773         // U+20001 == \\uD840\\uDC01
3774         // U+20400 == \\uD841\\uDC00
3775         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3776         "-8cl",
3777         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3778     };
3779     uint32_t whichSpans[96]={ SPAN_ALL };
3780     int32_t whichSpansCount=1;
3781 
3782     UnicodeSet *sets[SET_COUNT]={ nullptr };
3783     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ nullptr };
3784 
3785     char testName[1024];
3786     char *testNameLimit=testName;
3787 
3788     int32_t i, j;
3789     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3790         const char *s=testdata[i];
3791         if(s[0]=='[') {
3792             // Create new test sets from this pattern.
3793             for(j=0; j<SET_COUNT; ++j) {
3794                 delete sets_with_str[j];
3795                 delete sets[j];
3796             }
3797             UErrorCode errorCode=U_ZERO_ERROR;
3798             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3799             if(U_FAILURE(errorCode)) {
3800                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3801                 break;
3802             }
3803             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3804             sets[SLOW_NOT]->complement();
3805             // Intermediate set: Test cloning of a frozen set.
3806             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3807             fast->freeze();
3808             sets[FAST]=fast->clone();
3809             delete fast;
3810             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3811             fastNot->freeze();
3812             sets[FAST_NOT]=fastNot->clone();
3813             delete fastNot;
3814 
3815             for(j=0; j<SET_COUNT; ++j) {
3816                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3817             }
3818 
3819             strcpy(testName, s);
3820             testNameLimit=strchr(testName, 0);
3821             *testNameLimit++=':';
3822             *testNameLimit=0;
3823 
3824             whichSpans[0]=SPAN_ALL;
3825             whichSpansCount=1;
3826         } else if(s[0]=='-') {
3827             whichSpans[0]=SPAN_ALL;
3828             whichSpansCount=1;
3829 
3830             while(*++s!=0) {
3831                 switch(*s) {
3832                 case 'c':
3833                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3834                                                    ~SPAN_POLARITY,
3835                                                    SPAN_SET,
3836                                                    SPAN_COMPLEMENT,
3837                                                    0);
3838                     break;
3839                 case 'b':
3840                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3841                                                    ~SPAN_DIRS,
3842                                                    SPAN_FWD,
3843                                                    SPAN_BACK,
3844                                                    0);
3845                     break;
3846                 case 'l':
3847                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3848                     // USET_SPAN_SIMPLE only FWD, and separately
3849                     // USET_SPAN_SIMPLE only BACK
3850                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3851                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3852                                                    SPAN_DIRS|SPAN_CONTAINED,
3853                                                    SPAN_FWD|SPAN_SIMPLE,
3854                                                    SPAN_BACK|SPAN_SIMPLE);
3855                     break;
3856                 case '8':
3857                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3858                                                    ~SPAN_UTFS,
3859                                                    SPAN_UTF16,
3860                                                    SPAN_UTF8,
3861                                                    0);
3862                     break;
3863                 default:
3864                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3865                     break;
3866                 }
3867             }
3868         } else if(0==strcmp(s, "*")) {
3869             strcpy(testNameLimit, "bad_string");
3870             for(j=0; j<whichSpansCount; ++j) {
3871                 if(whichSpansCount>1) {
3872                     snprintf(testNameLimit+10 /* strlen("bad_string") */,
3873                              sizeof(testName) - (testNameLimit+10-testName),
3874                             "%%0x%3x",
3875                             whichSpans[j]);
3876                 }
3877                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3878                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3879             }
3880 
3881             strcpy(testNameLimit, "contents");
3882             for(j=0; j<whichSpansCount; ++j) {
3883                 if(whichSpansCount>1) {
3884                     snprintf(testNameLimit+8 /* strlen("contents") */,
3885                             sizeof(testName) - (testNameLimit+8-testName),
3886                             "%%0x%3x",
3887                             whichSpans[j]);
3888                 }
3889                 testSpanContents(sets_with_str, whichSpans[j], testName);
3890             }
3891         } else {
3892             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3893             strcpy(testNameLimit, "test_string");
3894             for(j=0; j<whichSpansCount; ++j) {
3895                 if(whichSpansCount>1) {
3896                     snprintf(testNameLimit+11 /* strlen("test_string") */,
3897                             sizeof(testName) - (testNameLimit+11-testName),
3898                             "%%0x%3x",
3899                             whichSpans[j]);
3900                 }
3901                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3902             }
3903         }
3904     }
3905     for(j=0; j<SET_COUNT; ++j) {
3906         delete sets_with_str[j];
3907         delete sets[j];
3908     }
3909 }
3910 
3911 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3912 void UnicodeSetTest::TestStringSpan() {
3913     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3914     static const char *const string=
3915         "xx"
3916         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3917         "xx"
3918         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3919         "xx"
3920         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3921         "aaaa";
3922 
3923     UErrorCode errorCode=U_ZERO_ERROR;
3924     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3925     UnicodeSet set(pattern16, errorCode);
3926     if(U_FAILURE(errorCode)) {
3927         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3928         return;
3929     }
3930 
3931     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3932 
3933     if(set.containsAll(string16)) {
3934         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be false", pattern, string);
3935     }
3936 
3937     // Remove trailing "aaaa".
3938     string16.truncate(string16.length()-4);
3939     if(!set.containsAll(string16)) {
3940         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be true", pattern, string);
3941     }
3942 
3943     string16=u"byayaxya";
3944     const char16_t *s16=string16.getBuffer();
3945     int32_t length16=string16.length();
3946     (void)length16;   // Suppress set but not used warning.
3947     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3948         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3949         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3950         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3951         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3952         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3953     ) {
3954         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3955     }
3956 
3957     pattern="[a{ab}{abc}{cd}]";
3958     pattern16=UnicodeString(pattern, -1, US_INV);
3959     set.applyPattern(pattern16, errorCode);
3960     if(U_FAILURE(errorCode)) {
3961         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3962         return;
3963     }
3964     string16=u"acdabcdabccd";
3965     s16=string16.getBuffer();
3966     length16=string16.length();
3967     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3968         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3969         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3970     ) {
3971         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3972     }
3973 
3974     pattern="[d{cd}{bcd}{ab}]";
3975     pattern16=UnicodeString(pattern, -1, US_INV);
3976     set.applyPattern(pattern16, errorCode).freeze();
3977     if(U_FAILURE(errorCode)) {
3978         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3979         return;
3980     }
3981     string16=u"abbcdabcdabd";
3982     s16=string16.getBuffer();
3983     length16=string16.length();
3984     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3985         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3986         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3987     ) {
3988         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3989     }
3990 }
3991 
TestPatternWithSurrogates()3992 void UnicodeSetTest::TestPatternWithSurrogates() {
3993     IcuTestErrorCode errorCode(*this, "TestPatternWithSurrogates");
3994     // Regression test for ICU-11891
3995     UnicodeSet surrogates;
3996     surrogates.add(0xd000, 0xd82f);  // a range ending with a lead surrogate code point
3997     surrogates.add(0xd83a);  // a lead surrogate
3998     surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
3999     UnicodeString pat;
4000     surrogates.toPattern(pat, false);  // bad if U+D83A is immediately followed by U+DC00
4001     UnicodeSet s2;
4002     // was: U_MALFORMED_SET
4003     // Java: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
4004     s2.applyPattern(pat, errorCode);
4005     if (errorCode.errIfFailureAndReset("surrogates (1) to/from pattern")) { return; }
4006     checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
4007 
4008     // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
4009     surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
4010     // add a beyond-surrogates range, up to the last code point
4011     surrogates.add(0x10affe, 0x10ffff);
4012     surrogates.toPattern(pat, false);  // bad if U+DBFF is immediately followed by U+DC00
4013     s2.applyPattern(pat, errorCode);
4014     if (errorCode.errIfFailureAndReset("surrogates (2) to/from pattern")) { return; }
4015     checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
4016 
4017     // Test the toPattern() code path when the pattern is shorter in complement form:
4018     // [^opposite-ranges]
4019     surrogates.add(0, 0x6789);
4020     surrogates.toPattern(pat, false);
4021     s2.applyPattern(pat, errorCode);
4022     if (errorCode.errIfFailureAndReset("surrogates (3) to/from pattern")) { return; }
4023     checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
4024 
4025     // Start with a pattern, in case the original pattern is kept but
4026     // without the extra white space.
4027     surrogates.applyPattern(u"[\\uD83A \\uDC00-\\uDFFF]", errorCode);
4028     if (errorCode.errIfFailureAndReset("surrogates from pattern")) { return; }
4029     surrogates.toPattern(pat, false);
4030     s2.applyPattern(pat, errorCode);
4031     if (errorCode.errIfFailureAndReset("surrogates from/to/from pattern")) { return; }
4032     checkEqual(surrogates, s2, "surrogates from/to/from pattern");
4033 }
4034 
TestIntOverflow()4035 void UnicodeSetTest::TestIntOverflow() {
4036     // This test triggers undefined double->int conversion behavior
4037     // if the implementation is not careful.
4038     IcuTestErrorCode errorCode(*this, "TestIntOverflow");
4039     UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
4040     assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
4041     assertEquals("[:ccc=int_overflow:] -> illegal argument",
4042                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4043 }
4044 
TestUnusedCcc()4045 void UnicodeSetTest::TestUnusedCcc() {
4046 #if !UCONFIG_NO_NORMALIZATION
4047     // All numeric ccc values 0..255 are valid, but many are unused.
4048     IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
4049     UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
4050     assertSuccess("[:ccc=2:]", errorCode);
4051     assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
4052 
4053     UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
4054     assertSuccess("[:ccc=255:]", errorCode);
4055     assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
4056 
4057     // Non-integer values and values outside 0..255 are invalid.
4058     UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
4059     assertEquals("[:ccc=-1:] -> illegal argument",
4060                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4061     assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
4062 
4063     UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
4064     assertEquals("[:ccc=256:] -> illegal argument",
4065                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4066     assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
4067 
4068     UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
4069     assertEquals("[:ccc=1.1:] -> illegal argument",
4070                  U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4071     assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
4072 #endif
4073 }
4074 
TestDeepPattern()4075 void UnicodeSetTest::TestDeepPattern() {
4076     IcuTestErrorCode errorCode(*this, "TestDeepPattern");
4077     // Nested ranges are parsed via recursion which can use a lot of stack space.
4078     // After a reasonable limit, we should get an error.
4079     constexpr int32_t DEPTH = 20000;
4080     UnicodeString pattern, suffix;
4081     for (int32_t i = 0; i < DEPTH; ++i) {
4082         pattern.append(u"[a", 2);
4083         suffix.append(']');
4084     }
4085     pattern.append(suffix);
4086     UnicodeSet set(pattern, errorCode);
4087     assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
4088     errorCode.reset();
4089 }
4090 
TestEmptyString()4091 void UnicodeSetTest::TestEmptyString() {
4092     IcuTestErrorCode errorCode(*this, "TestEmptyString");
4093     // Starting with ICU 69, the empty string is allowed in UnicodeSet. ICU-13702
4094     UnicodeSet set(u"[{}]", errorCode);
4095     if (!assertSuccess("set from pattern with {}", errorCode)) { return; }
4096     assertTrue("set from pattern with {}", set.contains(u""));
4097     assertEquals("set from pattern with {}: size", 1, set.size());
4098     assertFalse("set from pattern with {}: isEmpty", set.isEmpty());
4099 
4100     // Remove, add back, ...
4101     assertFalse("remove empty string", set.remove(u"").contains(u""));
4102     assertEquals("remove empty string: size", 0, set.size());
4103     assertTrue("remove empty string: isEmpty", set.isEmpty());
4104     assertTrue("add empty string", set.add(u"").contains(u""));
4105     // missing API -- assertTrue("retain empty string", set.retain(u"").contains(u""));
4106     assertFalse("complement-remove empty string", set.complement(u"").contains(u""));
4107     assertTrue("complement-add empty string", set.complement(u"").contains(u""));
4108 
4109     assertFalse("clear", set.clear().contains(u""));
4110     assertTrue("add empty string 2", set.add(u"").contains(u""));
4111     assertFalse("removeAllStrings", set.removeAllStrings().contains(u""));
4112     assertTrue("add empty string 3", set.add(u"").contains(u""));
4113     // Note that this leaves the set containing exactly the empty string.
4114 
4115     // strings() access and iteration
4116     // no C++ equivalent for Java strings() -- assertTrue("strings()", set.strings().contains(u""));
4117     UnicodeSetIterator sit(set);
4118     assertTrue("set iterator.next()", sit.next());
4119     assertTrue("set iterator has empty string", sit.isString() && sit.getString().isEmpty());
4120 
4121     // The empty string is ignored in matching.
4122     set.add(u'a').add(u'c');
4123     assertEquals("span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4124     assertEquals("spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4125     assertTrue("containsNone", set.containsNone(u"def"));
4126     assertFalse("containsSome", set.containsSome(u"def"));
4127     set.freeze();
4128     assertEquals("frozen span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4129     assertEquals("frozen spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4130     assertTrue("frozen containsNone", set.containsNone(u"def"));
4131     assertFalse("frozen containsSome", set.containsSome(u"def"));
4132 }
4133 
assertNext(UnicodeSetIterator & iter,const UnicodeString & expected)4134 void UnicodeSetTest::assertNext(UnicodeSetIterator &iter, const UnicodeString &expected) {
4135     assertTrue(expected + ".next()", iter.next());
4136     assertEquals(expected + ".getString()", expected, iter.getString());
4137 }
4138 
TestSkipToStrings()4139 void UnicodeSetTest::TestSkipToStrings() {
4140     IcuTestErrorCode errorCode(*this, "TestSkipToStrings");
4141     UnicodeSet set(u"[0189{}{ch}]", errorCode);
4142     UnicodeSetIterator iter(set);
4143     assertNext(iter.skipToStrings(), u"");
4144     assertNext(iter, u"ch");
4145     assertFalse("no next", iter.next());
4146 
4147     iter.reset();
4148     assertNext(iter, u"0");
4149     assertNext(iter, u"1");
4150     assertNext(iter, u"8");
4151     assertNext(iter, u"9");
4152     assertNext(iter, u"");
4153     assertNext(iter, u"ch");
4154     assertFalse("no next", iter.next());
4155 
4156     iter.reset();
4157     assertNext(iter, u"0");
4158     iter.skipToStrings();
4159     assertNext(iter, u"");
4160     assertNext(iter, u"ch");
4161     assertFalse("no next", iter.next());
4162 
4163     iter.reset();
4164     iter.nextRange();
4165     assertNext(iter, u"8");
4166     iter.skipToStrings();
4167     assertNext(iter, u"");
4168     assertNext(iter, u"ch");
4169     assertFalse("no next", iter.next());
4170 
4171     iter.reset();
4172     iter.nextRange();
4173     iter.nextRange();
4174     iter.nextRange();
4175     iter.skipToStrings();
4176     assertNext(iter, u"ch");
4177     assertFalse("no next", iter.next());
4178 }
4179 
TestPatternCodePointComplement()4180 void UnicodeSetTest::TestPatternCodePointComplement() {
4181     IcuTestErrorCode errorCode(*this, "TestPatternCodePointComplement");
4182     // ICU-21524 changes pattern ^ and equivalent functions to perform a "code point complement".
4183     // [^abc{ch}] = [[:Any:]-[abc{ch}]] which removes all strings.
4184     {
4185         UnicodeSet simple(u"[^abc{ch}]", errorCode);
4186         assertEquals("[^abc{ch}] --> lots of elements", 0x110000 - 3, simple.size());
4187         assertFalse("[^abc{ch}] --> no strings", simple.hasStrings());
4188         assertFalse("[^abc{ch}] --> no 'a'", simple.contains(u'a'));
4189     }
4190 
4191     {
4192         UnicodeSet notBasic(u"[:^Basic_Emoji:]", errorCode);
4193         if (errorCode.errDataIfFailureAndReset("[:^Basic_Emoji:]")) {
4194             return;
4195         }
4196         assertTrue("[:^Basic_Emoji:] --> lots of elements", notBasic.size() > 1000);
4197         assertFalse("[:^Basic_Emoji:] --> no strings", notBasic.hasStrings());
4198         assertFalse("[:^Basic_Emoji:] --> no bicycle", notBasic.contains(U'��'));
4199     }
4200 
4201     {
4202         UnicodeSet notBasic(u"[:Basic_Emoji=No:]", errorCode);
4203         assertTrue("[:Basic_Emoji=No:] --> lots of elements", notBasic.size() > 1000);
4204         assertFalse("[:Basic_Emoji=No:] --> no strings", notBasic.hasStrings());
4205         assertFalse("[:Basic_Emoji=No:] --> no bicycle", notBasic.contains(U'��'));
4206     }
4207 
4208     {
4209         UnicodeSet notBasic;
4210         notBasic.applyIntPropertyValue(UCHAR_BASIC_EMOJI, 0, errorCode);
4211         assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
4212                 notBasic.size() > 1000);
4213         assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
4214                 notBasic.hasStrings());
4215         assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
4216                 notBasic.contains(U'��'));
4217     }
4218 
4219     {
4220         UnicodeSet notBasic;
4221         notBasic.applyPropertyAlias("Basic_Emoji", "No", errorCode);
4222         assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
4223                 notBasic.size() > 1000);
4224         assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
4225                 notBasic.hasStrings());
4226         assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
4227                 notBasic.contains(U'��'));
4228     }
4229 
4230     // When there are strings, we must not use the complement for a more compact toPattern().
4231     {
4232         UnicodeSet set;
4233         set.add(0,  u'Y').add(u'b', u'q').add(u'x', 0x10ffff);
4234         UnicodeString pattern;
4235         set.toPattern(pattern, true);
4236         UnicodeSet set2(pattern, errorCode);
4237         checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
4238         assertEquals("set(with 0 & max, only code points).toPattern()", u"[^Z-ar-w]", pattern);
4239 
4240         set.add("ch").add("ss");
4241         set.toPattern(pattern, true);
4242         set2 = UnicodeSet(pattern, errorCode);
4243         checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
4244         assertEquals("set(with 0 & max, with strings).toPattern()",
4245                 u"[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
4246     }
4247 
4248     // The complement() API behavior does not change under this ticket.
4249     {
4250         UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
4251         notBasic.complement();
4252         assertTrue("[:Basic_Emoji:].complement() --> lots of elements", notBasic.size() > 1000);
4253         assertTrue("[:Basic_Emoji:].complement() --> has strings", notBasic.hasStrings());
4254         assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
4255                 notBasic.contains(u"��\uFE0F"));
4256         assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains(U'��'));
4257     }
4258 }
4259