• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 *   Copyright (C) 1999-2016 International Business Machines Corporation and
6 *   others. All Rights Reserved.
7 ********************************************************************************
8 *   Date        Name        Description
9 *   10/20/99    alan        Creation.
10 *   03/22/2000  Madhu       Added additional tests
11 ********************************************************************************
12 */
13 
14 #include <stdio.h>
15 
16 #include <string.h>
17 #include "unicode/utypes.h"
18 #include "usettest.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/usetiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/symtable.h"
26 #include "unicode/uversion.h"
27 #include "cmemory.h"
28 #include "hash.h"
29 
// Reports a failure (with file, line and ICU error name) through dataerrln()
// when `status` holds a failing UErrorCode.
// The body is wrapped in do { } while (0) so that the macro plus its trailing
// ';' forms exactly one statement; the previous bare-brace form broke when
// used as the unbraced branch of an if/else.
#define TEST_ASSERT_SUCCESS(status) do { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
                  u_errorName(status)); \
    } \
} while (0)

// Reports a failure (with file and line) through dataerrln() when `expr`
// evaluates to false. Same single-statement wrapping as TEST_ASSERT_SUCCESS.
#define TEST_ASSERT(expr) do { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
} while (0)
36 
/**
 * Logging convenience: appends the pattern of `set`, passed through
 * UnicodeSetTest::escape(), to `left` and returns the combined string.
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and hands back a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
42 
// Dispatch helper for the switch in runIndexedTest(): for test index `id`,
// stores the stringified method name in `name` and, only when `exec` is set,
// logs a banner and invokes the method. The trailing `break` is deliberately
// left without ';' so that each use site supplies it.
#define CASE(id,test) \
    case id: \
        name = #test; \
        if (exec) { \
            logln(#test "---"); \
            logln(); \
            test(); \
        } \
        break
51 
UnicodeSetTest()52 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
53 }
54 
openUTF8Converter()55 UConverter *UnicodeSetTest::openUTF8Converter() {
56     if(utf8Cnv==NULL) {
57         UErrorCode errorCode=U_ZERO_ERROR;
58         utf8Cnv=ucnv_open("UTF-8", &errorCode);
59     }
60     return utf8Cnv;
61 }
62 
~UnicodeSetTest()63 UnicodeSetTest::~UnicodeSetTest() {
64     ucnv_close(utf8Cnv);
65 }
66 
67 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)68 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
69                                const char* &name, char* /*par*/) {
70     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
71     switch (index) {
72         CASE(0,TestPatterns);
73         CASE(1,TestAddRemove);
74         CASE(2,TestCategories);
75         CASE(3,TestCloneEqualHash);
76         CASE(4,TestMinimalRep);
77         CASE(5,TestAPI);
78         CASE(6,TestScriptSet);
79         CASE(7,TestPropertySet);
80         CASE(8,TestClone);
81         CASE(9,TestExhaustive);
82         CASE(10,TestToPattern);
83         CASE(11,TestIndexOf);
84         CASE(12,TestStrings);
85         CASE(13,Testj2268);
86         CASE(14,TestCloseOver);
87         CASE(15,TestEscapePattern);
88         CASE(16,TestInvalidCodePoint);
89         CASE(17,TestSymbolTable);
90         CASE(18,TestSurrogate);
91         CASE(19,TestPosixClasses);
92         CASE(20,TestIteration);
93         CASE(21,TestFreezable);
94         CASE(22,TestSpan);
95         CASE(23,TestStringSpan);
96         CASE(24,TestUCAUnsafeBackwards);
97         default: name = ""; break;
98     }
99 }
100 
// Separator placed inside the expected-string arrays handed to
// expectToPattern(). In TestToPattern the entries before it are strings the
// set holds and the entries after it are strings it does not hold.
// NOTE(review): the exact handling lives in expectToPattern(), not shown here.
static const char NOT[] = "%%%%";
102 
103 /**
104  * UVector was improperly copying contents
105  * This code will crash this is still true
106  */
Testj2268()107 void UnicodeSetTest::Testj2268() {
108   UnicodeSet t;
109   t.add(UnicodeString("abc"));
110   UnicodeSet test(t);
111   UnicodeString ustrPat;
112   test.toPattern(ustrPat, TRUE);
113 }
114 
115 /**
116  * Test toPattern().
117  */
TestToPattern()118 void UnicodeSetTest::TestToPattern() {
119     UErrorCode ec = U_ZERO_ERROR;
120 
121     // Test that toPattern() round trips with syntax characters and
122     // whitespace.
123     {
124         static const char* OTHER_TOPATTERN_TESTS[] = {
125             "[[:latin:]&[:greek:]]",
126             "[[:latin:]-[:greek:]]",
127             "[:nonspacing mark:]",
128             NULL
129         };
130 
131         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
132             ec = U_ZERO_ERROR;
133             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
134             if (U_FAILURE(ec)) {
135                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
136                 continue;
137             }
138             checkPat(OTHER_TOPATTERN_TESTS[j], s);
139         }
140 
141         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
142             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
143 
144                 // check various combinations to make sure they all work.
145                 if (i != 0 && !toPatternAux(i, i)){
146                     continue;
147                 }
148                 if (!toPatternAux(0, i)){
149                     continue;
150                 }
151                 if (!toPatternAux(i, 0xFFFF)){
152                     continue;
153                 }
154             }
155         }
156     }
157 
158     // Test pattern behavior of multicharacter strings.
159     {
160         ec = U_ZERO_ERROR;
161         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
162 
163         // This loop isn't a loop.  It's here to make the compiler happy.
164         // If you're curious, try removing it and changing the 'break'
165         // statements (except for the last) to goto's.
166         for (;;) {
167             if (U_FAILURE(ec)) break;
168             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
169             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
170 
171             s->add("ac");
172             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
173             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
174 
175             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
176             if (U_FAILURE(ec)) break;
177             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
178             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
179 
180             s->add("[]");
181             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
182             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
183 
184             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
185             if (U_FAILURE(ec)) break;
186             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
187             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
188 
189             // j2189
190             s->clear();
191             s->add(UnicodeString("abc", ""));
192             s->add(UnicodeString("abc", ""));
193             const char* exp6[] = {"abc", NOT, "ab", NULL};
194             expectToPattern(*s, "[{abc}]", exp6);
195 
196             break;
197         }
198 
199         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
200         delete s;
201     }
202 
203     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
204     UnicodeSet s;
205     s.add((UChar)97, (UChar)98); // 'a', 'b'
206     expectToPattern(s, "[ab]", NULL);
207 }
208 
toPatternAux(UChar32 start,UChar32 end)209 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
210 
211     // use Integer.toString because Utility.hex doesn't handle ints
212     UnicodeString pat = "";
213     // TODO do these in hex
214     //String source = "0x" + Integer.toString(start,16).toUpperCase();
215     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
216     UnicodeString source;
217     source = source + (uint32_t)start;
218     if (start != end)
219         source = source + ".." + (uint32_t)end;
220     UnicodeSet testSet;
221     testSet.add(start, end);
222     return checkPat(source, testSet);
223 }
224 
checkPat(const UnicodeString & source,const UnicodeSet & testSet)225 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
226                                const UnicodeSet& testSet) {
227     // What we want to make sure of is that a pattern generated
228     // by toPattern(), with or without escaped unprintables, can
229     // be passed back into the UnicodeSet constructor.
230     UnicodeString pat0;
231 
232     testSet.toPattern(pat0, TRUE);
233 
234     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
235 
236     //String pat1 = unescapeLeniently(pat0);
237     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
238 
239     UnicodeString pat2;
240     testSet.toPattern(pat2, FALSE);
241     if (!checkPat(source, testSet, pat2)) return FALSE;
242 
243     //String pat3 = unescapeLeniently(pat2);
244     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
245 
246     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
247     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
248     return TRUE;
249 }
250 
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)251 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
252                                const UnicodeSet& testSet,
253                                const UnicodeString& pat) {
254     UErrorCode ec = U_ZERO_ERROR;
255     UnicodeSet testSet2(pat, ec);
256     if (testSet2 != testSet) {
257         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
258         return FALSE;
259     }
260     return TRUE;
261 }
262 
263 void
TestPatterns(void)264 UnicodeSetTest::TestPatterns(void) {
265     UnicodeSet set;
266     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
267     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
268     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
269     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
270     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
271     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
272 
273     // Throw in a test of complement
274     set.complement();
275     UnicodeString exp;
276     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
277     expectPairs(set, exp);
278 }
279 
280 void
TestCategories(void)281 UnicodeSetTest::TestCategories(void) {
282     UErrorCode status = U_ZERO_ERROR;
283     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
284     UnicodeSet set(pat, status);
285     if (U_FAILURE(status)) {
286         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
287         return;
288     } else {
289         expectContainment(set, pat, "ABC", "abc");
290     }
291 
292     UChar32 i;
293     int32_t failures = 0;
294     // Make sure generation of L doesn't pollute cached Lu set
295     // First generate L, then Lu
296     set.applyPattern("[:L:]", status);
297     if (U_FAILURE(status)) { errln("FAIL"); return; }
298     for (i=0; i<0x200; ++i) {
299         UBool l = u_isalpha((UChar)i);
300         if (l != set.contains(i)) {
301             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
302                   set.contains(i));
303             if (++failures == 10) break;
304         }
305     }
306 
307     set.applyPattern("[:Lu:]", status);
308     if (U_FAILURE(status)) { errln("FAIL"); return; }
309     for (i=0; i<0x200; ++i) {
310         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
311         if (lu != set.contains(i)) {
312             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
313                   set.contains(i));
314             if (++failures == 20) break;
315         }
316     }
317 }
318 void
TestCloneEqualHash(void)319 UnicodeSetTest::TestCloneEqualHash(void) {
320     UErrorCode status = U_ZERO_ERROR;
321     // set1 and set2 used to be built with the obsolete constructor taking
322     // UCharCategory values; replaced with pattern constructors
323     // markus 20030502
324     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
325     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
326     if (U_FAILURE(status)){
327         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
328         return;
329     }
330     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
331     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
332     if (U_FAILURE(status)){
333         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
334         return;
335     }
336 
337     if (*set1 != *set1a) {
338         errln("FAIL: category constructor for Ll broken");
339     }
340     if (*set2 != *set2a) {
341         errln("FAIL: category constructor for Nd broken");
342     }
343     delete set1a;
344     delete set2a;
345 
346     logln("Testing copy construction");
347     UnicodeSet *set1copy=new UnicodeSet(*set1);
348     if(*set1 != *set1copy || *set1 == *set2 ||
349         getPairs(*set1) != getPairs(*set1copy) ||
350         set1->hashCode() != set1copy->hashCode()){
351         errln("FAIL : Error in copy construction");
352         return;
353     }
354 
355     logln("Testing =operator");
356     UnicodeSet set1equal=*set1;
357     UnicodeSet set2equal=*set2;
358     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
359         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
360         errln("FAIL: Error in =operator");
361     }
362 
363     logln("Testing clone()");
364     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
365     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
366     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
367         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
368         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
369         errln("FAIL: Error in clone");
370     }
371 
372     logln("Testing hashcode");
373     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
374         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
375         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
376         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
377         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
378         errln("FAIL: Error in hashCode()");
379     }
380 
381     delete set1;
382     delete set1copy;
383     delete set2;
384     delete set1clone;
385     delete set2clone;
386 
387 
388 }
389 void
TestAddRemove(void)390 UnicodeSetTest::TestAddRemove(void) {
391     UnicodeSet set; // Construct empty set
392     doAssert(set.isEmpty() == TRUE, "set should be empty");
393     doAssert(set.size() == 0, "size should be 0");
394     set.complement();
395     doAssert(set.size() == 0x110000, "size should be 0x110000");
396     set.clear();
397     set.add(0x0061, 0x007a);
398     expectPairs(set, "az");
399     doAssert(set.isEmpty() == FALSE, "set should not be empty");
400     doAssert(set.size() != 0, "size should not be equal to 0");
401     doAssert(set.size() == 26, "size should be equal to 26");
402     set.remove(0x006d, 0x0070);
403     expectPairs(set, "alqz");
404     doAssert(set.size() == 22, "size should be equal to 22");
405     set.remove(0x0065, 0x0067);
406     expectPairs(set, "adhlqz");
407     doAssert(set.size() == 19, "size should be equal to 19");
408     set.remove(0x0064, 0x0069);
409     expectPairs(set, "acjlqz");
410     doAssert(set.size() == 16, "size should be equal to 16");
411     set.remove(0x0063, 0x0072);
412     expectPairs(set, "absz");
413     doAssert(set.size() == 10, "size should be equal to 10");
414     set.add(0x0066, 0x0071);
415     expectPairs(set, "abfqsz");
416     doAssert(set.size() == 22, "size should be equal to 22");
417     set.remove(0x0061, 0x0067);
418     expectPairs(set, "hqsz");
419     set.remove(0x0061, 0x007a);
420     expectPairs(set, "");
421     doAssert(set.isEmpty() == TRUE, "set should be empty");
422     doAssert(set.size() == 0, "size should be 0");
423     set.add(0x0061);
424     doAssert(set.isEmpty() == FALSE, "set should not be empty");
425     doAssert(set.size() == 1, "size should not be equal to 1");
426     set.add(0x0062);
427     set.add(0x0063);
428     expectPairs(set, "ac");
429     doAssert(set.size() == 3, "size should not be equal to 3");
430     set.add(0x0070);
431     set.add(0x0071);
432     expectPairs(set, "acpq");
433     doAssert(set.size() == 5, "size should not be equal to 5");
434     set.clear();
435     expectPairs(set, "");
436     doAssert(set.isEmpty() == TRUE, "set should be empty");
437     doAssert(set.size() == 0, "size should be 0");
438 
439     // Try removing an entire set from another set
440     expectPattern(set, "[c-x]", "cx");
441     UnicodeSet set2;
442     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
443     set.removeAll(set2);
444     expectPairs(set, "deluxx");
445 
446     // Try adding an entire set to another set
447     expectPattern(set, "[jackiemclean]", "aacceein");
448     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
449     set.addAll(set2);
450     expectPairs(set, "aacehort");
451     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
452 
453     // Try retaining an set of elements contained in another set (intersection)
454     UnicodeSet set3;
455     expectPattern(set3, "[a-c]", "ac");
456     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
457     set3.remove(0x0062);
458     expectPairs(set3, "aacc");
459     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
460     set.retainAll(set3);
461     expectPairs(set, "aacc");
462     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
463     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
464     set.clear();
465     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
466 
467     // Test commutativity
468     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
469     expectPattern(set2, "[jackiemclean]", "aacceein");
470     set.addAll(set2);
471     expectPairs(set, "aacehort");
472     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
473 
474 
475 
476 
477 }
478 
479 /**
480  * Make sure minimal representation is maintained.
481  */
TestMinimalRep()482 void UnicodeSetTest::TestMinimalRep() {
483     UErrorCode status = U_ZERO_ERROR;
484     // This is pretty thoroughly tested by checkCanonicalRep()
485     // run against the exhaustive operation results.  Use the code
486     // here for debugging specific spot problems.
487 
488     // 1 overlap against 2
489     UnicodeSet set("[h-km-q]", status);
490     if (U_FAILURE(status)) { errln("FAIL"); return; }
491     UnicodeSet set2("[i-o]", status);
492     if (U_FAILURE(status)) { errln("FAIL"); return; }
493     set.addAll(set2);
494     expectPairs(set, "hq");
495     // right
496     set.applyPattern("[a-m]", status);
497     if (U_FAILURE(status)) { errln("FAIL"); return; }
498     set2.applyPattern("[e-o]", status);
499     if (U_FAILURE(status)) { errln("FAIL"); return; }
500     set.addAll(set2);
501     expectPairs(set, "ao");
502     // left
503     set.applyPattern("[e-o]", status);
504     if (U_FAILURE(status)) { errln("FAIL"); return; }
505     set2.applyPattern("[a-m]", status);
506     if (U_FAILURE(status)) { errln("FAIL"); return; }
507     set.addAll(set2);
508     expectPairs(set, "ao");
509     // 1 overlap against 3
510     set.applyPattern("[a-eg-mo-w]", status);
511     if (U_FAILURE(status)) { errln("FAIL"); return; }
512     set2.applyPattern("[d-q]", status);
513     if (U_FAILURE(status)) { errln("FAIL"); return; }
514     set.addAll(set2);
515     expectPairs(set, "aw");
516 }
517 
TestAPI()518 void UnicodeSetTest::TestAPI() {
519     UErrorCode status = U_ZERO_ERROR;
520     // default ct
521     UnicodeSet set;
522     if (!set.isEmpty() || set.getRangeCount() != 0) {
523         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
524               set);
525     }
526 
527     // clear(), isEmpty()
528     set.add(0x0061);
529     if (set.isEmpty()) {
530         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
531               set);
532     }
533     set.clear();
534     if (!set.isEmpty()) {
535         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
536               set);
537     }
538 
539     // size()
540     set.clear();
541     if (set.size() != 0) {
542         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
543               ": " + set);
544     }
545     set.add(0x0061);
546     if (set.size() != 1) {
547         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
548               ": " + set);
549     }
550     set.add(0x0031, 0x0039);
551     if (set.size() != 10) {
552         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
553               ": " + set);
554     }
555 
556     // contains(first, last)
557     set.clear();
558     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
559     if (U_FAILURE(status)) { errln("FAIL"); return; }
560     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
561         UChar32 a = set.getRangeStart(i);
562         UChar32 b = set.getRangeEnd(i);
563         if (!set.contains(a, b)) {
564             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
565                   " but doesn't: " + set);
566         }
567         if (set.contains((UChar32)(a-1), b)) {
568             errln((UnicodeString)"FAIL, shouldn't contain " +
569                   (unsigned short)(a-1) + '-' + (unsigned short)b +
570                   " but does: " + set);
571         }
572         if (set.contains(a, (UChar32)(b+1))) {
573             errln((UnicodeString)"FAIL, shouldn't contain " +
574                   (unsigned short)a + '-' + (unsigned short)(b+1) +
575                   " but does: " + set);
576         }
577     }
578 
579     // Ported InversionList test.
580     UnicodeSet a((UChar32)3,(UChar32)10);
581     UnicodeSet b((UChar32)7,(UChar32)15);
582     UnicodeSet c;
583 
584     logln((UnicodeString)"a [3-10]: " + a);
585     logln((UnicodeString)"b [7-15]: " + b);
586     c = a;
587     c.addAll(b);
588     UnicodeSet exp((UChar32)3,(UChar32)15);
589     if (c == exp) {
590         logln((UnicodeString)"c.set(a).add(b): " + c);
591     } else {
592         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
593     }
594     c.complement();
595     exp.set((UChar32)0, (UChar32)2);
596     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
597     if (c == exp) {
598         logln((UnicodeString)"c.complement(): " + c);
599     } else {
600         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
601     }
602     c.complement();
603     exp.set((UChar32)3, (UChar32)15);
604     if (c == exp) {
605         logln((UnicodeString)"c.complement(): " + c);
606     } else {
607         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
608     }
609     c = a;
610     c.complementAll(b);
611     exp.set((UChar32)3,(UChar32)6);
612     exp.add((UChar32)11,(UChar32) 15);
613     if (c == exp) {
614         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
615     } else {
616         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
617     }
618 
619     exp = c;
620     bitsToSet(setToBits(c), c);
621     if (c == exp) {
622         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
623     } else {
624         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
625     }
626 
627     // Additional tests for coverage JB#2118
628     //UnicodeSet::complement(class UnicodeString const &)
629     //UnicodeSet::complementAll(class UnicodeString const &)
630     //UnicodeSet::containsNone(class UnicodeSet const &)
631     //UnicodeSet::containsNone(long,long)
632     //UnicodeSet::containsSome(class UnicodeSet const &)
633     //UnicodeSet::containsSome(long,long)
634     //UnicodeSet::removeAll(class UnicodeString const &)
635     //UnicodeSet::retain(long)
636     //UnicodeSet::retainAll(class UnicodeString const &)
637     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
638     //UnicodeSetIterator::getString(void)
639     set.clear();
640     set.complement("ab");
641     exp.applyPattern("[{ab}]", status);
642     if (U_FAILURE(status)) { errln("FAIL"); return; }
643     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
644 
645     UnicodeSetIterator iset(set);
646     if (!iset.next() || !iset.isString()) {
647         errln("FAIL: UnicodeSetIterator::next/isString");
648     } else if (iset.getString() != "ab") {
649         errln("FAIL: UnicodeSetIterator::getString");
650     }
651 
652     set.add((UChar32)0x61, (UChar32)0x7A);
653     set.complementAll("alan");
654     exp.applyPattern("[{ab}b-kmo-z]", status);
655     if (U_FAILURE(status)) { errln("FAIL"); return; }
656     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
657 
658     exp.applyPattern("[a-z]", status);
659     if (U_FAILURE(status)) { errln("FAIL"); return; }
660     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
661     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
662     exp.applyPattern("[aln]", status);
663     if (U_FAILURE(status)) { errln("FAIL"); return; }
664     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
665     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
666 
667     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
668         errln("FAIL: containsNone(UChar32, UChar32)");
669     }
670     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
671         errln("FAIL: containsSome(UChar32, UChar32)");
672     }
673     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
674         errln("FAIL: containsNone(UChar32, UChar32)");
675     }
676     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
677         errln("FAIL: containsSome(UChar32, UChar32)");
678     }
679 
680     set.removeAll("liu");
681     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
682     if (U_FAILURE(status)) { errln("FAIL"); return; }
683     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
684 
685     set.retainAll("star");
686     exp.applyPattern("[rst]", status);
687     if (U_FAILURE(status)) { errln("FAIL"); return; }
688     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
689 
690     set.retain((UChar32)0x73);
691     exp.applyPattern("[s]", status);
692     if (U_FAILURE(status)) { errln("FAIL"); return; }
693     if (set != exp) { errln("FAIL: retain('s')"); return; }
694 
695     uint16_t buf[32];
696     int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
697     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
698     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
699         errln("FAIL: serialize");
700         return;
701     }
702 
703     // Conversions to and from USet
704     UnicodeSet *uniset = &set;
705     USet *uset = uniset->toUSet();
706     TEST_ASSERT((void *)uset == (void *)uniset);
707     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
708     TEST_ASSERT((void *)setx == (void *)uset);
709     const UnicodeSet *constSet = uniset;
710     const USet *constUSet = constSet->toUSet();
711     TEST_ASSERT((void *)constUSet == (void *)constSet);
712     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
713     TEST_ASSERT((void *)constSetx == (void *)constUSet);
714 
715     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
716     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
717     UnicodeSet ac(0x61, 0x63);
718     ac.remove(0x62).freeze();
719     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
720         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
721         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
722         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
723         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
724         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
725         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
726         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
727         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
728         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
729     ) {
730         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
731     }
732     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
733         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
734         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
735         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
736         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
737         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
738         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
739         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
740         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
741         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
742     ) {
743         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
744     }
745 }
746 
TestIteration()747 void UnicodeSetTest::TestIteration() {
748     UErrorCode ec = U_ZERO_ERROR;
749     int i = 0;
750     int outerLoop;
751 
752     // 6 code points, 3 ranges, 2 strings, 8 total elements
753     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
754     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
755     TEST_ASSERT_SUCCESS(ec);
756     UnicodeSetIterator it(set);
757 
758     for (outerLoop=0; outerLoop<3; outerLoop++) {
759         // Run the test multiple times, to check that iterator.reset() is working.
760         for (i=0; i<10; i++) {
761             UBool         nextv        = it.next();
762             UBool         isString     = it.isString();
763             int32_t       codePoint    = it.getCodepoint();
764             //int32_t       codePointEnd = it.getCodepointEnd();
765             UnicodeString s   = it.getString();
766             switch (i) {
767             case 0:
768                 TEST_ASSERT(nextv == TRUE);
769                 TEST_ASSERT(isString == FALSE);
770                 TEST_ASSERT(codePoint==0x61);
771                 TEST_ASSERT(s == "a");
772                 break;
773             case 1:
774                 TEST_ASSERT(nextv == TRUE);
775                 TEST_ASSERT(isString == FALSE);
776                 TEST_ASSERT(codePoint==0x62);
777                 TEST_ASSERT(s == "b");
778                 break;
779             case 2:
780                 TEST_ASSERT(nextv == TRUE);
781                 TEST_ASSERT(isString == FALSE);
782                 TEST_ASSERT(codePoint==0x63);
783                 TEST_ASSERT(s == "c");
784                 break;
785             case 3:
786                 TEST_ASSERT(nextv == TRUE);
787                 TEST_ASSERT(isString == FALSE);
788                 TEST_ASSERT(codePoint==0x79);
789                 TEST_ASSERT(s == "y");
790                 break;
791             case 4:
792                 TEST_ASSERT(nextv == TRUE);
793                 TEST_ASSERT(isString == FALSE);
794                 TEST_ASSERT(codePoint==0x7a);
795                 TEST_ASSERT(s == "z");
796                 break;
797             case 5:
798                 TEST_ASSERT(nextv == TRUE);
799                 TEST_ASSERT(isString == FALSE);
800                 TEST_ASSERT(codePoint==0x1abcd);
801                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
802                 break;
803             case 6:
804                 TEST_ASSERT(nextv == TRUE);
805                 TEST_ASSERT(isString == TRUE);
806                 TEST_ASSERT(s == "str1");
807                 break;
808             case 7:
809                 TEST_ASSERT(nextv == TRUE);
810                 TEST_ASSERT(isString == TRUE);
811                 TEST_ASSERT(s == "str2");
812                 break;
813             case 8:
814                 TEST_ASSERT(nextv == FALSE);
815                 break;
816             case 9:
817                 TEST_ASSERT(nextv == FALSE);
818                 break;
819             }
820         }
821         it.reset();  // prepare to run the iteration again.
822     }
823 }
824 
825 
826 
827 
TestStrings()828 void UnicodeSetTest::TestStrings() {
829     UErrorCode ec = U_ZERO_ERROR;
830 
831     UnicodeSet* testList[] = {
832         UnicodeSet::createFromAll("abc"),
833         new UnicodeSet("[a-c]", ec),
834 
835         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
836         new UnicodeSet("[{ll}{ch}a-z]", ec),
837 
838         UnicodeSet::createFrom("ab}c"),
839         new UnicodeSet("[{ab\\}c}]", ec),
840 
841         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
842         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
843 
844         NULL
845     };
846 
847     if (U_FAILURE(ec)) {
848         errln("FAIL: couldn't construct test sets");
849     }
850 
851     for (int32_t i = 0; testList[i] != NULL; i+=2) {
852         if (U_SUCCESS(ec)) {
853             UnicodeString pat0, pat1;
854             testList[i]->toPattern(pat0, TRUE);
855             testList[i+1]->toPattern(pat1, TRUE);
856             if (*testList[i] == *testList[i+1]) {
857                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
858             } else {
859                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
860             }
861         }
862         delete testList[i];
863         delete testList[i+1];
864     }
865 }
866 
867 /**
868  * Test the [:Latin:] syntax.
869  */
TestScriptSet()870 void UnicodeSetTest::TestScriptSet() {
871     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
872 
873     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
874 
875     /* Jitterbug 1423 */
876     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
877 
878 }
879 
880 /**
881  * Test the [:Latin:] syntax.
882  */
TestPropertySet()883 void UnicodeSetTest::TestPropertySet() {
884     static const char* const DATA[] = {
885         // Pattern, Chars IN, Chars NOT in
886 
887         "[:Latin:]",
888         "aA",
889         "\\u0391\\u03B1",
890 
891         "[\\p{Greek}]",
892         "\\u0391\\u03B1",
893         "aA",
894 
895         "\\P{ GENERAL Category = upper case letter }",
896         "abc",
897         "ABC",
898 
899 #if !UCONFIG_NO_NORMALIZATION
900         // Combining class: @since ICU 2.2
901         // Check both symbolic and numeric
902         "\\p{ccc=Nukta}",
903         "\\u0ABC",
904         "abc",
905 
906         "\\p{Canonical Combining Class = 11}",
907         "\\u05B1",
908         "\\u05B2",
909 
910         "[:c c c = iota subscript :]",
911         "\\u0345",
912         "xyz",
913 #endif
914 
915         // Bidi class: @since ICU 2.2
916         "\\p{bidiclass=lefttoright}",
917         "abc",
918         "\\u0671\\u0672",
919 
920         // Binary properties: @since ICU 2.2
921         "\\p{ideographic}",
922         "\\u4E0A",
923         "x",
924 
925         "[:math=false:]",
926         "q)*(",
927         // weiv: )(and * were removed from math in Unicode 4.0.1
928         //"(*+)",
929         "+<>^",
930 
931         // JB#1767 \N{}, \p{ASCII}
932         "[:Ascii:]",
933         "abc\\u0000\\u007F",
934         "\\u0080\\u4E00",
935 
936         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
937         "az",
938         "qrs",
939 
940         // JB#2015
941         "[:any:]",
942         "a\\U0010FFFF",
943         "",
944 
945         "[:nv=0.5:]",
946         "\\u00BD\\u0F2A",
947         "\\u00BC",
948 
949         // JB#2653: Age
950         "[:Age=1.1:]",
951         "\\u03D6", // 1.1
952         "\\u03D8\\u03D9", // 3.2
953 
954         "[:Age=3.1:]",
955         "\\u1800\\u3400\\U0002f800",
956         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
957 
958         // JB#2350: Case_Sensitive
959         "[:Case Sensitive:]",
960         "A\\u1FFC\\U00010410",
961         ";\\u00B4\\U00010500",
962 
963         // JB#2832: C99-compatibility props
964         "[:blank:]",
965         " \\u0009",
966         "1-9A-Z",
967 
968         "[:graph:]",
969         "19AZ",
970         " \\u0003\\u0007\\u0009\\u000A\\u000D",
971 
972         "[:punct:]",
973         "!@#%&*()[]{}-_\\/;:,.?'\"",
974         "09azAZ",
975 
976         "[:xdigit:]",
977         "09afAF",
978         "gG!",
979 
980         // Regex compatibility test
981         "[-b]", // leading '-' is literal
982         "-b",
983         "ac",
984 
985         "[^-b]", // leading '-' is literal
986         "ac",
987         "-b",
988 
989         "[b-]", // trailing '-' is literal
990         "-b",
991         "ac",
992 
993         "[^b-]", // trailing '-' is literal
994         "ac",
995         "-b",
996 
997         "[a-b-]", // trailing '-' is literal
998         "ab-",
999         "c=",
1000 
1001         "[[a-q]&[p-z]-]", // trailing '-' is literal
1002         "pq-",
1003         "or=",
1004 
1005         "[\\s|\\)|:|$|\\>]", // from regex tests
1006         "s|):$>",
1007         "abc",
1008 
1009         "[\\uDC00cd]", // JB#2906: isolated trail at start
1010         "cd\\uDC00",
1011         "ab\\uD800\\U00010000",
1012 
1013         "[ab\\uD800]", // JB#2906: isolated trail at start
1014         "ab\\uD800",
1015         "cd\\uDC00\\U00010000",
1016 
1017         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1018         "abcd\\uD800",
1019         "ef\\uDC00\\U00010000",
1020 
1021         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1022         "abcd\\uDC00",
1023         "ef\\uD800\\U00010000",
1024 
1025 #if !UCONFIG_NO_NORMALIZATION
1026         "[:^lccc=0:]", // Lead canonical class
1027         "\\u0300\\u0301",
1028         "abcd\\u00c0\\u00c5",
1029 
1030         "[:^tccc=0:]", // Trail canonical class
1031         "\\u0300\\u0301\\u00c0\\u00c5",
1032         "abcd",
1033 
1034         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1035         "\\u0300\\u0301\\u00c0\\u00c5",
1036         "abcd",
1037 
1038         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1039         "",
1040         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1041 
1042         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1043         "\\u0F73\\u0F75\\u0F81",
1044         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1045 #endif /* !UCONFIG_NO_NORMALIZATION */
1046 
1047         "[:Assigned:]",
1048         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1049         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1050 
1051         // Script_Extensions, new in Unicode 6.0
1052         "[:scx=Arab:]",
1053         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1054         "\\u061D\\uFDEF\\uFDFE",
1055 
1056         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1057         // so scx-sc is missing U+FDF2.
1058         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1059         "\\u0640\\u064B\\u0650\\u0655",
1060         "\\uFDF2"
1061     };
1062 
1063     static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1064 
1065     for (int32_t i=0; i<DATA_LEN; i+=3) {
1066         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1067                           CharsToUnicodeString(DATA[i+2]));
1068     }
1069 }
1070 
1071 /**
1072   * Test that Posix style character classes [:digit:], etc.
1073   *   have the Unicode definitions from TR 18.
1074   */
TestPosixClasses()1075 void UnicodeSetTest::TestPosixClasses() {
1076     {
1077         UErrorCode status = U_ZERO_ERROR;
1078         UnicodeSet s1("[:alpha:]", status);
1079         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1080         TEST_ASSERT_SUCCESS(status);
1081         TEST_ASSERT(s1==s2);
1082     }
1083     {
1084         UErrorCode status = U_ZERO_ERROR;
1085         UnicodeSet s1("[:lower:]", status);
1086         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1087         TEST_ASSERT_SUCCESS(status);
1088         TEST_ASSERT(s1==s2);
1089     }
1090     {
1091         UErrorCode status = U_ZERO_ERROR;
1092         UnicodeSet s1("[:upper:]", status);
1093         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1094         TEST_ASSERT_SUCCESS(status);
1095         TEST_ASSERT(s1==s2);
1096     }
1097     {
1098         UErrorCode status = U_ZERO_ERROR;
1099         UnicodeSet s1("[:punct:]", status);
1100         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1101         TEST_ASSERT_SUCCESS(status);
1102         TEST_ASSERT(s1==s2);
1103     }
1104     {
1105         UErrorCode status = U_ZERO_ERROR;
1106         UnicodeSet s1("[:digit:]", status);
1107         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1108         TEST_ASSERT_SUCCESS(status);
1109         TEST_ASSERT(s1==s2);
1110     }
1111     {
1112         UErrorCode status = U_ZERO_ERROR;
1113         UnicodeSet s1("[:xdigit:]", status);
1114         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1115         TEST_ASSERT_SUCCESS(status);
1116         TEST_ASSERT(s1==s2);
1117     }
1118     {
1119         UErrorCode status = U_ZERO_ERROR;
1120         UnicodeSet s1("[:alnum:]", status);
1121         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1122         TEST_ASSERT_SUCCESS(status);
1123         TEST_ASSERT(s1==s2);
1124     }
1125     {
1126         UErrorCode status = U_ZERO_ERROR;
1127         UnicodeSet s1("[:space:]", status);
1128         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1129         TEST_ASSERT_SUCCESS(status);
1130         TEST_ASSERT(s1==s2);
1131     }
1132     {
1133         UErrorCode status = U_ZERO_ERROR;
1134         UnicodeSet s1("[:blank:]", status);
1135         TEST_ASSERT_SUCCESS(status);
1136         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1137             status);
1138         TEST_ASSERT_SUCCESS(status);
1139         TEST_ASSERT(s1==s2);
1140     }
1141     {
1142         UErrorCode status = U_ZERO_ERROR;
1143         UnicodeSet s1("[:cntrl:]", status);
1144         TEST_ASSERT_SUCCESS(status);
1145         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1146         TEST_ASSERT_SUCCESS(status);
1147         TEST_ASSERT(s1==s2);
1148     }
1149     {
1150         UErrorCode status = U_ZERO_ERROR;
1151         UnicodeSet s1("[:graph:]", status);
1152         TEST_ASSERT_SUCCESS(status);
1153         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1154         TEST_ASSERT_SUCCESS(status);
1155         TEST_ASSERT(s1==s2);
1156     }
1157     {
1158         UErrorCode status = U_ZERO_ERROR;
1159         UnicodeSet s1("[:print:]", status);
1160         TEST_ASSERT_SUCCESS(status);
1161         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1162         TEST_ASSERT_SUCCESS(status);
1163         TEST_ASSERT(s1==s2);
1164     }
1165 }
1166 /**
1167  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1168  */
TestClone()1169 void UnicodeSetTest::TestClone() {
1170     UErrorCode ec = U_ZERO_ERROR;
1171     UnicodeSet s("[abcxyz]", ec);
1172     UnicodeSet t(s);
1173     expectContainment(t, "abc", "def");
1174 }
1175 
1176 /**
1177  * Test the indexOf() and charAt() methods.
1178  */
TestIndexOf()1179 void UnicodeSetTest::TestIndexOf() {
1180     UErrorCode ec = U_ZERO_ERROR;
1181     UnicodeSet set("[a-cx-y3578]", ec);
1182     if (U_FAILURE(ec)) {
1183         errln("FAIL: UnicodeSet constructor");
1184         return;
1185     }
1186     for (int32_t i=0; i<set.size(); ++i) {
1187         UChar32 c = set.charAt(i);
1188         if (set.indexOf(c) != i) {
1189             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1190                 i, c, set.indexOf(c));
1191         }
1192     }
1193     UChar32 c = set.charAt(set.size());
1194     if (c != -1) {
1195         errln("FAIL: charAt(<out of range>) = %X", c);
1196     }
1197     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1198     if (j != -1) {
1199         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1200     }
1201 }
1202 
1203 /**
1204  * Test closure API.
1205  */
TestCloseOver()1206 void UnicodeSetTest::TestCloseOver() {
1207     UErrorCode ec = U_ZERO_ERROR;
1208 
1209     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1210     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1211     const char* DATA[] = {
1212         // selector, input, output
1213         CASE,
1214         "[aq\\u00DF{Bc}{bC}{Fi}]",
1215         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1216 
1217         CASE,
1218         "[\\u01F1]", // 'DZ'
1219         "[\\u01F1\\u01F2\\u01F3]",
1220 
1221         CASE,
1222         "[\\u1FB4]",
1223         "[\\u1FB4{\\u03AC\\u03B9}]",
1224 
1225         CASE,
1226         "[{F\\uFB01}]",
1227         "[\\uFB03{ffi}]",
1228 
1229         CASE, // make sure binary search finds limits
1230         "[a\\uFF3A]",
1231         "[aA\\uFF3A\\uFF5A]",
1232 
1233         CASE,
1234         "[a-z]","[A-Za-z\\u017F\\u212A]",
1235         CASE,
1236         "[abc]","[A-Ca-c]",
1237         CASE,
1238         "[ABC]","[A-Ca-c]",
1239 
1240         CASE, "[i]", "[iI]",
1241 
1242         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1243         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1244 
1245         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1246 
1247         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1248 
1249         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1250 
1251         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1252 
1253         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1254 
1255         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1256 
1257         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1258         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1259 
1260         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1261 
1262         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1263 
1264         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1265 
1266 #if !UCONFIG_NO_FILE_IO
1267         CASE_MAPPINGS,
1268         "[aq\\u00DF{Bc}{bC}{Fi}]",
1269         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1270 #endif
1271 
1272         CASE_MAPPINGS,
1273         "[\\u01F1]", // 'DZ'
1274         "[\\u01F1\\u01F2\\u01F3]",
1275 
1276         CASE_MAPPINGS,
1277         "[a-z]",
1278         "[A-Za-z]",
1279 
1280         NULL
1281     };
1282 
1283     UnicodeSet s;
1284     UnicodeSet t;
1285     UnicodeString buf;
1286     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1287         int32_t selector = DATA[i][0];
1288         UnicodeString pat(DATA[i+1], -1, US_INV);
1289         UnicodeString exp(DATA[i+2], -1, US_INV);
1290         s.applyPattern(pat, ec);
1291         s.closeOver(selector);
1292         t.applyPattern(exp, ec);
1293         if (U_FAILURE(ec)) {
1294             errln("FAIL: applyPattern failed");
1295             continue;
1296         }
1297         if (s == t) {
1298             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1299         } else {
1300             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1301                   s.toPattern(buf, TRUE) + ", expected " + exp);
1302         }
1303     }
1304 
1305 #if 0
1306     /*
1307      * Unused test code.
1308      * This was used to compare the old implementation (using USET_CASE)
1309      * with the new one (using 0x100 temporarily)
1310      * while transitioning from hardcoded case closure tables in uniset.cpp
1311      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1312      * and using ucase.c functions for closure.
1313      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1314      *
1315      * Note: The old and new implementation never fully matched because
1316      * the old implementation turned out to not map U+0130 and U+0131 correctly
1317      * (dotted I and dotless i) and because the old implementation's data tables
1318      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1319      * new implementation. (So sigmas and some other characters were not handled
1320      * according to the newer Unicode version.)
1321      */
1322     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1323     UnicodeSetIterator si(sens);
1324     UnicodeString str, buf2;
1325     const UnicodeString *pStr;
1326     UChar32 c;
1327     while(si.next()) {
1328         if(!si.isString()) {
1329             c=si.getCodepoint();
1330             s.clear();
1331             s.add(c);
1332 
1333             str.setTo(c);
1334             str.foldCase();
1335             sens2.add(str);
1336 
1337             t=s;
1338             s.closeOver(USET_CASE);
1339             t.closeOver(0x100);
1340             if(s!=t) {
1341                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1342                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1343             }
1344         }
1345     }
1346     // remove all code points
1347     // should contain all full case folding mapping strings
1348     sens2.remove(0, 0x10ffff);
1349     si.reset(sens2);
1350     while(si.next()) {
1351         if(si.isString()) {
1352             pStr=&si.getString();
1353             s.clear();
1354             s.add(*pStr);
1355             t=s2=s;
1356             s.closeOver(USET_CASE);
1357             t.closeOver(0x100);
1358             if(s!=t) {
1359                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1360                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1361             }
1362         }
1363     }
1364 #endif
1365 
1366     // Test the pattern API
1367     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1368     if (U_FAILURE(ec)) {
1369         errln("FAIL: applyPattern failed");
1370     } else {
1371         expectContainment(s, "abcABC", "defDEF");
1372     }
1373     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1374     if (U_FAILURE(ec)) {
1375         errln("FAIL: constructor failed");
1376     } else {
1377         expectContainment(v, "defDEF", "abcABC");
1378     }
1379     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1380     if (U_FAILURE(ec)) {
1381         errln("FAIL: construct w/case mappings failed");
1382     } else {
1383         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1384     }
1385 }
1386 
TestEscapePattern()1387 void UnicodeSetTest::TestEscapePattern() {
1388     const char pattern[] =
1389         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1390     const char exp[] =
1391         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1392     // We test this with two passes; in the second pass we
1393     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1394     // this fails -- which is what we expect.
1395     for (int32_t pass=1; pass<=2; ++pass) {
1396         UErrorCode ec = U_ZERO_ERROR;
1397         UnicodeString pat(pattern, -1, US_INV);
1398         if (pass==2) {
1399             pat = pat.unescape();
1400         }
1401         // Pattern is only good for pass 1
1402         UBool isPatternValid = (pass==1);
1403 
1404         UnicodeSet set(pat, ec);
1405         if (U_SUCCESS(ec) != isPatternValid){
1406             errln((UnicodeString)"FAIL: applyPattern(" +
1407                   escape(pat) + ") => " +
1408                   u_errorName(ec));
1409             continue;
1410         }
1411         if (U_FAILURE(ec)) {
1412             continue;
1413         }
1414         if (set.contains((UChar)0x0644)){
1415             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1416         }
1417 
1418         UnicodeString newpat;
1419         set.toPattern(newpat, TRUE);
1420         if (newpat == UnicodeString(exp, -1, US_INV)) {
1421             logln(escape(pat) + " => " + newpat);
1422         } else {
1423             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1424         }
1425 
1426         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1427             UnicodeString str("Range ");
1428             str.append((UChar)(0x30 + i))
1429                 .append(": ")
1430                 .append((UChar32)set.getRangeStart(i))
1431                 .append(" - ")
1432                 .append((UChar32)set.getRangeEnd(i));
1433             str = str + " (" + set.getRangeStart(i) + " - " +
1434                 set.getRangeEnd(i) + ")";
1435             if (set.getRangeStart(i) < 0) {
1436                 errln((UnicodeString)"FAIL: " + escape(str));
1437             } else {
1438                 logln(escape(str));
1439             }
1440         }
1441     }
1442 }
1443 
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1444 void UnicodeSetTest::expectRange(const UnicodeString& label,
1445                                  const UnicodeSet& set,
1446                                  UChar32 start, UChar32 end) {
1447     UnicodeSet exp(start, end);
1448     UnicodeString pat;
1449     if (set == exp) {
1450         logln(label + " => " + set.toPattern(pat, TRUE));
1451     } else {
1452         UnicodeString xpat;
1453         errln((UnicodeString)"FAIL: " + label + " => " +
1454               set.toPattern(pat, TRUE) +
1455               ", expected " + exp.toPattern(xpat, TRUE));
1456     }
1457 }
1458 
TestInvalidCodePoint()1459 void UnicodeSetTest::TestInvalidCodePoint() {
1460 
1461     const UChar32 DATA[] = {
1462         // Test range             Expected range
1463         0, 0x10FFFF,              0, 0x10FFFF,
1464         (UChar32)-1, 8,           0, 8,
1465         8, 0x110000,              8, 0x10FFFF
1466     };
1467     const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1468 
1469     UnicodeString pat;
1470     int32_t i;
1471 
1472     for (i=0; i<DATA_LENGTH; i+=4) {
1473         UChar32 start  = DATA[i];
1474         UChar32 end    = DATA[i+1];
1475         UChar32 xstart = DATA[i+2];
1476         UChar32 xend   = DATA[i+3];
1477 
1478         // Try various API using the test code points
1479 
1480         UnicodeSet set(start, end);
1481         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1482                     set, xstart, xend);
1483 
1484         set.clear();
1485         set.set(start, end);
1486         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1487                     set, xstart, xend);
1488 
1489         UBool b = set.contains(start);
1490         b = set.contains(start, end);
1491         b = set.containsNone(start, end);
1492         b = set.containsSome(start, end);
1493         (void)b;   // Suppress set but not used warning.
1494 
1495         /*int32_t index = set.indexOf(start);*/
1496 
1497         set.clear();
1498         set.add(start);
1499         set.add(start, end);
1500         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1501                     set, xstart, xend);
1502 
1503         set.set(0, 0x10FFFF);
1504         set.retain(start, end);
1505         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1506                     set, xstart, xend);
1507         set.retain(start);
1508 
1509         set.set(0, 0x10FFFF);
1510         set.remove(start);
1511         set.remove(start, end);
1512         set.complement();
1513         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1514                     set, xstart, xend);
1515 
1516         set.set(0, 0x10FFFF);
1517         set.complement(start, end);
1518         set.complement();
1519         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1520                     set, xstart, xend);
1521         set.complement(start);
1522     }
1523 
1524     const UChar32 DATA2[] = {
1525         0,
1526         0x10FFFF,
1527         (UChar32)-1,
1528         0x110000
1529     };
1530     const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1531 
1532     for (i=0; i<DATA2_LENGTH; ++i) {
1533         UChar32 c = DATA2[i], end = 0x10FFFF;
1534         UBool valid = (c >= 0 && c <= 0x10FFFF);
1535 
1536         UnicodeSet set(0, 0x10FFFF);
1537 
1538         // For single-codepoint contains, invalid codepoints are NOT contained
1539         UBool b = set.contains(c);
1540         if (b == valid) {
1541             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1542                   ") = " + b);
1543         } else {
1544             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1545                   ") = " + b);
1546         }
1547 
1548         // For codepoint range contains, containsNone, and containsSome,
1549         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1550         b = set.contains(c, end);
1551         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1552               "," + end + ") = " + b);
1553 
1554         b = set.containsNone(c, end);
1555         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1556               "," + end + ") = " + b);
1557 
1558         b = set.containsSome(c, end);
1559         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1560               "," + end + ") = " + b);
1561 
1562         int32_t index = set.indexOf(c);
1563         if ((index >= 0) == valid) {
1564             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1565                   ") = " + index);
1566         } else {
1567             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1568                   ") = " + index);
1569         }
1570     }
1571 }
1572 
1573 // Used by TestSymbolTable
1574 class TokenSymbolTable : public SymbolTable {
1575 public:
1576     Hashtable contents;
1577 
TokenSymbolTable(UErrorCode & ec)1578     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1579         contents.setValueDeleter(uprv_deleteUObject);
1580     }
1581 
~TokenSymbolTable()1582     ~TokenSymbolTable() {}
1583 
1584     /**
1585      * (Non-SymbolTable API) Add the given variable and value to
1586      * the table.  Variable should NOT contain leading '$'.
1587      */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1588     void add(const UnicodeString& var, const UnicodeString& value,
1589              UErrorCode& ec) {
1590         if (U_SUCCESS(ec)) {
1591             contents.put(var, new UnicodeString(value), ec);
1592         }
1593     }
1594 
1595     /**
1596      * SymbolTable API
1597      */
lookup(const UnicodeString & s) const1598     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1599         return (const UnicodeString*) contents.get(s);
1600     }
1601 
1602     /**
1603      * SymbolTable API
1604      */
lookupMatcher(UChar32) const1605     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1606         return NULL;
1607     }
1608 
1609     /**
1610      * SymbolTable API
1611      */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1612     virtual UnicodeString parseReference(const UnicodeString& text,
1613                                          ParsePosition& pos, int32_t limit) const {
1614         int32_t start = pos.getIndex();
1615         int32_t i = start;
1616         UnicodeString result;
1617         while (i < limit) {
1618             UChar c = text.charAt(i);
1619             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1620                 break;
1621             }
1622             ++i;
1623         }
1624         if (i == start) { // No valid name chars
1625             return result; // Indicate failure with empty string
1626         }
1627         pos.setIndex(i);
1628         text.extractBetween(start, i, result);
1629         return result;
1630     }
1631 };
1632 
TestSymbolTable()1633 void UnicodeSetTest::TestSymbolTable() {
1634     // Multiple test cases can be set up here.  Each test case
1635     // is terminated by null:
1636     // var, value, var, value,..., input pat., exp. output pat., null
1637     const char* DATA[] = {
1638         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1639         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1640         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1641         NULL
1642     };
1643 
1644     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1645         UErrorCode ec = U_ZERO_ERROR;
1646         TokenSymbolTable sym(ec);
1647         if (U_FAILURE(ec)) {
1648             errln("FAIL: couldn't construct TokenSymbolTable");
1649             continue;
1650         }
1651 
1652         // Set up variables
1653         while (DATA[i+2] != NULL) {
1654             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1655             if (U_FAILURE(ec)) {
1656                 errln("FAIL: couldn't add to TokenSymbolTable");
1657                 continue;
1658             }
1659             i += 2;
1660         }
1661 
1662         // Input pattern and expected output pattern
1663         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1664         i += 2;
1665 
1666         ParsePosition pos(0);
1667         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1668         if (U_FAILURE(ec)) {
1669             errln("FAIL: couldn't construct UnicodeSet");
1670             continue;
1671         }
1672 
1673         // results
1674         if (pos.getIndex() != inpat.length()) {
1675             errln((UnicodeString)"Failed to read to end of string \""
1676                   + inpat + "\": read to "
1677                   + pos.getIndex() + ", length is "
1678                   + inpat.length());
1679         }
1680 
1681         UnicodeSet us2(exppat, ec);
1682         if (U_FAILURE(ec)) {
1683             errln("FAIL: couldn't construct expected UnicodeSet");
1684             continue;
1685         }
1686 
1687         UnicodeString a, b;
1688         if (us != us2) {
1689             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1690                   ", expected " + us2.toPattern(b, TRUE));
1691         } else {
1692             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1693         }
1694     }
1695 }
1696 
TestSurrogate()1697 void UnicodeSetTest::TestSurrogate() {
1698     const char* DATA[] = {
1699         // These should all behave identically
1700         "[abc\\uD800\\uDC00]",
1701         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1702         "[abc\\U00010000]",
1703         0
1704     };
1705     for (int i=0; DATA[i] != 0; ++i) {
1706         UErrorCode ec = U_ZERO_ERROR;
1707         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1708         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1709         UnicodeSet set(str, ec);
1710         if (U_FAILURE(ec)) {
1711             errln("FAIL: UnicodeSet constructor");
1712             continue;
1713         }
1714         expectContainment(set,
1715                           CharsToUnicodeString("abc\\U00010000"),
1716                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1717         if (set.size() != 4) {
1718             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1719                   set.size() + ", expected 4");
1720         }
1721 
1722         {
1723           UErrorCode subErr = U_ZERO_ERROR;
1724           checkRoundTrip(set);
1725           checkSerializeRoundTrip(set, subErr);
1726         }
1727     }
1728 }
1729 
TestExhaustive()1730 void UnicodeSetTest::TestExhaustive() {
1731     // exhaustive tests. Simulate UnicodeSets with integers.
1732     // That gives us very solid tests (except for large memory tests).
1733 
1734     int32_t limit = 128;
1735 
1736     UnicodeSet x, y, z, aa;
1737 
1738     for (int32_t i = 0; i < limit; ++i) {
1739         bitsToSet(i, x);
1740         logln((UnicodeString)"Testing " + i + ", " + x);
1741         _testComplement(i, x, y);
1742 
1743         UnicodeSet &toTest = bitsToSet(i, aa);
1744 
1745         // AS LONG AS WE ARE HERE, check roundtrip
1746         checkRoundTrip(toTest);
1747         UErrorCode ec = U_ZERO_ERROR;
1748         checkSerializeRoundTrip(toTest, ec);
1749 
1750         for (int32_t j = 0; j < limit; ++j) {
1751             _testAdd(i,j,  x,y,z);
1752             _testXor(i,j,  x,y,z);
1753             _testRetain(i,j,  x,y,z);
1754             _testRemove(i,j,  x,y,z);
1755         }
1756     }
1757 }
1758 
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1759 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1760     bitsToSet(a, x);
1761     z = x;
1762     z.complement();
1763     int32_t c = setToBits(z);
1764     if (c != (~a)) {
1765         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1766         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1767     }
1768     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1769 }
1770 
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1771 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1772     bitsToSet(a, x);
1773     bitsToSet(b, y);
1774     z = x;
1775     z.addAll(y);
1776     int32_t c = setToBits(z);
1777     if (c != (a | b)) {
1778         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1779         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1780     }
1781     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1782 }
1783 
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1784 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1785     bitsToSet(a, x);
1786     bitsToSet(b, y);
1787     z = x;
1788     z.retainAll(y);
1789     int32_t c = setToBits(z);
1790     if (c != (a & b)) {
1791         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1792         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1793     }
1794     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1795 }
1796 
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1797 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1798     bitsToSet(a, x);
1799     bitsToSet(b, y);
1800     z = x;
1801     z.removeAll(y);
1802     int32_t c = setToBits(z);
1803     if (c != (a &~ b)) {
1804         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1805         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1806     }
1807     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1808 }
1809 
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1810 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1811     bitsToSet(a, x);
1812     bitsToSet(b, y);
1813     z = x;
1814     z.complementAll(y);
1815     int32_t c = setToBits(z);
1816     if (c != (a ^ b)) {
1817         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1818         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1819     }
1820     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1821 }
1822 
1823 /**
1824  * Check that ranges are monotonically increasing and non-
1825  * overlapping.
1826  */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1827 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1828     int32_t n = set.getRangeCount();
1829     if (n < 0) {
1830         errln((UnicodeString)"FAIL result of " + msg +
1831               ": range count should be >= 0 but is " +
1832               n /*+ " for " + set.toPattern())*/);
1833         return;
1834     }
1835     UChar32 last = 0;
1836     for (int32_t i=0; i<n; ++i) {
1837         UChar32 start = set.getRangeStart(i);
1838         UChar32 end = set.getRangeEnd(i);
1839         if (start > end) {
1840             errln((UnicodeString)"FAIL result of " + msg +
1841                   ": range " + (i+1) +
1842                   " start > end: " + (int)start + ", " + (int)end +
1843                   " for " + set);
1844         }
1845         if (i > 0 && start <= last) {
1846             errln((UnicodeString)"FAIL result of " + msg +
1847                   ": range " + (i+1) +
1848                   " overlaps previous range: " + (int)start + ", " + (int)end +
1849                   " for " + set);
1850         }
1851         last = end;
1852     }
1853 }
1854 
1855 /**
1856  * Convert a bitmask to a UnicodeSet.
1857  */
bitsToSet(int32_t a,UnicodeSet & result)1858 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1859     result.clear();
1860     for (UChar32 i = 0; i < 32; ++i) {
1861         if ((a & (1<<i)) != 0) {
1862             result.add(i);
1863         }
1864     }
1865     return result;
1866 }
1867 
1868 /**
1869  * Convert a UnicodeSet to a bitmask.  Only the characters
1870  * U+0000 to U+0020 are represented in the bitmask.
1871  */
setToBits(const UnicodeSet & x)1872 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1873     int32_t result = 0;
1874     for (int32_t i = 0; i < 32; ++i) {
1875         if (x.contains((UChar32)i)) {
1876             result |= (1<<i);
1877         }
1878     }
1879     return result;
1880 }
1881 
1882 /**
1883  * Return the representation of an inversion list based UnicodeSet
1884  * as a pairs list.  Ranges are listed in ascending Unicode order.
1885  * For example, the set [a-zA-M3] is represented as "33AMaz".
1886  */
getPairs(const UnicodeSet & set)1887 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1888     UnicodeString pairs;
1889     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1890         UChar32 start = set.getRangeStart(i);
1891         UChar32 end = set.getRangeEnd(i);
1892         if (end > 0xFFFF) {
1893             end = 0xFFFF;
1894             i = set.getRangeCount(); // Should be unnecessary
1895         }
1896         pairs.append((UChar)start).append((UChar)end);
1897     }
1898     return pairs;
1899 }
1900 
1901 /**
1902  * Basic consistency check for a few items.
1903  * That the iterator works, and that we can create a pattern and
1904  * get the same thing back
1905  */
checkRoundTrip(const UnicodeSet & s)1906 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1907     {
1908         UnicodeSet t(s);
1909         checkEqual(s, t, "copy ct");
1910     }
1911 
1912     {
1913         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
1914         t = s;
1915         checkEqual(s, t, "operator=");
1916     }
1917 
1918     {
1919         UnicodeSet t;
1920         copyWithIterator(t, s, FALSE);
1921         checkEqual(s, t, "iterator roundtrip");
1922     }
1923 
1924     {
1925         UnicodeSet t;
1926         copyWithIterator(t, s, TRUE); // try range
1927         checkEqual(s, t, "iterator roundtrip");
1928     }
1929 
1930     {
1931         UnicodeSet t;
1932         UnicodeString pat;
1933         UErrorCode ec = U_ZERO_ERROR;
1934         s.toPattern(pat, FALSE);
1935         t.applyPattern(pat, ec);
1936         if (U_FAILURE(ec)) {
1937             errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1938             return;
1939         } else {
1940             checkEqual(s, t, "toPattern(false)");
1941         }
1942     }
1943 
1944     {
1945         UnicodeSet t;
1946         UnicodeString pat;
1947         UErrorCode ec = U_ZERO_ERROR;
1948         s.toPattern(pat, TRUE);
1949         t.applyPattern(pat, ec);
1950         if (U_FAILURE(ec)) {
1951             errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1952             return;
1953         } else {
1954             checkEqual(s, t, "toPattern(true)");
1955         }
1956     }
1957 }
1958 
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)1959 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1960   if(U_FAILURE(status)) return;
1961   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1962   if(status == U_BUFFER_OVERFLOW_ERROR) {
1963     status = U_ZERO_ERROR;
1964     serializeBuffer.resize(len);
1965     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1966     // let 2nd error stand
1967   }
1968   if(U_FAILURE(status)) {
1969     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
1970     return;
1971   }
1972   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
1973   if(U_FAILURE(status)) {
1974     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
1975     return;
1976   }
1977 
1978   checkEqual(t, deserialized, "Set was unequal when deserialized");
1979 }
1980 
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)1981 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1982     t.clear();
1983     UnicodeSetIterator it(s);
1984     if (withRange) {
1985         while (it.nextRange()) {
1986             if (it.isString()) {
1987                 t.add(it.getString());
1988             } else {
1989                 t.add(it.getCodepoint(), it.getCodepointEnd());
1990             }
1991         }
1992     } else {
1993         while (it.next()) {
1994             if (it.isString()) {
1995                 t.add(it.getString());
1996             } else {
1997                 t.add(it.getCodepoint());
1998             }
1999         }
2000     }
2001 }
2002 
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2003 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2004   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2005   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2006     UnicodeString source; s.toPattern(source, TRUE);
2007     UnicodeString result; t.toPattern(result, TRUE);
2008     if (s != t) {
2009         errln((UnicodeString)"FAIL: " + message
2010               + "; source = " + source
2011               + "; result = " + result
2012               );
2013         return FALSE;
2014     } else {
2015         logln((UnicodeString)"Ok: " + message
2016               + "; source = " + source
2017               + "; result = " + result
2018               );
2019     }
2020     return TRUE;
2021 }
2022 
2023 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2024 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2025                                   const UnicodeString& charsIn,
2026                                   const UnicodeString& charsOut) {
2027     UErrorCode ec = U_ZERO_ERROR;
2028     UnicodeSet set(pat, ec);
2029     if (U_FAILURE(ec)) {
2030         dataerrln((UnicodeString)"FAIL: pattern \"" +
2031               pat + "\" => " + u_errorName(ec));
2032         return;
2033     }
2034     expectContainment(set, pat, charsIn, charsOut);
2035 }
2036 
2037 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2038 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2039                                   const UnicodeString& charsIn,
2040                                   const UnicodeString& charsOut) {
2041     UnicodeString pat;
2042     set.toPattern(pat);
2043     expectContainment(set, pat, charsIn, charsOut);
2044 }
2045 
2046 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2047 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2048                                   const UnicodeString& setName,
2049                                   const UnicodeString& charsIn,
2050                                   const UnicodeString& charsOut) {
2051     UnicodeString bad;
2052     UChar32 c;
2053     int32_t i;
2054 
2055     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2056         c = charsIn.char32At(i);
2057         if (!set.contains(c)) {
2058             bad.append(c);
2059         }
2060     }
2061     if (bad.length() > 0) {
2062         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2063               ", expected containment of " + prettify(charsIn));
2064     } else {
2065         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2066     }
2067 
2068     bad.truncate(0);
2069     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2070         c = charsOut.char32At(i);
2071         if (set.contains(c)) {
2072             bad.append(c);
2073         }
2074     }
2075     if (bad.length() > 0) {
2076         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2077               ", expected non-containment of " + prettify(charsOut));
2078     } else {
2079         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2080     }
2081 }
2082 
2083 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2084 UnicodeSetTest::expectPattern(UnicodeSet& set,
2085                               const UnicodeString& pattern,
2086                               const UnicodeString& expectedPairs){
2087     UErrorCode status = U_ZERO_ERROR;
2088     set.applyPattern(pattern, status);
2089     if (U_FAILURE(status)) {
2090         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2091               "\") failed");
2092         return;
2093     } else {
2094         if (getPairs(set) != expectedPairs ) {
2095             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2096                   "\") => pairs \"" +
2097                   escape(getPairs(set)) + "\", expected \"" +
2098                   escape(expectedPairs) + "\"");
2099         } else {
2100             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2101                   "\") => pairs \"" +
2102                   escape(getPairs(set)) + "\"");
2103         }
2104     }
2105     // the result of calling set.toPattern(), which is the string representation of
2106     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2107     // will produce another set that is equal to this one.
2108     UnicodeString temppattern;
2109     set.toPattern(temppattern);
2110     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2111     if (U_FAILURE(status)) {
2112         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2113         return;
2114     }
2115     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2116         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2117             escape(getPairs(set)) + "\""));
2118     } else{
2119         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2120     }
2121 
2122     delete tempset;
2123 
2124 }
2125 
2126 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2127 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2128     if (getPairs(set) != expectedPairs) {
2129         errln(UnicodeString("FAIL: Expected pair list \"") +
2130               escape(expectedPairs) + "\", got \"" +
2131               escape(getPairs(set)) + "\"");
2132     }
2133 }
2134 
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2135 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2136                                      const UnicodeString& expPat,
2137                                      const char** expStrings) {
2138     UnicodeString pat;
2139     set.toPattern(pat, TRUE);
2140     if (pat == expPat) {
2141         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2142     } else {
2143         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2144         return;
2145     }
2146     if (expStrings == NULL) {
2147         return;
2148     }
2149     UBool in = TRUE;
2150     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2151         if (expStrings[i] == NOT) { // sic; pointer comparison
2152             in = FALSE;
2153             continue;
2154         }
2155         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2156         UBool contained = set.contains(s);
2157         if (contained == in) {
2158             logln((UnicodeString)"Ok: " + expPat +
2159                   (contained ? " contains {" : " does not contain {") +
2160                   escape(expStrings[i]) + "}");
2161         } else {
2162             errln((UnicodeString)"FAIL: " + expPat +
2163                   (contained ? " contains {" : " does not contain {") +
2164                   escape(expStrings[i]) + "}");
2165         }
2166     }
2167 }
2168 
toHexString(int32_t i)2169 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2170 
2171 void
doAssert(UBool condition,const char * message)2172 UnicodeSetTest::doAssert(UBool condition, const char *message)
2173 {
2174     if (!condition) {
2175         errln(UnicodeString("ERROR : ") + message);
2176     }
2177 }
2178 
2179 UnicodeString
escape(const UnicodeString & s)2180 UnicodeSetTest::escape(const UnicodeString& s) {
2181     UnicodeString buf;
2182     for (int32_t i=0; i<s.length(); )
2183     {
2184         UChar32 c = s.char32At(i);
2185         if (0x0020 <= c && c <= 0x007F) {
2186             buf += c;
2187         } else {
2188             if (c <= 0xFFFF) {
2189                 buf += (UChar)0x5c; buf += (UChar)0x75;
2190             } else {
2191                 buf += (UChar)0x5c; buf += (UChar)0x55;
2192                 buf += toHexString((c & 0xF0000000) >> 28);
2193                 buf += toHexString((c & 0x0F000000) >> 24);
2194                 buf += toHexString((c & 0x00F00000) >> 20);
2195                 buf += toHexString((c & 0x000F0000) >> 16);
2196             }
2197             buf += toHexString((c & 0xF000) >> 12);
2198             buf += toHexString((c & 0x0F00) >> 8);
2199             buf += toHexString((c & 0x00F0) >> 4);
2200             buf += toHexString(c & 0x000F);
2201         }
2202         i += U16_LENGTH(c);
2203     }
2204     return buf;
2205 }
2206 
TestFreezable()2207 void UnicodeSetTest::TestFreezable() {
2208     UErrorCode errorCode=U_ZERO_ERROR;
2209     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2210     UnicodeSet idSet(idPattern, errorCode);
2211     if(U_FAILURE(errorCode)) {
2212         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2213         return;
2214     }
2215 
2216     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2217     UnicodeSet wsSet(wsPattern, errorCode);
2218     if(U_FAILURE(errorCode)) {
2219         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2220         return;
2221     }
2222 
2223     idSet.add(idPattern);
2224     UnicodeSet frozen(idSet);
2225     frozen.freeze();
2226 
2227     if(idSet.isFrozen() || !frozen.isFrozen()) {
2228         errln("FAIL: isFrozen() is wrong");
2229     }
2230     if(frozen!=idSet || !(frozen==idSet)) {
2231         errln("FAIL: a copy-constructed frozen set differs from its original");
2232     }
2233 
2234     frozen=wsSet;
2235     if(frozen!=idSet || !(frozen==idSet)) {
2236         errln("FAIL: a frozen set was modified by operator=");
2237     }
2238 
2239     UnicodeSet frozen2(frozen);
2240     if(frozen2!=frozen || frozen2!=idSet) {
2241         errln("FAIL: a copied frozen set differs from its frozen original");
2242     }
2243     if(!frozen2.isFrozen()) {
2244         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2245     }
2246     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2247     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2248         errln("FAIL: UnicodeSet(5, 55) failed");
2249     }
2250     frozen3=frozen;
2251     if(!frozen3.isFrozen()) {
2252         errln("FAIL: copying a frozen set results in a thawed one");
2253     }
2254 
2255     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2256     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2257         errln("FAIL: clone() failed");
2258     }
2259     cloned->add(0xd802, 0xd805);
2260     if(cloned->containsSome(0xd802, 0xd805)) {
2261         errln("FAIL: unable to modify clone");
2262     }
2263     delete cloned;
2264 
2265     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2266     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2267         errln("FAIL: cloneAsThawed() failed");
2268     }
2269     thawed->add(0xd802, 0xd805);
2270     if(!thawed->contains(0xd802, 0xd805)) {
2271         errln("FAIL: unable to modify thawed clone");
2272     }
2273     delete thawed;
2274 
2275     frozen.set(5, 55);
2276     if(frozen!=idSet || !(frozen==idSet)) {
2277         errln("FAIL: UnicodeSet::set() modified a frozen set");
2278     }
2279 
2280     frozen.clear();
2281     if(frozen!=idSet || !(frozen==idSet)) {
2282         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2283     }
2284 
2285     frozen.closeOver(USET_CASE_INSENSITIVE);
2286     if(frozen!=idSet || !(frozen==idSet)) {
2287         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2288     }
2289 
2290     frozen.compact();
2291     if(frozen!=idSet || !(frozen==idSet)) {
2292         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2293     }
2294 
2295     ParsePosition pos;
2296     frozen.
2297         applyPattern(wsPattern, errorCode).
2298         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2299         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2300         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2301         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2302     if(frozen!=idSet || !(frozen==idSet)) {
2303         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2304     }
2305 
2306     frozen.
2307         add(0xd800).
2308         add(0xd802, 0xd805).
2309         add(wsPattern).
2310         addAll(idPattern).
2311         addAll(wsSet);
2312     if(frozen!=idSet || !(frozen==idSet)) {
2313         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2314     }
2315 
2316     frozen.
2317         retain(0x62).
2318         retain(0x64, 0x69).
2319         retainAll(wsPattern).
2320         retainAll(wsSet);
2321     if(frozen!=idSet || !(frozen==idSet)) {
2322         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2323     }
2324 
2325     frozen.
2326         remove(0x62).
2327         remove(0x64, 0x69).
2328         remove(idPattern).
2329         removeAll(idPattern).
2330         removeAll(idSet);
2331     if(frozen!=idSet || !(frozen==idSet)) {
2332         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2333     }
2334 
2335     frozen.
2336         complement().
2337         complement(0x62).
2338         complement(0x64, 0x69).
2339         complement(idPattern).
2340         complementAll(idPattern).
2341         complementAll(idSet);
2342     if(frozen!=idSet || !(frozen==idSet)) {
2343         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2344     }
2345 }
2346 
2347 // Test span() etc. -------------------------------------------------------- ***
2348 
2349 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2350 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2351 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2352     UErrorCode errorCode=U_ZERO_ERROR;
2353     int32_t length8=0;
2354     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2355     if(U_SUCCESS(errorCode)) {
2356         return length8;
2357     } else {
2358         // The string contains an unpaired surrogate.
2359         // Ignore this string.
2360         return 0;
2361     }
2362 }
2363 
2364 class UnicodeSetWithStringsIterator;
2365 
2366 // Make the strings in a UnicodeSet easily accessible.
2367 class UnicodeSetWithStrings {
2368 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2369     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2370             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2371         int32_t size=set.size();
2372         if(size>0 && set.charAt(size-1)<0) {
2373             // If a set's last element is not a code point, then it must contain strings.
2374             // Iterate over the set, skip all code point ranges, and cache the strings.
2375             // Convert them to UTF-8 for spanUTF8().
2376             UnicodeSetIterator iter(set);
2377             const UnicodeString *s;
2378             char *s8=utf8;
2379             int32_t length8, utf8Count=0;
2380             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2381                 if(iter.isString()) {
2382                     // Store the pointer to the set's string element
2383                     // which we happen to know is a stable pointer.
2384                     strings[stringsLength]=s=&iter.getString();
2385                     utf8Count+=
2386                         utf8Lengths[stringsLength]=length8=
2387                         appendUTF8(s->getBuffer(), s->length(),
2388                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2389                     if(length8==0) {
2390                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2391                     }
2392                     s8+=length8;
2393                     ++stringsLength;
2394                 }
2395             }
2396         }
2397     }
2398 
getSet() const2399     const UnicodeSet &getSet() const {
2400         return set;
2401     }
2402 
hasStrings() const2403     UBool hasStrings() const {
2404         return (UBool)(stringsLength>0);
2405     }
2406 
hasStringsWithSurrogates() const2407     UBool hasStringsWithSurrogates() const {
2408         return hasSurrogates;
2409     }
2410 
2411 private:
2412     friend class UnicodeSetWithStringsIterator;
2413 
2414     const UnicodeSet &set;
2415 
2416     const UnicodeString *strings[20];
2417     int32_t stringsLength;
2418     UBool hasSurrogates;
2419 
2420     char utf8[1024];
2421     int32_t utf8Lengths[20];
2422 };
2423 
2424 class UnicodeSetWithStringsIterator {
2425 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2426     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2427             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2428     }
2429 
reset()2430     void reset() {
2431         nextStringIndex=nextUTF8Start=0;
2432     }
2433 
nextString()2434     const UnicodeString *nextString() {
2435         if(nextStringIndex<fSet.stringsLength) {
2436             return fSet.strings[nextStringIndex++];
2437         } else {
2438             return NULL;
2439         }
2440     }
2441 
2442     // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2443     const char *nextUTF8(int32_t &length) {
2444         if(nextStringIndex<fSet.stringsLength) {
2445             const char *s8=fSet.utf8+nextUTF8Start;
2446             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2447             return s8;
2448         } else {
2449             length=0;
2450             return NULL;
2451         }
2452     }
2453 
2454 private:
2455     const UnicodeSetWithStrings &fSet;
2456     int32_t nextStringIndex;
2457     int32_t nextUTF8Start;
2458 };
2459 
2460 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2461 // at code point boundaries.
2462 // That is, each edge of a match must not be in the middle of a surrogate pair.
2463 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2464 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2465     s+=start;
2466     limit-=start;
2467     int32_t length=t.length();
2468     return 0==t.compare(s, length) &&
2469            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2470            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2471 }
2472 
2473 // Implement span() with contains() for comparison.
containsSpanUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2474 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2475                                  USetSpanCondition spanCondition) {
2476     const UnicodeSet &realSet(set.getSet());
2477     if(!set.hasStrings()) {
2478         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2479             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2480         }
2481 
2482         UChar32 c;
2483         int32_t start=0, prev;
2484         while((prev=start)<length) {
2485             U16_NEXT(s, start, length, c);
2486             if(realSet.contains(c)!=spanCondition) {
2487                 break;
2488             }
2489         }
2490         return prev;
2491     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2492         UnicodeSetWithStringsIterator iter(set);
2493         UChar32 c;
2494         int32_t start, next;
2495         for(start=next=0; start<length;) {
2496             U16_NEXT(s, next, length, c);
2497             if(realSet.contains(c)) {
2498                 break;
2499             }
2500             const UnicodeString *str;
2501             iter.reset();
2502             while((str=iter.nextString())!=NULL) {
2503                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2504                     // spanNeedsStrings=TRUE;
2505                     return start;
2506                 }
2507             }
2508             start=next;
2509         }
2510         return start;
2511     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2512         UnicodeSetWithStringsIterator iter(set);
2513         UChar32 c;
2514         int32_t start, next, maxSpanLimit=0;
2515         for(start=next=0; start<length;) {
2516             U16_NEXT(s, next, length, c);
2517             if(!realSet.contains(c)) {
2518                 next=start;  // Do not span this single, not-contained code point.
2519             }
2520             const UnicodeString *str;
2521             iter.reset();
2522             while((str=iter.nextString())!=NULL) {
2523                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2524                     // spanNeedsStrings=TRUE;
2525                     int32_t matchLimit=start+str->length();
2526                     if(matchLimit==length) {
2527                         return length;
2528                     }
2529                     if(spanCondition==USET_SPAN_CONTAINED) {
2530                         // Iterate for the shortest match at each position.
2531                         // Recurse for each but the shortest match.
2532                         if(next==start) {
2533                             next=matchLimit;  // First match from start.
2534                         } else {
2535                             if(matchLimit<next) {
2536                                 // Remember shortest match from start for iteration.
2537                                 int32_t temp=next;
2538                                 next=matchLimit;
2539                                 matchLimit=temp;
2540                             }
2541                             // Recurse for non-shortest match from start.
2542                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2543                                                                  USET_SPAN_CONTAINED);
2544                             if((matchLimit+spanLength)>maxSpanLimit) {
2545                                 maxSpanLimit=matchLimit+spanLength;
2546                                 if(maxSpanLimit==length) {
2547                                     return length;
2548                                 }
2549                             }
2550                         }
2551                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2552                         if(matchLimit>next) {
2553                             // Remember longest match from start.
2554                             next=matchLimit;
2555                         }
2556                     }
2557                 }
2558             }
2559             if(next==start) {
2560                 break;  // No match from start.
2561             }
2562             start=next;
2563         }
2564         if(start>maxSpanLimit) {
2565             return start;
2566         } else {
2567             return maxSpanLimit;
2568         }
2569     }
2570 }
2571 
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2572 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2573                                      USetSpanCondition spanCondition) {
2574     if(length==0) {
2575         return 0;
2576     }
2577     const UnicodeSet &realSet(set.getSet());
2578     if(!set.hasStrings()) {
2579         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2580             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2581         }
2582 
2583         UChar32 c;
2584         int32_t prev=length;
2585         do {
2586             U16_PREV(s, 0, length, c);
2587             if(realSet.contains(c)!=spanCondition) {
2588                 break;
2589             }
2590         } while((prev=length)>0);
2591         return prev;
2592     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2593         UnicodeSetWithStringsIterator iter(set);
2594         UChar32 c;
2595         int32_t prev=length, length0=length;
2596         do {
2597             U16_PREV(s, 0, length, c);
2598             if(realSet.contains(c)) {
2599                 break;
2600             }
2601             const UnicodeString *str;
2602             iter.reset();
2603             while((str=iter.nextString())!=NULL) {
2604                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2605                     // spanNeedsStrings=TRUE;
2606                     return prev;
2607                 }
2608             }
2609         } while((prev=length)>0);
2610         return prev;
2611     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2612         UnicodeSetWithStringsIterator iter(set);
2613         UChar32 c;
2614         int32_t prev=length, minSpanStart=length, length0=length;
2615         do {
2616             U16_PREV(s, 0, length, c);
2617             if(!realSet.contains(c)) {
2618                 length=prev;  // Do not span this single, not-contained code point.
2619             }
2620             const UnicodeString *str;
2621             iter.reset();
2622             while((str=iter.nextString())!=NULL) {
2623                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2624                     // spanNeedsStrings=TRUE;
2625                     int32_t matchStart=prev-str->length();
2626                     if(matchStart==0) {
2627                         return 0;
2628                     }
2629                     if(spanCondition==USET_SPAN_CONTAINED) {
2630                         // Iterate for the shortest match at each position.
2631                         // Recurse for each but the shortest match.
2632                         if(length==prev) {
2633                             length=matchStart;  // First match from prev.
2634                         } else {
2635                             if(matchStart>length) {
2636                                 // Remember shortest match from prev for iteration.
2637                                 int32_t temp=length;
2638                                 length=matchStart;
2639                                 matchStart=temp;
2640                             }
2641                             // Recurse for non-shortest match from prev.
2642                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2643                                                                     USET_SPAN_CONTAINED);
2644                             if(spanStart<minSpanStart) {
2645                                 minSpanStart=spanStart;
2646                                 if(minSpanStart==0) {
2647                                     return 0;
2648                                 }
2649                             }
2650                         }
2651                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2652                         if(matchStart<length) {
2653                             // Remember longest match from prev.
2654                             length=matchStart;
2655                         }
2656                     }
2657                 }
2658             }
2659             if(length==prev) {
2660                 break;  // No match from prev.
2661             }
2662         } while((prev=length)>0);
2663         if(prev<minSpanStart) {
2664             return prev;
2665         } else {
2666             return minSpanStart;
2667         }
2668     }
2669 }
2670 
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2671 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2672                                 USetSpanCondition spanCondition) {
2673     const UnicodeSet &realSet(set.getSet());
2674     if(!set.hasStrings()) {
2675         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2676             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2677         }
2678 
2679         UChar32 c;
2680         int32_t start=0, prev;
2681         while((prev=start)<length) {
2682             U8_NEXT_OR_FFFD(s, start, length, c);
2683             if(realSet.contains(c)!=spanCondition) {
2684                 break;
2685             }
2686         }
2687         return prev;
2688     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2689         UnicodeSetWithStringsIterator iter(set);
2690         UChar32 c;
2691         int32_t start, next;
2692         for(start=next=0; start<length;) {
2693             U8_NEXT_OR_FFFD(s, next, length, c);
2694             if(realSet.contains(c)) {
2695                 break;
2696             }
2697             const char *s8;
2698             int32_t length8;
2699             iter.reset();
2700             while((s8=iter.nextUTF8(length8))!=NULL) {
2701                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2702                     // spanNeedsStrings=TRUE;
2703                     return start;
2704                 }
2705             }
2706             start=next;
2707         }
2708         return start;
2709     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2710         UnicodeSetWithStringsIterator iter(set);
2711         UChar32 c;
2712         int32_t start, next, maxSpanLimit=0;
2713         for(start=next=0; start<length;) {
2714             U8_NEXT_OR_FFFD(s, next, length, c);
2715             if(!realSet.contains(c)) {
2716                 next=start;  // Do not span this single, not-contained code point.
2717             }
2718             const char *s8;
2719             int32_t length8;
2720             iter.reset();
2721             while((s8=iter.nextUTF8(length8))!=NULL) {
2722                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2723                     // spanNeedsStrings=TRUE;
2724                     int32_t matchLimit=start+length8;
2725                     if(matchLimit==length) {
2726                         return length;
2727                     }
2728                     if(spanCondition==USET_SPAN_CONTAINED) {
2729                         // Iterate for the shortest match at each position.
2730                         // Recurse for each but the shortest match.
2731                         if(next==start) {
2732                             next=matchLimit;  // First match from start.
2733                         } else {
2734                             if(matchLimit<next) {
2735                                 // Remember shortest match from start for iteration.
2736                                 int32_t temp=next;
2737                                 next=matchLimit;
2738                                 matchLimit=temp;
2739                             }
2740                             // Recurse for non-shortest match from start.
2741                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2742                                                                 USET_SPAN_CONTAINED);
2743                             if((matchLimit+spanLength)>maxSpanLimit) {
2744                                 maxSpanLimit=matchLimit+spanLength;
2745                                 if(maxSpanLimit==length) {
2746                                     return length;
2747                                 }
2748                             }
2749                         }
2750                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2751                         if(matchLimit>next) {
2752                             // Remember longest match from start.
2753                             next=matchLimit;
2754                         }
2755                     }
2756                 }
2757             }
2758             if(next==start) {
2759                 break;  // No match from start.
2760             }
2761             start=next;
2762         }
2763         if(start>maxSpanLimit) {
2764             return start;
2765         } else {
2766             return maxSpanLimit;
2767         }
2768     }
2769 }
2770 
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2771 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2772                                     USetSpanCondition spanCondition) {
2773     if(length==0) {
2774         return 0;
2775     }
2776     const UnicodeSet &realSet(set.getSet());
2777     if(!set.hasStrings()) {
2778         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2779             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2780         }
2781 
2782         UChar32 c;
2783         int32_t prev=length;
2784         do {
2785             U8_PREV_OR_FFFD(s, 0, length, c);
2786             if(realSet.contains(c)!=spanCondition) {
2787                 break;
2788             }
2789         } while((prev=length)>0);
2790         return prev;
2791     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2792         UnicodeSetWithStringsIterator iter(set);
2793         UChar32 c;
2794         int32_t prev=length;
2795         do {
2796             U8_PREV_OR_FFFD(s, 0, length, c);
2797             if(realSet.contains(c)) {
2798                 break;
2799             }
2800             const char *s8;
2801             int32_t length8;
2802             iter.reset();
2803             while((s8=iter.nextUTF8(length8))!=NULL) {
2804                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2805                     // spanNeedsStrings=TRUE;
2806                     return prev;
2807                 }
2808             }
2809         } while((prev=length)>0);
2810         return prev;
2811     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2812         UnicodeSetWithStringsIterator iter(set);
2813         UChar32 c;
2814         int32_t prev=length, minSpanStart=length;
2815         do {
2816             U8_PREV_OR_FFFD(s, 0, length, c);
2817             if(!realSet.contains(c)) {
2818                 length=prev;  // Do not span this single, not-contained code point.
2819             }
2820             const char *s8;
2821             int32_t length8;
2822             iter.reset();
2823             while((s8=iter.nextUTF8(length8))!=NULL) {
2824                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2825                     // spanNeedsStrings=TRUE;
2826                     int32_t matchStart=prev-length8;
2827                     if(matchStart==0) {
2828                         return 0;
2829                     }
2830                     if(spanCondition==USET_SPAN_CONTAINED) {
2831                         // Iterate for the shortest match at each position.
2832                         // Recurse for each but the shortest match.
2833                         if(length==prev) {
2834                             length=matchStart;  // First match from prev.
2835                         } else {
2836                             if(matchStart>length) {
2837                                 // Remember shortest match from prev for iteration.
2838                                 int32_t temp=length;
2839                                 length=matchStart;
2840                                 matchStart=temp;
2841                             }
2842                             // Recurse for non-shortest match from prev.
2843                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2844                                                                    USET_SPAN_CONTAINED);
2845                             if(spanStart<minSpanStart) {
2846                                 minSpanStart=spanStart;
2847                                 if(minSpanStart==0) {
2848                                     return 0;
2849                                 }
2850                             }
2851                         }
2852                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2853                         if(matchStart<length) {
2854                             // Remember longest match from prev.
2855                             length=matchStart;
2856                         }
2857                     }
2858                 }
2859             }
2860             if(length==prev) {
2861                 break;  // No match from prev.
2862             }
2863         } while((prev=length)>0);
2864         if(prev<minSpanStart) {
2865             return prev;
2866         } else {
2867             return minSpanStart;
2868         }
2869     }
2870 }
2871 
// Bit flags selecting which spans are performed and compared.
// Each group has one flag per option plus a mask covering the whole group.
enum {
    // String encodings.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original set and/or its complement.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Span directions.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // "Contained" span conditions.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Everything.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2892 
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2893 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2894     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2895 }
2896 
slen(const void * s,UBool isUTF16)2897 static inline int32_t slen(const void *s, UBool isUTF16) {
2898     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2899 }
2900 
2901 /*
2902  * Count spans on a string with the method according to type and set the span limits.
2903  * The set may be the complement of the original.
2904  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2905  * according to the expected number of spans.
2906  * Sets typeName to an empty string if there is no such type.
2907  * Returns -1 if the span option is filtered out.
2908  */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2909 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2910                         const void *s, int32_t length, UBool isUTF16,
2911                         uint32_t whichSpans,
2912                         int type, const char *&typeName,
2913                         int32_t limits[], int32_t limitsCapacity,
2914                         int32_t expectCount) {
2915     const UnicodeSet &realSet(set.getSet());
2916     int32_t start, count;
2917     USetSpanCondition spanCondition, firstSpanCondition, contained;
2918     UBool isForward;
2919 
2920     if(type<0 || 7<type) {
2921         typeName="";
2922         return 0;
2923     }
2924 
2925     static const char *const typeNames16[]={
2926         "contains", "contains(LM)",
2927         "span", "span(LM)",
2928         "containsBack", "containsBack(LM)",
2929         "spanBack", "spanBack(LM)"
2930     };
2931 
2932     static const char *const typeNames8[]={
2933         "containsUTF8", "containsUTF8(LM)",
2934         "spanUTF8", "spanUTF8(LM)",
2935         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2936         "spanBackUTF8", "spanBackUTF8(LM)"
2937     };
2938 
2939     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2940 
2941     // filter span options
2942     if(type<=3) {
2943         // span forward
2944         if((whichSpans&SPAN_FWD)==0) {
2945             return -1;
2946         }
2947         isForward=TRUE;
2948     } else {
2949         // span backward
2950         if((whichSpans&SPAN_BACK)==0) {
2951             return -1;
2952         }
2953         isForward=FALSE;
2954     }
2955     if((type&1)==0) {
2956         // use USET_SPAN_CONTAINED
2957         if((whichSpans&SPAN_CONTAINED)==0) {
2958             return -1;
2959         }
2960         contained=USET_SPAN_CONTAINED;
2961     } else {
2962         // use USET_SPAN_SIMPLE
2963         if((whichSpans&SPAN_SIMPLE)==0) {
2964             return -1;
2965         }
2966         contained=USET_SPAN_SIMPLE;
2967     }
2968 
2969     // Default first span condition for going forward with an uncomplemented set.
2970     spanCondition=USET_SPAN_NOT_CONTAINED;
2971     if(isComplement) {
2972         spanCondition=invertSpanCondition(spanCondition, contained);
2973     }
2974 
2975     // First span condition for span(), used to terminate the spanBack() iteration.
2976     firstSpanCondition=spanCondition;
2977 
2978     // spanBack(): Its initial span condition is span()'s last span condition,
2979     // which is the opposite of span()'s first span condition
2980     // if we expect an even number of spans.
2981     // (The loop inverts spanCondition (expectCount-1) times
2982     // before the expectCount'th span() call.)
2983     // If we do not compare forward and backward directions, then we do not have an
2984     // expectCount and just start with firstSpanCondition.
2985     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2986         spanCondition=invertSpanCondition(spanCondition, contained);
2987     }
2988 
2989     count=0;
2990     switch(type) {
2991     case 0:
2992     case 1:
2993         start=0;
2994         if(length<0) {
2995             length=slen(s, isUTF16);
2996         }
2997         for(;;) {
2998             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2999                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
3000             if(count<limitsCapacity) {
3001                 limits[count]=start;
3002             }
3003             ++count;
3004             if(start>=length) {
3005                 break;
3006             }
3007             spanCondition=invertSpanCondition(spanCondition, contained);
3008         }
3009         break;
3010     case 2:
3011     case 3:
3012         start=0;
3013         for(;;) {
3014             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
3015                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3016             if(count<limitsCapacity) {
3017                 limits[count]=start;
3018             }
3019             ++count;
3020             if(length>=0 ? start>=length :
3021                            isUTF16 ? ((const UChar *)s)[start]==0 :
3022                                      ((const char *)s)[start]==0
3023             ) {
3024                 break;
3025             }
3026             spanCondition=invertSpanCondition(spanCondition, contained);
3027         }
3028         break;
3029     case 4:
3030     case 5:
3031         if(length<0) {
3032             length=slen(s, isUTF16);
3033         }
3034         for(;;) {
3035             ++count;
3036             if(count<=limitsCapacity) {
3037                 limits[limitsCapacity-count]=length;
3038             }
3039             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3040                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3041             if(length==0 && spanCondition==firstSpanCondition) {
3042                 break;
3043             }
3044             spanCondition=invertSpanCondition(spanCondition, contained);
3045         }
3046         if(count<limitsCapacity) {
3047             memmove(limits, limits+(limitsCapacity-count), count*4);
3048         }
3049         break;
3050     case 6:
3051     case 7:
3052         for(;;) {
3053             ++count;
3054             if(count<=limitsCapacity) {
3055                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3056             }
3057             // Note: Length<0 is tested only for the first spanBack().
3058             // If we wanted to keep length<0 for all spanBack()s, we would have to
3059             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3060             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3061                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3062             if(length==0 && spanCondition==firstSpanCondition) {
3063                 break;
3064             }
3065             spanCondition=invertSpanCondition(spanCondition, contained);
3066         }
3067         if(count<limitsCapacity) {
3068             memmove(limits, limits+(limitsCapacity-count), count*4);
3069         }
3070         break;
3071     default:
3072         typeName="";
3073         return -1;
3074     }
3075 
3076     return count;
3077 }
3078 
// Indexes of the sets to be tested.
// An odd index means that the set is the complement of the one before it.
enum {
    SLOW,       // index 0
    SLOW_NOT,   // index 1: complement of SLOW
    FAST,       // index 2
    FAST_NOT,   // index 3: complement of FAST
    SET_COUNT
};

// Names for error messages, indexed by the set constants above.
static const char *const setNames[SET_COUNT]={
    "slow", "slow.not",
    "fast", "fast.not"
};
3094 
3095 /*
3096  * Verify that we get the same results whether we look at text with contains(),
3097  * span() or spanBack(), using unfrozen or frozen versions of the set,
3098  * and using the set or its complement (switching the spanConditions accordingly).
3099  * The latter verifies that
3100  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3101  *
3102  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3103  * or returned to the caller (with an input expectCount<0).
3104  */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3105 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3106                               const void *s, int32_t length, UBool isUTF16,
3107                               uint32_t whichSpans,
3108                               int32_t expectLimits[], int32_t &expectCount,
3109                               const char *testName, int32_t index) {
3110     int32_t limits[500];
3111     int32_t limitsCount;
3112     int i, j;
3113 
3114     const char *typeName;
3115     int type;
3116 
3117     for(i=0; i<SET_COUNT; ++i) {
3118         if((i&1)==0) {
3119             // Even-numbered sets are original, uncomplemented sets.
3120             if((whichSpans&SPAN_SET)==0) {
3121                 continue;
3122             }
3123         } else {
3124             // Odd-numbered sets are complemented.
3125             if((whichSpans&SPAN_COMPLEMENT)==0) {
3126                 continue;
3127             }
3128         }
3129         for(type=0;; ++type) {
3130             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3131                                  s, length, isUTF16,
3132                                  whichSpans,
3133                                  type, typeName,
3134                                  limits, UPRV_LENGTHOF(limits), expectCount);
3135             if(typeName[0]==0) {
3136                 break; // All types tried.
3137             }
3138             if(limitsCount<0) {
3139                 continue; // Span option filtered out.
3140             }
3141             if(expectCount<0) {
3142                 expectCount=limitsCount;
3143                 if(limitsCount>UPRV_LENGTHOF(limits)) {
3144                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3145                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3146                     return;
3147                 }
3148                 memcpy(expectLimits, limits, limitsCount*4);
3149             } else if(limitsCount!=expectCount) {
3150                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3151                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3152             } else {
3153                 for(j=0; j<limitsCount; ++j) {
3154                     if(limits[j]!=expectLimits[j]) {
3155                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3156                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3157                               j, (long)limits[j], (long)expectLimits[j]);
3158                         break;
3159                     }
3160                 }
3161             }
3162         }
3163     }
3164 
3165     // Compare span() with containsAll()/containsNone(),
3166     // but only if we have expectLimits[] from the uncomplemented set.
3167     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3168         const UChar *s16=(const UChar *)s;
3169         UnicodeString string;
3170         int32_t prev=0, limit, length;
3171         for(i=0; i<expectCount; ++i) {
3172             limit=expectLimits[i];
3173             length=limit-prev;
3174             if(length>0) {
3175                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3176                 if(i&1) {
3177                     if(!sets[SLOW]->getSet().containsAll(string)) {
3178                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3179                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3180                         return;
3181                     }
3182                     if(!sets[FAST]->getSet().containsAll(string)) {
3183                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3184                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3185                         return;
3186                     }
3187                 } else {
3188                     if(!sets[SLOW]->getSet().containsNone(string)) {
3189                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3190                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3191                         return;
3192                     }
3193                     if(!sets[FAST]->getSet().containsNone(string)) {
3194                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3195                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3196                         return;
3197                     }
3198                 }
3199             }
3200             prev=limit;
3201         }
3202     }
3203 }
3204 
3205 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3206 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3207                               const void *s, int32_t length, UBool isUTF16,
3208                               uint32_t whichSpans,
3209                               const char *testName, int32_t index) {
3210     int32_t expectLimits[500];
3211     int32_t expectCount=-1;
3212     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3213 }
3214 
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3215 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3216     UChar c, c2;
3217 
3218     if(length>=0) {
3219         while(length>0) {
3220             c=*s++;
3221             --length;
3222             if(0xd800<=c && c<0xe000) {
3223                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3224                     return TRUE;
3225                 }
3226                 --length;
3227             }
3228         }
3229     } else {
3230         while((c=*s++)!=0) {
3231             if(0xd800<=c && c<0xe000) {
3232                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3233                     return TRUE;
3234                 }
3235             }
3236         }
3237     }
3238     return FALSE;
3239 }
3240 
3241 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3242 // unless either UTF is turned off in whichSpans.
3243 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3244 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3245 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3246                                       const UChar *s16, int32_t length16,
3247                                       uint32_t whichSpans,
3248                                       const char *testName, int32_t index) {
3249     int32_t expectLimits[500];
3250     int32_t expectCount;
3251 
3252     expectCount=-1;  // Get expectLimits[] from testSpan().
3253 
3254     if((whichSpans&SPAN_UTF16)!=0) {
3255         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3256     }
3257     if((whichSpans&SPAN_UTF8)==0) {
3258         return;
3259     }
3260 
3261     // Convert s16[] and expectLimits[] to UTF-8.
3262     uint8_t s8[3000];
3263     int32_t offsets[3000];
3264 
3265     const UChar *s16Limit=s16+length16;
3266     char *t=(char *)s8;
3267     char *tLimit=t+sizeof(s8);
3268     int32_t *o=offsets;
3269     UErrorCode errorCode=U_ZERO_ERROR;
3270 
3271     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3272     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3273     if(U_FAILURE(errorCode)) {
3274         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3275               testName, (long)index, u_errorName(errorCode));
3276         ucnv_resetFromUnicode(utf8Cnv);
3277         return;
3278     }
3279     int32_t length8=(int32_t)(t-(char *)s8);
3280 
3281     // Convert expectLimits[].
3282     int32_t i, j, expect;
3283     for(i=j=0; i<expectCount; ++i) {
3284         expect=expectLimits[i];
3285         if(expect==length16) {
3286             expectLimits[i]=length8;
3287         } else {
3288             while(offsets[j]<expect) {
3289                 ++j;
3290             }
3291             expectLimits[i]=j;
3292         }
3293     }
3294 
3295     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3296 }
3297 
nextCodePoint(UChar32 c)3298 static UChar32 nextCodePoint(UChar32 c) {
3299     // Skip some large and boring ranges.
3300     switch(c) {
3301     case 0x3441:
3302         return 0x4d7f;
3303     case 0x5100:
3304         return 0x9f00;
3305     case 0xb040:
3306         return 0xd780;
3307     case 0xe041:
3308         return 0xf8fe;
3309     case 0x10100:
3310         return 0x20000;
3311     case 0x20041:
3312         return 0xe0000;
3313     case 0xe0101:
3314         return 0x10fffd;
3315     default:
3316         return c+1;
3317     }
3318 }
3319 
3320 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3321 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3322     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3323     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3324     // Skip the UTF-8 part of the test - if the string contains surrogates -
3325     // because it is likely to produce a different result.
3326     UBool inconsistentSurrogates=
3327             (!(sets[0]->getSet().contains(0xfffd) ?
3328                sets[0]->getSet().contains(0xd800, 0xdfff) :
3329                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3330              sets[0]->hasStringsWithSurrogates());
3331 
3332     UChar s[1000];
3333     int32_t length=0;
3334     uint32_t localWhichSpans;
3335 
3336     UChar32 c, first;
3337     for(first=c=0;; c=nextCodePoint(c)) {
3338         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3339             localWhichSpans=whichSpans;
3340             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3341                 localWhichSpans&=~SPAN_UTF8;
3342             }
3343             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3344             if(c>0x10ffff) {
3345                 break;
3346             }
3347             length=0;
3348             first=c;
3349         }
3350         U16_APPEND_UNSAFE(s, length, c);
3351     }
3352 }
3353 
3354 // Test with a particular, interesting string.
3355 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3356 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3357     static const UChar s[]={
3358         0x61, 0x62, 0x20,                       // Latin, space
3359         0x3b1, 0x3b2, 0x3b3,                    // Greek
3360         0xd900,                                 // lead surrogate
3361         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3362         0xdc05,                                 // trail surrogate
3363         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3364         0xd900, 0xdc05,                         // unassigned supplementary
3365         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3366         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3367         0                                       // NUL
3368     };
3369 
3370     if((whichSpans&SPAN_UTF16)==0) {
3371         return;
3372     }
3373     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3374     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3375 }
3376 
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3377 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3378     static const char s[]={
3379         "abc"                                   // Latin
3380 
3381         /* trail byte in lead position */
3382         "\x80"
3383 
3384         " "                                     // space
3385 
3386         /* truncated multi-byte sequences */
3387         "\xd0"
3388         "\xe0"
3389         "\xe1"
3390         "\xed"
3391         "\xee"
3392         "\xf0"
3393         "\xf1"
3394         "\xf4"
3395         "\xf8"
3396         "\xfc"
3397 
3398         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3399 
3400         /* trail byte in lead position */
3401         "\x80"
3402 
3403         "\xe0\x80"
3404         "\xe0\xa0"
3405         "\xe1\x80"
3406         "\xed\x80"
3407         "\xed\xa0"
3408         "\xee\x80"
3409         "\xf0\x80"
3410         "\xf0\x90"
3411         "\xf1\x80"
3412         "\xf4\x80"
3413         "\xf4\x90"
3414         "\xf8\x80"
3415         "\xfc\x80"
3416 
3417         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3418 
3419         /* trail byte in lead position */
3420         "\x80"
3421 
3422         "\xf0\x80\x80"
3423         "\xf0\x90\x80"
3424         "\xf1\x80\x80"
3425         "\xf4\x80\x80"
3426         "\xf4\x90\x80"
3427         "\xf8\x80\x80"
3428         "\xfc\x80\x80"
3429 
3430         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3431 
3432         /* trail byte in lead position */
3433         "\x80"
3434 
3435         "\xf8\x80\x80\x80"
3436         "\xfc\x80\x80\x80"
3437 
3438         "\xF1\x90\x80\x85"                      // unassigned supplementary
3439 
3440         /* trail byte in lead position */
3441         "\x80"
3442 
3443         "\xfc\x80\x80\x80\x80"
3444 
3445         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3446 
3447         /* trail byte in lead position */
3448         "\x80"
3449 
3450         /* complete sequences but non-shortest forms or out of range etc. */
3451         "\xc0\x80"
3452         "\xe0\x80\x80"
3453         "\xed\xa0\x80"
3454         "\xf0\x80\x80\x80"
3455         "\xf4\x90\x80\x80"
3456         "\xf8\x80\x80\x80\x80"
3457         "\xfc\x80\x80\x80\x80\x80"
3458         "\xfe"
3459         "\xff"
3460 
3461         /* trail byte in lead position */
3462         "\x80"
3463 
3464         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3465     };
3466 
3467     if((whichSpans&SPAN_UTF8)==0) {
3468         return;
3469     }
3470     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3471     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3472 }
3473 
// Take a set of span options and multiply them so that
// each portion only has one of the options a, b and c.
// If b==0, then the set of options is just modified with mask and a.
// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
//
// Each of the first whichSpansCount entries is ANDed with mask and then ORed
// with a in place; the b and c variants are written to the following one or
// two groups of whichSpansCount entries. The caller must provide room for
// them. Returns the new number of entries (1x, 2x or 3x whichSpansCount).
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // c is only honored when b is also given.
    const int32_t variants= (b==0) ? 1 : ((c==0) ? 2 : 3);

    uint32_t *const groupB=whichSpans+whichSpansCount;
    uint32_t *const groupC=groupB+whichSpansCount;
    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(variants>=2) {
            groupB[k]=kept|b;
        }
        if(variants==3) {
            groupC[k]=kept|c;
        }
    }
    return variants*whichSpansCount;
}
3496 
3497 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3498 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3499 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3500 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3501 
TestSpan()3502 void UnicodeSetTest::TestSpan() {
3503     // "[...]" is a UnicodeSet pattern.
3504     // "*" performs tests on all Unicode code points and on a selection of
3505     //   malformed UTF-8/16 strings.
3506     // "-options" limits the scope of testing for the current set.
3507     //   By default, the test verifies that equivalent boundaries are found
3508     //   for UTF-16 and UTF-8, going forward and backward,
3509     //   alternating USET_SPAN_NOT_CONTAINED with
3510     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3511     //   Single-character options:
3512     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3513     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3514     //          or the set contains strings with unpaired surrogates
3515     //          which do not translate to valid UTF-8.
3516     //     c -- set.span() and set.complement().span() boundaries may differ.
3517     //          Cause: Set strings are not complemented.
3518     //     b -- span() and spanBack() boundaries may differ.
3519     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3520     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3521     //          match with non-overlapping substrings.
3522     //          For example, with a set containing "ab" and "ba",
3523     //          span() of "aba" yields boundaries { 0, 2, 3 }
3524     //          because the initial "ab" matches from 0 to 2,
3525     //          while spanBack() yields boundaries { 0, 1, 3 }
3526     //          because the final "ba" matches from 1 to 3.
3527     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3528     //          Cause: Strings in the set overlap, and a longer match may
3529     //          require a sequence including non-longest substrings.
3530     //          For example, with a set containing "ab", "abc" and "cd",
3531     //          span(contained) of "abcd" spans the entire string
3532     //          but span(longest match) only spans the first 3 characters.
3533     //   Each "-options" first resets all options and then applies the specified options.
3534     //   A "-" without options resets the options.
3535     //   The options are also reset for each new set.
3536     // Other strings will be spanned.
3537     static const char *const testdata[]={
3538         "[:ID_Continue:]",
3539         "*",
3540         "[:White_Space:]",
3541         "*",
3542         "[]",
3543         "*",
3544         "[\\u0000-\\U0010FFFF]",
3545         "*",
3546         "[\\u0000\\u0080\\u0800\\U00010000]",
3547         "*",
3548         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3549         "*",
3550         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3551         "-c",
3552         "*",
3553         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3554         "-c",
3555         "*",
3556 
3557         // Overlapping strings cause overlapping attempts to match.
3558         "[x{xy}{xya}{axy}{ax}]",
3559         "-cl",
3560 
3561         // More repetitions of "xya" would take too long with the recursive
3562         // reference implementation.
3563         // containsAll()=FALSE
3564         // test_string 0x14
3565         "xx"
3566         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3567         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3568         "xyaxyaxyaxya"
3569         "xx"
3570         "xyaxyaxyaxya"  // span() ends here.
3571         "aaa",
3572 
3573         // containsAll()=TRUE
3574         // test_string 0x15
3575         "xx"
3576         "xyaxyaxyaxya"
3577         "xx"
3578         "xyaxyaxyaxya"
3579         "xx"
3580         "xyaxyaxyaxy",
3581 
3582         "-bc",
3583         // test_string 0x17
3584         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3585         "-c",
3586         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3587         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3588         "-",
3589         "byaya",     // span() -> { 5 }
3590         "byay",      // span() -> { 4 }
3591         "bya",       // span() -> { 3 }
3592 
3593         // span(longest match) will not span the whole string.
3594         "[a{ab}{bc}]",
3595         "-cl",
3596         // test_string 0x21
3597         "abc",
3598 
3599         "[a{ab}{abc}{cd}]",
3600         "-cl",
3601         "acdabcdabccd",
3602 
3603         // spanBack(longest match) will not span the whole string.
3604         "[c{ab}{bc}]",
3605         "-cl",
3606         "abc",
3607 
3608         "[d{cd}{bcd}{ab}]",
3609         "-cl",
3610         "abbcdabcdabd",
3611 
3612         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3613         // and UTF-8 trail bytes.
3614         // Copies of above test sets and strings, but transliterated to have
3615         // different code points with similar trail units.
3616         // Previous: a      b         c            d
3617         // Unicode:  042B   30AB      200AB        204AB
3618         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3619         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3620         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3621         "-cl",
3622         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3623 
3624         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3625         "-cl",
3626         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3627 
3628         // Stress bookkeeping and recursion.
3629         // The following strings are barely doable with the recursive
3630         // reference implementation.
3631         // The not-contained character at the end prevents an early exit from the span().
3632         "[b{bb}]",
3633         "-c",
3634         // test_string 0x33
3635         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3636         // On complement sets, span() and spanBack() get different results
3637         // because b is not in the complement set and there is an odd number of b's
3638         // in the test string.
3639         "-bc",
3640         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3641 
3642         // Test with set strings with an initial or final code point span
3643         // longer than 254.
3644         "[a{" _64_a _64_a _64_a _64_a "b}"
3645           "{a" _64_b _64_b _64_b _64_b "}]",
3646         "-c",
3647         _64_a _64_a _64_a _63_a "b",
3648         _64_a _64_a _64_a _64_a "b",
3649         _64_a _64_a _64_a _64_a "aaaabbbb",
3650         "a" _64_b _64_b _64_b _63_b,
3651         "a" _64_b _64_b _64_b _64_b,
3652         "aaaabbbb" _64_b _64_b _64_b _64_b,
3653 
3654         // Test with strings containing unpaired surrogates.
3655         // They are not representable in UTF-8, and a leading trail surrogate
3656         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3657         // U+20001 == \\uD840\\uDC01
3658         // U+20400 == \\uD841\\uDC00
3659         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3660         "-8cl",
3661         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3662     };
3663     uint32_t whichSpans[96]={ SPAN_ALL };
3664     int32_t whichSpansCount=1;
3665 
3666     UnicodeSet *sets[SET_COUNT]={ NULL };
3667     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3668 
3669     char testName[1024];
3670     char *testNameLimit=testName;
3671 
3672     int32_t i, j;
3673     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3674         const char *s=testdata[i];
3675         if(s[0]=='[') {
3676             // Create new test sets from this pattern.
3677             for(j=0; j<SET_COUNT; ++j) {
3678                 delete sets_with_str[j];
3679                 delete sets[j];
3680             }
3681             UErrorCode errorCode=U_ZERO_ERROR;
3682             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3683             if(U_FAILURE(errorCode)) {
3684                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3685                 break;
3686             }
3687             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3688             sets[SLOW_NOT]->complement();
3689             // Intermediate set: Test cloning of a frozen set.
3690             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3691             fast->freeze();
3692             sets[FAST]=(UnicodeSet *)fast->clone();
3693             delete fast;
3694             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3695             fastNot->freeze();
3696             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3697             delete fastNot;
3698 
3699             for(j=0; j<SET_COUNT; ++j) {
3700                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3701             }
3702 
3703             strcpy(testName, s);
3704             testNameLimit=strchr(testName, 0);
3705             *testNameLimit++=':';
3706             *testNameLimit=0;
3707 
3708             whichSpans[0]=SPAN_ALL;
3709             whichSpansCount=1;
3710         } else if(s[0]=='-') {
3711             whichSpans[0]=SPAN_ALL;
3712             whichSpansCount=1;
3713 
3714             while(*++s!=0) {
3715                 switch(*s) {
3716                 case 'c':
3717                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3718                                                    ~SPAN_POLARITY,
3719                                                    SPAN_SET,
3720                                                    SPAN_COMPLEMENT,
3721                                                    0);
3722                     break;
3723                 case 'b':
3724                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3725                                                    ~SPAN_DIRS,
3726                                                    SPAN_FWD,
3727                                                    SPAN_BACK,
3728                                                    0);
3729                     break;
3730                 case 'l':
3731                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3732                     // USET_SPAN_SIMPLE only FWD, and separately
3733                     // USET_SPAN_SIMPLE only BACK
3734                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3735                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3736                                                    SPAN_DIRS|SPAN_CONTAINED,
3737                                                    SPAN_FWD|SPAN_SIMPLE,
3738                                                    SPAN_BACK|SPAN_SIMPLE);
3739                     break;
3740                 case '8':
3741                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3742                                                    ~SPAN_UTFS,
3743                                                    SPAN_UTF16,
3744                                                    SPAN_UTF8,
3745                                                    0);
3746                     break;
3747                 default:
3748                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3749                     break;
3750                 }
3751             }
3752         } else if(0==strcmp(s, "*")) {
3753             strcpy(testNameLimit, "bad_string");
3754             for(j=0; j<whichSpansCount; ++j) {
3755                 if(whichSpansCount>1) {
3756                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3757                             "%%0x%3x",
3758                             whichSpans[j]);
3759                 }
3760                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3761                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3762             }
3763 
3764             strcpy(testNameLimit, "contents");
3765             for(j=0; j<whichSpansCount; ++j) {
3766                 if(whichSpansCount>1) {
3767                     sprintf(testNameLimit+8 /* strlen("contents") */,
3768                             "%%0x%3x",
3769                             whichSpans[j]);
3770                 }
3771                 testSpanContents(sets_with_str, whichSpans[j], testName);
3772             }
3773         } else {
3774             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3775             strcpy(testNameLimit, "test_string");
3776             for(j=0; j<whichSpansCount; ++j) {
3777                 if(whichSpansCount>1) {
3778                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3779                             "%%0x%3x",
3780                             whichSpans[j]);
3781                 }
3782                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3783             }
3784         }
3785     }
3786     for(j=0; j<SET_COUNT; ++j) {
3787         delete sets_with_str[j];
3788         delete sets[j];
3789     }
3790 }
3791 
3792 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3793 void UnicodeSetTest::TestStringSpan() {
3794     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3795     static const char *const string=
3796         "xx"
3797         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3798         "xx"
3799         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3800         "xx"
3801         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3802         "aaaa";
3803 
3804     UErrorCode errorCode=U_ZERO_ERROR;
3805     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3806     UnicodeSet set(pattern16, errorCode);
3807     if(U_FAILURE(errorCode)) {
3808         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3809         return;
3810     }
3811 
3812     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3813 
3814     if(set.containsAll(string16)) {
3815         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3816     }
3817 
3818     // Remove trailing "aaaa".
3819     string16.truncate(string16.length()-4);
3820     if(!set.containsAll(string16)) {
3821         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3822     }
3823 
3824     string16=UNICODE_STRING_SIMPLE("byayaxya");
3825     const UChar *s16=string16.getBuffer();
3826     int32_t length16=string16.length();
3827     (void)length16;   // Suppress set but not used warning.
3828     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3829         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3830         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3831         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3832         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3833         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3834     ) {
3835         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3836     }
3837 
3838     pattern="[a{ab}{abc}{cd}]";
3839     pattern16=UnicodeString(pattern, -1, US_INV);
3840     set.applyPattern(pattern16, errorCode);
3841     if(U_FAILURE(errorCode)) {
3842         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3843         return;
3844     }
3845     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3846     s16=string16.getBuffer();
3847     length16=string16.length();
3848     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3849         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3850         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3851     ) {
3852         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3853     }
3854 
3855     pattern="[d{cd}{bcd}{ab}]";
3856     pattern16=UnicodeString(pattern, -1, US_INV);
3857     set.applyPattern(pattern16, errorCode).freeze();
3858     if(U_FAILURE(errorCode)) {
3859         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3860         return;
3861     }
3862     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3863     s16=string16.getBuffer();
3864     length16=string16.length();
3865     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3866         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3867         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3868     ) {
3869         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3870     }
3871 }
3872 
3873 /**
3874  * Including collationroot.h fails here with
3875 1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
3876  *  .. so, we skip this test on Windows.
3877  *
3878  * the cause is that  intltest builds with /Za which disables language extensions - which means
3879  *  windows header files can't be used.
3880  */
3881 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3882 #include "collationroot.h"
3883 #include "collationtailoring.h"
3884 #endif
3885 
TestUCAUnsafeBackwards()3886 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3887 #if U_PLATFORM_HAS_WIN32_API
3888     infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3889 #elif !UCONFIG_NO_COLLATION
3890     UErrorCode errorCode = U_ZERO_ERROR;
3891 
3892     // Get the unsafeBackwardsSet
3893     const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3894     if(U_FAILURE(errorCode)) {
3895       dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3896       return;
3897     }
3898     //const UVersionInfo &version = rootEntry->tailoring->version;
3899     const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3900 
3901     checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3902 
3903     if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3904         // simple test case
3905         // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3906         // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3907         UnicodeSet surrogates;
3908         surrogates.add(0xd83a);  // a lead surrogate
3909         surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
3910         UnicodeString pat;
3911         surrogates.toPattern(pat, FALSE);  // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3912         // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3913         // so that at least one type of surrogate code points are escaped,
3914         // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3915         errorCode = U_ZERO_ERROR;
3916         UnicodeSet s2;
3917         s2.applyPattern(pat, errorCode);  // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3918         if(U_FAILURE(errorCode)) {
3919             errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3920         } else {
3921             checkEqual(surrogates, s2, "surrogates to/from pattern");
3922         }
3923         // This occurs in the UCA unsafe-backwards set.
3924         checkRoundTrip(*unsafeBackwardSet);
3925     }
3926 #endif
3927 }
3928