• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ********************************************************************************
3 *   Copyright (C) 1999-2010 International Business Machines Corporation and
4 *   others. All Rights Reserved.
5 ********************************************************************************
6 *   Date        Name        Description
7 *   10/20/99    alan        Creation.
8 *   03/22/2000  Madhu       Added additional tests
9 ********************************************************************************
10 */
11 
12 #include <stdio.h>
13 
14 #include <string.h>
15 #include "unicode/utypes.h"
16 #include "usettest.h"
17 #include "unicode/ucnv.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/usetiter.h"
21 #include "unicode/ustring.h"
22 #include "unicode/parsepos.h"
23 #include "unicode/symtable.h"
24 #include "unicode/uversion.h"
25 #include "hash.h"
26 
// Element count of a true array (not a pointer), cast to int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array) / sizeof((array)[0]))
28 
// Reports, through dataerrln(), the current file, line and ICU error name
// when `status` holds a failure code; does nothing otherwise. The macro only
// reports: control continues with the statement that follows it.
#define TEST_ASSERT_SUCCESS(status) {                           \
    if (U_FAILURE(status)) {                                    \
        dataerrln("fail in file \"%s\", line %d: \"%s\"",       \
                  __FILE__, __LINE__, u_errorName(status));     \
    }                                                           \
}
32 
// Reports, through dataerrln(), the current file and line when `expr`
// evaluates to false; does nothing otherwise. The macro only reports:
// control continues with the statement that follows it.
#define TEST_ASSERT(expr) {                                                 \
    if (!(expr)) {                                                          \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__);      \
    }                                                                       \
}
35 
/**
 * Appends the escaped pattern of a UnicodeSet to a string, which lets the
 * tests in this file write (UnicodeString)"message: " + set.
 *
 * @param left text that precedes the set's pattern
 * @param set  set whose pattern (from toPattern()) is escaped and appended
 * @return left followed by UnicodeSetTest::escape() of the set's pattern
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and returns a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
41 
// Expands to one `case` label of the switch in runIndexedTest(): stores the
// stringized test name in `name` and, when `exec` is set, logs a banner
// followed by an empty line and then calls the test. Relies on `name`,
// `exec` and logln() being in scope at the point of use; the caller supplies
// the trailing semicolon.
#define CASE(id, test)                  \
    case id:                            \
        name = #test;                   \
        if (exec) {                     \
            logln(#test "---");         \
            logln();                    \
            test();                     \
        }                               \
        break
50 
UnicodeSetTest()51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52 }
53 
openUTF8Converter()54 UConverter *UnicodeSetTest::openUTF8Converter() {
55     if(utf8Cnv==NULL) {
56         UErrorCode errorCode=U_ZERO_ERROR;
57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
58     }
59     return utf8Cnv;
60 }
61 
~UnicodeSetTest()62 UnicodeSetTest::~UnicodeSetTest() {
63     ucnv_close(utf8Cnv);
64 }
65 
66 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68                                const char* &name, char* /*par*/) {
69     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
70     switch (index) {
71         CASE(0,TestPatterns);
72         CASE(1,TestAddRemove);
73         CASE(2,TestCategories);
74         CASE(3,TestCloneEqualHash);
75         CASE(4,TestMinimalRep);
76         CASE(5,TestAPI);
77         CASE(6,TestScriptSet);
78         CASE(7,TestPropertySet);
79         CASE(8,TestClone);
80         CASE(9,TestExhaustive);
81         CASE(10,TestToPattern);
82         CASE(11,TestIndexOf);
83         CASE(12,TestStrings);
84         CASE(13,Testj2268);
85         CASE(14,TestCloseOver);
86         CASE(15,TestEscapePattern);
87         CASE(16,TestInvalidCodePoint);
88         CASE(17,TestSymbolTable);
89         CASE(18,TestSurrogate);
90         CASE(19,TestPosixClasses);
91         CASE(20,TestIteration);
92         CASE(21,TestFreezable);
93         CASE(22,TestSpan);
94         CASE(23,TestStringSpan);
95         default: name = ""; break;
96     }
97 }
98 
// Marker entry used inside the expected-string arrays that TestToPattern()
// passes to expectToPattern(). NOTE(review): presumably it separates strings
// the set must contain from strings it must not contain - confirm against
// expectToPattern().
static const char NOT[] = "%%%%";
100 
101 /**
102  * UVector was improperly copying contents
103  * This code will crash this is still true
104  */
Testj2268()105 void UnicodeSetTest::Testj2268() {
106   UnicodeSet t;
107   t.add(UnicodeString("abc"));
108   UnicodeSet test(t);
109   UnicodeString ustrPat;
110   test.toPattern(ustrPat, TRUE);
111 }
112 
113 /**
114  * Test toPattern().
115  */
TestToPattern()116 void UnicodeSetTest::TestToPattern() {
117     UErrorCode ec = U_ZERO_ERROR;
118 
119     // Test that toPattern() round trips with syntax characters and
120     // whitespace.
121     {
122         static const char* OTHER_TOPATTERN_TESTS[] = {
123             "[[:latin:]&[:greek:]]",
124             "[[:latin:]-[:greek:]]",
125             "[:nonspacing mark:]",
126             NULL
127         };
128 
129         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
130             ec = U_ZERO_ERROR;
131             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
132             if (U_FAILURE(ec)) {
133                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
134                 continue;
135             }
136             checkPat(OTHER_TOPATTERN_TESTS[j], s);
137         }
138 
139         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
140             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
141 
142                 // check various combinations to make sure they all work.
143                 if (i != 0 && !toPatternAux(i, i)){
144                     continue;
145                 }
146                 if (!toPatternAux(0, i)){
147                     continue;
148                 }
149                 if (!toPatternAux(i, 0xFFFF)){
150                     continue;
151                 }
152             }
153         }
154     }
155 
156     // Test pattern behavior of multicharacter strings.
157     {
158         ec = U_ZERO_ERROR;
159         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
160 
161         // This loop isn't a loop.  It's here to make the compiler happy.
162         // If you're curious, try removing it and changing the 'break'
163         // statements (except for the last) to goto's.
164         for (;;) {
165             if (U_FAILURE(ec)) break;
166             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
167             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
168 
169             s->add("ac");
170             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
171             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
172 
173             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
174             if (U_FAILURE(ec)) break;
175             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
176             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
177 
178             s->add("[]");
179             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
180             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
181 
182             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
183             if (U_FAILURE(ec)) break;
184             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
185             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
186 
187             // j2189
188             s->clear();
189             s->add(UnicodeString("abc", ""));
190             s->add(UnicodeString("abc", ""));
191             const char* exp6[] = {"abc", NOT, "ab", NULL};
192             expectToPattern(*s, "[{abc}]", exp6);
193 
194             break;
195         }
196 
197         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
198         delete s;
199     }
200 
201     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
202     UnicodeSet s;
203     s.add((UChar)97, (UChar)98); // 'a', 'b'
204     expectToPattern(s, "[ab]", NULL);
205 }
206 
toPatternAux(UChar32 start,UChar32 end)207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
208 
209     // use Integer.toString because Utility.hex doesn't handle ints
210     UnicodeString pat = "";
211     // TODO do these in hex
212     //String source = "0x" + Integer.toString(start,16).toUpperCase();
213     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
214     UnicodeString source;
215     source = source + (uint32_t)start;
216     if (start != end)
217         source = source + ".." + (uint32_t)end;
218     UnicodeSet testSet;
219     testSet.add(start, end);
220     return checkPat(source, testSet);
221 }
222 
checkPat(const UnicodeString & source,const UnicodeSet & testSet)223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
224                                const UnicodeSet& testSet) {
225     // What we want to make sure of is that a pattern generated
226     // by toPattern(), with or without escaped unprintables, can
227     // be passed back into the UnicodeSet constructor.
228     UnicodeString pat0;
229 
230     testSet.toPattern(pat0, TRUE);
231 
232     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
233 
234     //String pat1 = unescapeLeniently(pat0);
235     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
236 
237     UnicodeString pat2;
238     testSet.toPattern(pat2, FALSE);
239     if (!checkPat(source, testSet, pat2)) return FALSE;
240 
241     //String pat3 = unescapeLeniently(pat2);
242     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
243 
244     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
245     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
246     return TRUE;
247 }
248 
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
250                                const UnicodeSet& testSet,
251                                const UnicodeString& pat) {
252     UErrorCode ec = U_ZERO_ERROR;
253     UnicodeSet testSet2(pat, ec);
254     if (testSet2 != testSet) {
255         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
256         return FALSE;
257     }
258     return TRUE;
259 }
260 
261 void
TestPatterns(void)262 UnicodeSetTest::TestPatterns(void) {
263     UnicodeSet set;
264     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
265     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
266     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
267     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
268     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
269     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
270 
271     // Throw in a test of complement
272     set.complement();
273     UnicodeString exp;
274     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
275     expectPairs(set, exp);
276 }
277 
278 void
TestCategories(void)279 UnicodeSetTest::TestCategories(void) {
280     UErrorCode status = U_ZERO_ERROR;
281     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
282     UnicodeSet set(pat, status);
283     if (U_FAILURE(status)) {
284         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
285         return;
286     } else {
287         expectContainment(set, pat, "ABC", "abc");
288     }
289 
290     UChar32 i;
291     int32_t failures = 0;
292     // Make sure generation of L doesn't pollute cached Lu set
293     // First generate L, then Lu
294     set.applyPattern("[:L:]", status);
295     if (U_FAILURE(status)) { errln("FAIL"); return; }
296     for (i=0; i<0x200; ++i) {
297         UBool l = u_isalpha((UChar)i);
298         if (l != set.contains(i)) {
299             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
300                   set.contains(i));
301             if (++failures == 10) break;
302         }
303     }
304 
305     set.applyPattern("[:Lu:]", status);
306     if (U_FAILURE(status)) { errln("FAIL"); return; }
307     for (i=0; i<0x200; ++i) {
308         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
309         if (lu != set.contains(i)) {
310             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
311                   set.contains(i));
312             if (++failures == 20) break;
313         }
314     }
315 }
316 void
TestCloneEqualHash(void)317 UnicodeSetTest::TestCloneEqualHash(void) {
318     UErrorCode status = U_ZERO_ERROR;
319     // set1 and set2 used to be built with the obsolete constructor taking
320     // UCharCategory values; replaced with pattern constructors
321     // markus 20030502
322     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
323     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
324     if (U_FAILURE(status)){
325         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
326         return;
327     }
328     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
329     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
330     if (U_FAILURE(status)){
331         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
332         return;
333     }
334 
335     if (*set1 != *set1a) {
336         errln("FAIL: category constructor for Ll broken");
337     }
338     if (*set2 != *set2a) {
339         errln("FAIL: category constructor for Nd broken");
340     }
341     delete set1a;
342     delete set2a;
343 
344     logln("Testing copy construction");
345     UnicodeSet *set1copy=new UnicodeSet(*set1);
346     if(*set1 != *set1copy || *set1 == *set2 ||
347         getPairs(*set1) != getPairs(*set1copy) ||
348         set1->hashCode() != set1copy->hashCode()){
349         errln("FAIL : Error in copy construction");
350         return;
351     }
352 
353     logln("Testing =operator");
354     UnicodeSet set1equal=*set1;
355     UnicodeSet set2equal=*set2;
356     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
357         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
358         errln("FAIL: Error in =operator");
359     }
360 
361     logln("Testing clone()");
362     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
363     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
364     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
365         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
366         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
367         errln("FAIL: Error in clone");
368     }
369 
370     logln("Testing hashcode");
371     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
372         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
373         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
374         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
375         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
376         errln("FAIL: Error in hashCode()");
377     }
378 
379     delete set1;
380     delete set1copy;
381     delete set2;
382     delete set1clone;
383     delete set2clone;
384 
385 
386 }
387 void
TestAddRemove(void)388 UnicodeSetTest::TestAddRemove(void) {
389     UnicodeSet set; // Construct empty set
390     doAssert(set.isEmpty() == TRUE, "set should be empty");
391     doAssert(set.size() == 0, "size should be 0");
392     set.complement();
393     doAssert(set.size() == 0x110000, "size should be 0x110000");
394     set.clear();
395     set.add(0x0061, 0x007a);
396     expectPairs(set, "az");
397     doAssert(set.isEmpty() == FALSE, "set should not be empty");
398     doAssert(set.size() != 0, "size should not be equal to 0");
399     doAssert(set.size() == 26, "size should be equal to 26");
400     set.remove(0x006d, 0x0070);
401     expectPairs(set, "alqz");
402     doAssert(set.size() == 22, "size should be equal to 22");
403     set.remove(0x0065, 0x0067);
404     expectPairs(set, "adhlqz");
405     doAssert(set.size() == 19, "size should be equal to 19");
406     set.remove(0x0064, 0x0069);
407     expectPairs(set, "acjlqz");
408     doAssert(set.size() == 16, "size should be equal to 16");
409     set.remove(0x0063, 0x0072);
410     expectPairs(set, "absz");
411     doAssert(set.size() == 10, "size should be equal to 10");
412     set.add(0x0066, 0x0071);
413     expectPairs(set, "abfqsz");
414     doAssert(set.size() == 22, "size should be equal to 22");
415     set.remove(0x0061, 0x0067);
416     expectPairs(set, "hqsz");
417     set.remove(0x0061, 0x007a);
418     expectPairs(set, "");
419     doAssert(set.isEmpty() == TRUE, "set should be empty");
420     doAssert(set.size() == 0, "size should be 0");
421     set.add(0x0061);
422     doAssert(set.isEmpty() == FALSE, "set should not be empty");
423     doAssert(set.size() == 1, "size should not be equal to 1");
424     set.add(0x0062);
425     set.add(0x0063);
426     expectPairs(set, "ac");
427     doAssert(set.size() == 3, "size should not be equal to 3");
428     set.add(0x0070);
429     set.add(0x0071);
430     expectPairs(set, "acpq");
431     doAssert(set.size() == 5, "size should not be equal to 5");
432     set.clear();
433     expectPairs(set, "");
434     doAssert(set.isEmpty() == TRUE, "set should be empty");
435     doAssert(set.size() == 0, "size should be 0");
436 
437     // Try removing an entire set from another set
438     expectPattern(set, "[c-x]", "cx");
439     UnicodeSet set2;
440     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
441     set.removeAll(set2);
442     expectPairs(set, "deluxx");
443 
444     // Try adding an entire set to another set
445     expectPattern(set, "[jackiemclean]", "aacceein");
446     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
447     set.addAll(set2);
448     expectPairs(set, "aacehort");
449     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
450 
451     // Try retaining an set of elements contained in another set (intersection)
452     UnicodeSet set3;
453     expectPattern(set3, "[a-c]", "ac");
454     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
455     set3.remove(0x0062);
456     expectPairs(set3, "aacc");
457     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
458     set.retainAll(set3);
459     expectPairs(set, "aacc");
460     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
461     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
462     set.clear();
463     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
464 
465     // Test commutativity
466     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
467     expectPattern(set2, "[jackiemclean]", "aacceein");
468     set.addAll(set2);
469     expectPairs(set, "aacehort");
470     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
471 
472 
473 
474 
475 }
476 
477 /**
478  * Make sure minimal representation is maintained.
479  */
TestMinimalRep()480 void UnicodeSetTest::TestMinimalRep() {
481     UErrorCode status = U_ZERO_ERROR;
482     // This is pretty thoroughly tested by checkCanonicalRep()
483     // run against the exhaustive operation results.  Use the code
484     // here for debugging specific spot problems.
485 
486     // 1 overlap against 2
487     UnicodeSet set("[h-km-q]", status);
488     if (U_FAILURE(status)) { errln("FAIL"); return; }
489     UnicodeSet set2("[i-o]", status);
490     if (U_FAILURE(status)) { errln("FAIL"); return; }
491     set.addAll(set2);
492     expectPairs(set, "hq");
493     // right
494     set.applyPattern("[a-m]", status);
495     if (U_FAILURE(status)) { errln("FAIL"); return; }
496     set2.applyPattern("[e-o]", status);
497     if (U_FAILURE(status)) { errln("FAIL"); return; }
498     set.addAll(set2);
499     expectPairs(set, "ao");
500     // left
501     set.applyPattern("[e-o]", status);
502     if (U_FAILURE(status)) { errln("FAIL"); return; }
503     set2.applyPattern("[a-m]", status);
504     if (U_FAILURE(status)) { errln("FAIL"); return; }
505     set.addAll(set2);
506     expectPairs(set, "ao");
507     // 1 overlap against 3
508     set.applyPattern("[a-eg-mo-w]", status);
509     if (U_FAILURE(status)) { errln("FAIL"); return; }
510     set2.applyPattern("[d-q]", status);
511     if (U_FAILURE(status)) { errln("FAIL"); return; }
512     set.addAll(set2);
513     expectPairs(set, "aw");
514 }
515 
TestAPI()516 void UnicodeSetTest::TestAPI() {
517     UErrorCode status = U_ZERO_ERROR;
518     // default ct
519     UnicodeSet set;
520     if (!set.isEmpty() || set.getRangeCount() != 0) {
521         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
522               set);
523     }
524 
525     // clear(), isEmpty()
526     set.add(0x0061);
527     if (set.isEmpty()) {
528         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
529               set);
530     }
531     set.clear();
532     if (!set.isEmpty()) {
533         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
534               set);
535     }
536 
537     // size()
538     set.clear();
539     if (set.size() != 0) {
540         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
541               ": " + set);
542     }
543     set.add(0x0061);
544     if (set.size() != 1) {
545         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
546               ": " + set);
547     }
548     set.add(0x0031, 0x0039);
549     if (set.size() != 10) {
550         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
551               ": " + set);
552     }
553 
554     // contains(first, last)
555     set.clear();
556     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
557     if (U_FAILURE(status)) { errln("FAIL"); return; }
558     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
559         UChar32 a = set.getRangeStart(i);
560         UChar32 b = set.getRangeEnd(i);
561         if (!set.contains(a, b)) {
562             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
563                   " but doesn't: " + set);
564         }
565         if (set.contains((UChar32)(a-1), b)) {
566             errln((UnicodeString)"FAIL, shouldn't contain " +
567                   (unsigned short)(a-1) + '-' + (unsigned short)b +
568                   " but does: " + set);
569         }
570         if (set.contains(a, (UChar32)(b+1))) {
571             errln((UnicodeString)"FAIL, shouldn't contain " +
572                   (unsigned short)a + '-' + (unsigned short)(b+1) +
573                   " but does: " + set);
574         }
575     }
576 
577     // Ported InversionList test.
578     UnicodeSet a((UChar32)3,(UChar32)10);
579     UnicodeSet b((UChar32)7,(UChar32)15);
580     UnicodeSet c;
581 
582     logln((UnicodeString)"a [3-10]: " + a);
583     logln((UnicodeString)"b [7-15]: " + b);
584     c = a;
585     c.addAll(b);
586     UnicodeSet exp((UChar32)3,(UChar32)15);
587     if (c == exp) {
588         logln((UnicodeString)"c.set(a).add(b): " + c);
589     } else {
590         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
591     }
592     c.complement();
593     exp.set((UChar32)0, (UChar32)2);
594     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
595     if (c == exp) {
596         logln((UnicodeString)"c.complement(): " + c);
597     } else {
598         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
599     }
600     c.complement();
601     exp.set((UChar32)3, (UChar32)15);
602     if (c == exp) {
603         logln((UnicodeString)"c.complement(): " + c);
604     } else {
605         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
606     }
607     c = a;
608     c.complementAll(b);
609     exp.set((UChar32)3,(UChar32)6);
610     exp.add((UChar32)11,(UChar32) 15);
611     if (c == exp) {
612         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
613     } else {
614         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
615     }
616 
617     exp = c;
618     bitsToSet(setToBits(c), c);
619     if (c == exp) {
620         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
621     } else {
622         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
623     }
624 
625     // Additional tests for coverage JB#2118
626     //UnicodeSet::complement(class UnicodeString const &)
627     //UnicodeSet::complementAll(class UnicodeString const &)
628     //UnicodeSet::containsNone(class UnicodeSet const &)
629     //UnicodeSet::containsNone(long,long)
630     //UnicodeSet::containsSome(class UnicodeSet const &)
631     //UnicodeSet::containsSome(long,long)
632     //UnicodeSet::removeAll(class UnicodeString const &)
633     //UnicodeSet::retain(long)
634     //UnicodeSet::retainAll(class UnicodeString const &)
635     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
636     //UnicodeSetIterator::getString(void)
637     set.clear();
638     set.complement("ab");
639     exp.applyPattern("[{ab}]", status);
640     if (U_FAILURE(status)) { errln("FAIL"); return; }
641     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
642 
643     UnicodeSetIterator iset(set);
644     if (!iset.next() || !iset.isString()) {
645         errln("FAIL: UnicodeSetIterator::next/isString");
646     } else if (iset.getString() != "ab") {
647         errln("FAIL: UnicodeSetIterator::getString");
648     }
649 
650     set.add((UChar32)0x61, (UChar32)0x7A);
651     set.complementAll("alan");
652     exp.applyPattern("[{ab}b-kmo-z]", status);
653     if (U_FAILURE(status)) { errln("FAIL"); return; }
654     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
655 
656     exp.applyPattern("[a-z]", status);
657     if (U_FAILURE(status)) { errln("FAIL"); return; }
658     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
659     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
660     exp.applyPattern("[aln]", status);
661     if (U_FAILURE(status)) { errln("FAIL"); return; }
662     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
663     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
664 
665     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
666         errln("FAIL: containsNone(UChar32, UChar32)");
667     }
668     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
669         errln("FAIL: containsSome(UChar32, UChar32)");
670     }
671     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
672         errln("FAIL: containsNone(UChar32, UChar32)");
673     }
674     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
675         errln("FAIL: containsSome(UChar32, UChar32)");
676     }
677 
678     set.removeAll("liu");
679     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
680     if (U_FAILURE(status)) { errln("FAIL"); return; }
681     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
682 
683     set.retainAll("star");
684     exp.applyPattern("[rst]", status);
685     if (U_FAILURE(status)) { errln("FAIL"); return; }
686     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
687 
688     set.retain((UChar32)0x73);
689     exp.applyPattern("[s]", status);
690     if (U_FAILURE(status)) { errln("FAIL"); return; }
691     if (set != exp) { errln("FAIL: retain('s')"); return; }
692 
693     uint16_t buf[32];
694     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
695     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
696     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
697         errln("FAIL: serialize");
698         return;
699     }
700 
701     // Conversions to and from USet
702     UnicodeSet *uniset = &set;
703     USet *uset = uniset->toUSet();
704     TEST_ASSERT((void *)uset == (void *)uniset);
705     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
706     TEST_ASSERT((void *)setx == (void *)uset);
707     const UnicodeSet *constSet = uniset;
708     const USet *constUSet = constSet->toUSet();
709     TEST_ASSERT((void *)constUSet == (void *)constSet);
710     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
711     TEST_ASSERT((void *)constSetx == (void *)constUSet);
712 
713     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
714     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
715     UnicodeSet ac(0x61, 0x63);
716     ac.remove(0x62).freeze();
717     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
718         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
719         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
720         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
721         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
722         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
723         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
724         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
725         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
726         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
727     ) {
728         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
729     }
730     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
731         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
732         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
733         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
734         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
735         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
736         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
737         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
738         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
739         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
740     ) {
741         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
742     }
743 }
744 
TestIteration()745 void UnicodeSetTest::TestIteration() {
746     UErrorCode ec = U_ZERO_ERROR;
747     int i = 0;
748     int outerLoop;
749 
750     // 6 code points, 3 ranges, 2 strings, 8 total elements
751     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
752     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
753     TEST_ASSERT_SUCCESS(ec);
754     UnicodeSetIterator it(set);
755 
756     for (outerLoop=0; outerLoop<3; outerLoop++) {
757         // Run the test multiple times, to check that iterator.reset() is working.
758         for (i=0; i<10; i++) {
759             UBool         nextv        = it.next();
760             UBool         isString     = it.isString();
761             int32_t       codePoint    = it.getCodepoint();
762             //int32_t       codePointEnd = it.getCodepointEnd();
763             UnicodeString s   = it.getString();
764             switch (i) {
765             case 0:
766                 TEST_ASSERT(nextv == TRUE);
767                 TEST_ASSERT(isString == FALSE);
768                 TEST_ASSERT(codePoint==0x61);
769                 TEST_ASSERT(s == "a");
770                 break;
771             case 1:
772                 TEST_ASSERT(nextv == TRUE);
773                 TEST_ASSERT(isString == FALSE);
774                 TEST_ASSERT(codePoint==0x62);
775                 TEST_ASSERT(s == "b");
776                 break;
777             case 2:
778                 TEST_ASSERT(nextv == TRUE);
779                 TEST_ASSERT(isString == FALSE);
780                 TEST_ASSERT(codePoint==0x63);
781                 TEST_ASSERT(s == "c");
782                 break;
783             case 3:
784                 TEST_ASSERT(nextv == TRUE);
785                 TEST_ASSERT(isString == FALSE);
786                 TEST_ASSERT(codePoint==0x79);
787                 TEST_ASSERT(s == "y");
788                 break;
789             case 4:
790                 TEST_ASSERT(nextv == TRUE);
791                 TEST_ASSERT(isString == FALSE);
792                 TEST_ASSERT(codePoint==0x7a);
793                 TEST_ASSERT(s == "z");
794                 break;
795             case 5:
796                 TEST_ASSERT(nextv == TRUE);
797                 TEST_ASSERT(isString == FALSE);
798                 TEST_ASSERT(codePoint==0x1abcd);
799                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
800                 break;
801             case 6:
802                 TEST_ASSERT(nextv == TRUE);
803                 TEST_ASSERT(isString == TRUE);
804                 TEST_ASSERT(s == "str1");
805                 break;
806             case 7:
807                 TEST_ASSERT(nextv == TRUE);
808                 TEST_ASSERT(isString == TRUE);
809                 TEST_ASSERT(s == "str2");
810                 break;
811             case 8:
812                 TEST_ASSERT(nextv == FALSE);
813                 break;
814             case 9:
815                 TEST_ASSERT(nextv == FALSE);
816                 break;
817             }
818         }
819         it.reset();  // prepare to run the iteration again.
820     }
821 }
822 
823 
824 
825 
TestStrings()826 void UnicodeSetTest::TestStrings() {
827     UErrorCode ec = U_ZERO_ERROR;
828 
829     UnicodeSet* testList[] = {
830         UnicodeSet::createFromAll("abc"),
831         new UnicodeSet("[a-c]", ec),
832 
833         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
834         new UnicodeSet("[{ll}{ch}a-z]", ec),
835 
836         UnicodeSet::createFrom("ab}c"),
837         new UnicodeSet("[{ab\\}c}]", ec),
838 
839         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
840         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
841 
842         NULL
843     };
844 
845     if (U_FAILURE(ec)) {
846         errln("FAIL: couldn't construct test sets");
847     }
848 
849     for (int32_t i = 0; testList[i] != NULL; i+=2) {
850         if (U_SUCCESS(ec)) {
851             UnicodeString pat0, pat1;
852             testList[i]->toPattern(pat0, TRUE);
853             testList[i+1]->toPattern(pat1, TRUE);
854             if (*testList[i] == *testList[i+1]) {
855                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
856             } else {
857                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
858             }
859         }
860         delete testList[i];
861         delete testList[i+1];
862     }
863 }
864 
865 /**
866  * Test the [:Latin:] syntax.
867  */
TestScriptSet()868 void UnicodeSetTest::TestScriptSet() {
869     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
870 
871     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
872 
873     /* Jitterbug 1423 */
874     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
875 
876 }
877 
878 /**
879  * Test the [:Latin:] syntax.
880  */
TestPropertySet()881 void UnicodeSetTest::TestPropertySet() {
882     static const char* const DATA[] = {
883         // Pattern, Chars IN, Chars NOT in
884 
885         "[:Latin:]",
886         "aA",
887         "\\u0391\\u03B1",
888 
889         "[\\p{Greek}]",
890         "\\u0391\\u03B1",
891         "aA",
892 
893         "\\P{ GENERAL Category = upper case letter }",
894         "abc",
895         "ABC",
896 
897 #if !UCONFIG_NO_NORMALIZATION
898         // Combining class: @since ICU 2.2
899         // Check both symbolic and numeric
900         "\\p{ccc=Nukta}",
901         "\\u0ABC",
902         "abc",
903 
904         "\\p{Canonical Combining Class = 11}",
905         "\\u05B1",
906         "\\u05B2",
907 
908         "[:c c c = iota subscript :]",
909         "\\u0345",
910         "xyz",
911 #endif
912 
913         // Bidi class: @since ICU 2.2
914         "\\p{bidiclass=lefttoright}",
915         "abc",
916         "\\u0671\\u0672",
917 
918         // Binary properties: @since ICU 2.2
919         "\\p{ideographic}",
920         "\\u4E0A",
921         "x",
922 
923         "[:math=false:]",
924         "q)*(",
925         // weiv: )(and * were removed from math in Unicode 4.0.1
926         //"(*+)",
927         "+<>^",
928 
929         // JB#1767 \N{}, \p{ASCII}
930         "[:Ascii:]",
931         "abc\\u0000\\u007F",
932         "\\u0080\\u4E00",
933 
934         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
935         "az",
936         "qrs",
937 
938         // JB#2015
939         "[:any:]",
940         "a\\U0010FFFF",
941         "",
942 
943         "[:nv=0.5:]",
944         "\\u00BD\\u0F2A",
945         "\\u00BC",
946 
947         // JB#2653: Age
948         "[:Age=1.1:]",
949         "\\u03D6", // 1.1
950         "\\u03D8\\u03D9", // 3.2
951 
952         "[:Age=3.1:]",
953         "\\u1800\\u3400\\U0002f800",
954         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
955 
956         // JB#2350: Case_Sensitive
957         "[:Case Sensitive:]",
958         "A\\u1FFC\\U00010410",
959         ";\\u00B4\\U00010500",
960 
961         // JB#2832: C99-compatibility props
962         "[:blank:]",
963         " \\u0009",
964         "1-9A-Z",
965 
966         "[:graph:]",
967         "19AZ",
968         " \\u0003\\u0007\\u0009\\u000A\\u000D",
969 
970         "[:punct:]",
971         "!@#%&*()[]{}-_\\/;:,.?'\"",
972         "09azAZ",
973 
974         "[:xdigit:]",
975         "09afAF",
976         "gG!",
977 
978         // Regex compatibility test
979         "[-b]", // leading '-' is literal
980         "-b",
981         "ac",
982 
983         "[^-b]", // leading '-' is literal
984         "ac",
985         "-b",
986 
987         "[b-]", // trailing '-' is literal
988         "-b",
989         "ac",
990 
991         "[^b-]", // trailing '-' is literal
992         "ac",
993         "-b",
994 
995         "[a-b-]", // trailing '-' is literal
996         "ab-",
997         "c=",
998 
999         "[[a-q]&[p-z]-]", // trailing '-' is literal
1000         "pq-",
1001         "or=",
1002 
1003         "[\\s|\\)|:|$|\\>]", // from regex tests
1004         "s|):$>",
1005         "abc",
1006 
1007         "[\\uDC00cd]", // JB#2906: isolated trail at start
1008         "cd\\uDC00",
1009         "ab\\uD800\\U00010000",
1010 
1011         "[ab\\uD800]", // JB#2906: isolated trail at start
1012         "ab\\uD800",
1013         "cd\\uDC00\\U00010000",
1014 
1015         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016         "abcd\\uD800",
1017         "ef\\uDC00\\U00010000",
1018 
1019         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020         "abcd\\uDC00",
1021         "ef\\uD800\\U00010000",
1022 
1023 #if !UCONFIG_NO_NORMALIZATION
1024         "[:^lccc=0:]", // Lead canonical class
1025         "\\u0300\\u0301",
1026         "abcd\\u00c0\\u00c5",
1027 
1028         "[:^tccc=0:]", // Trail canonical class
1029         "\\u0300\\u0301\\u00c0\\u00c5",
1030         "abcd",
1031 
1032         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033         "\\u0300\\u0301\\u00c0\\u00c5",
1034         "abcd",
1035 
1036         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037         "",
1038         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039 
1040         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041         "\\u0F73\\u0F75\\u0F81",
1042         "abcd\\u0300\\u0301\\u00c0\\u00c5",
1043 #endif /* !UCONFIG_NO_NORMALIZATION */
1044 
1045         "[:Assigned:]",
1046         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1047         "\\u0888\\uFDD3\\uFFFE\\U00050005",
1048 
1049         // Script_Extensions, new in Unicode 6.0
1050         "[:scx=Arab:]",
1051         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1052         "\\u061D\\u065F\\uFDEF\\uFDFE",
1053 
1054         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1055         // so scx-sc is missing U+FDF2.
1056         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1057         "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
1058         "\\uFDF2"
1059     };
1060 
1061     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1062 
1063     for (int32_t i=0; i<DATA_LEN; i+=3) {
1064         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1065                           CharsToUnicodeString(DATA[i+2]));
1066     }
1067 }
1068 
1069 /**
1070   * Test that Posix style character classes [:digit:], etc.
1071   *   have the Unicode definitions from TR 18.
1072   */
TestPosixClasses()1073 void UnicodeSetTest::TestPosixClasses() {
1074     {
1075         UErrorCode status = U_ZERO_ERROR;
1076         UnicodeSet s1("[:alpha:]", status);
1077         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1078         TEST_ASSERT_SUCCESS(status);
1079         TEST_ASSERT(s1==s2);
1080     }
1081     {
1082         UErrorCode status = U_ZERO_ERROR;
1083         UnicodeSet s1("[:lower:]", status);
1084         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1085         TEST_ASSERT_SUCCESS(status);
1086         TEST_ASSERT(s1==s2);
1087     }
1088     {
1089         UErrorCode status = U_ZERO_ERROR;
1090         UnicodeSet s1("[:upper:]", status);
1091         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1092         TEST_ASSERT_SUCCESS(status);
1093         TEST_ASSERT(s1==s2);
1094     }
1095     {
1096         UErrorCode status = U_ZERO_ERROR;
1097         UnicodeSet s1("[:punct:]", status);
1098         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1099         TEST_ASSERT_SUCCESS(status);
1100         TEST_ASSERT(s1==s2);
1101     }
1102     {
1103         UErrorCode status = U_ZERO_ERROR;
1104         UnicodeSet s1("[:digit:]", status);
1105         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1106         TEST_ASSERT_SUCCESS(status);
1107         TEST_ASSERT(s1==s2);
1108     }
1109     {
1110         UErrorCode status = U_ZERO_ERROR;
1111         UnicodeSet s1("[:xdigit:]", status);
1112         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1113         TEST_ASSERT_SUCCESS(status);
1114         TEST_ASSERT(s1==s2);
1115     }
1116     {
1117         UErrorCode status = U_ZERO_ERROR;
1118         UnicodeSet s1("[:alnum:]", status);
1119         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1120         TEST_ASSERT_SUCCESS(status);
1121         TEST_ASSERT(s1==s2);
1122     }
1123     {
1124         UErrorCode status = U_ZERO_ERROR;
1125         UnicodeSet s1("[:space:]", status);
1126         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1127         TEST_ASSERT_SUCCESS(status);
1128         TEST_ASSERT(s1==s2);
1129     }
1130     {
1131         UErrorCode status = U_ZERO_ERROR;
1132         UnicodeSet s1("[:blank:]", status);
1133         TEST_ASSERT_SUCCESS(status);
1134         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1135             status);
1136         TEST_ASSERT_SUCCESS(status);
1137         TEST_ASSERT(s1==s2);
1138     }
1139     {
1140         UErrorCode status = U_ZERO_ERROR;
1141         UnicodeSet s1("[:cntrl:]", status);
1142         TEST_ASSERT_SUCCESS(status);
1143         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1144         TEST_ASSERT_SUCCESS(status);
1145         TEST_ASSERT(s1==s2);
1146     }
1147     {
1148         UErrorCode status = U_ZERO_ERROR;
1149         UnicodeSet s1("[:graph:]", status);
1150         TEST_ASSERT_SUCCESS(status);
1151         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1152         TEST_ASSERT_SUCCESS(status);
1153         TEST_ASSERT(s1==s2);
1154     }
1155     {
1156         UErrorCode status = U_ZERO_ERROR;
1157         UnicodeSet s1("[:print:]", status);
1158         TEST_ASSERT_SUCCESS(status);
1159         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1160         TEST_ASSERT_SUCCESS(status);
1161         TEST_ASSERT(s1==s2);
1162     }
1163 }
1164 /**
1165  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1166  */
TestClone()1167 void UnicodeSetTest::TestClone() {
1168     UErrorCode ec = U_ZERO_ERROR;
1169     UnicodeSet s("[abcxyz]", ec);
1170     UnicodeSet t(s);
1171     expectContainment(t, "abc", "def");
1172 }
1173 
1174 /**
1175  * Test the indexOf() and charAt() methods.
1176  */
TestIndexOf()1177 void UnicodeSetTest::TestIndexOf() {
1178     UErrorCode ec = U_ZERO_ERROR;
1179     UnicodeSet set("[a-cx-y3578]", ec);
1180     if (U_FAILURE(ec)) {
1181         errln("FAIL: UnicodeSet constructor");
1182         return;
1183     }
1184     for (int32_t i=0; i<set.size(); ++i) {
1185         UChar32 c = set.charAt(i);
1186         if (set.indexOf(c) != i) {
1187             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1188                 i, c, set.indexOf(c));
1189         }
1190     }
1191     UChar32 c = set.charAt(set.size());
1192     if (c != -1) {
1193         errln("FAIL: charAt(<out of range>) = %X", c);
1194     }
1195     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1196     if (j != -1) {
1197         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1198     }
1199 }
1200 
1201 /**
1202  * Test closure API.
1203  */
TestCloseOver()1204 void UnicodeSetTest::TestCloseOver() {
1205     UErrorCode ec = U_ZERO_ERROR;
1206 
1207     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1208     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1209     const char* DATA[] = {
1210         // selector, input, output
1211         CASE,
1212         "[aq\\u00DF{Bc}{bC}{Fi}]",
1213         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1214 
1215         CASE,
1216         "[\\u01F1]", // 'DZ'
1217         "[\\u01F1\\u01F2\\u01F3]",
1218 
1219         CASE,
1220         "[\\u1FB4]",
1221         "[\\u1FB4{\\u03AC\\u03B9}]",
1222 
1223         CASE,
1224         "[{F\\uFB01}]",
1225         "[\\uFB03{ffi}]",
1226 
1227         CASE, // make sure binary search finds limits
1228         "[a\\uFF3A]",
1229         "[aA\\uFF3A\\uFF5A]",
1230 
1231         CASE,
1232         "[a-z]","[A-Za-z\\u017F\\u212A]",
1233         CASE,
1234         "[abc]","[A-Ca-c]",
1235         CASE,
1236         "[ABC]","[A-Ca-c]",
1237 
1238         CASE, "[i]", "[iI]",
1239 
1240         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1241         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1242 
1243         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1244 
1245         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1246 
1247         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1248 
1249         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1250 
1251         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1252 
1253         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1254 
1255         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1256         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1257 
1258         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1259 
1260         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1261 
1262         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1263 
1264 #if !UCONFIG_NO_FILE_IO
1265         CASE_MAPPINGS,
1266         "[aq\\u00DF{Bc}{bC}{Fi}]",
1267         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1268 #endif
1269 
1270         CASE_MAPPINGS,
1271         "[\\u01F1]", // 'DZ'
1272         "[\\u01F1\\u01F2\\u01F3]",
1273 
1274         CASE_MAPPINGS,
1275         "[a-z]",
1276         "[A-Za-z]",
1277 
1278         NULL
1279     };
1280 
1281     UnicodeSet s;
1282     UnicodeSet t;
1283     UnicodeString buf;
1284     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1285         int32_t selector = DATA[i][0];
1286         UnicodeString pat(DATA[i+1], -1, US_INV);
1287         UnicodeString exp(DATA[i+2], -1, US_INV);
1288         s.applyPattern(pat, ec);
1289         s.closeOver(selector);
1290         t.applyPattern(exp, ec);
1291         if (U_FAILURE(ec)) {
1292             errln("FAIL: applyPattern failed");
1293             continue;
1294         }
1295         if (s == t) {
1296             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1297         } else {
1298             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1299                   s.toPattern(buf, TRUE) + ", expected " + exp);
1300         }
1301     }
1302 
1303 #if 0
1304     /*
1305      * Unused test code.
1306      * This was used to compare the old implementation (using USET_CASE)
1307      * with the new one (using 0x100 temporarily)
1308      * while transitioning from hardcoded case closure tables in uniset.cpp
1309      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1310      * and using ucase.c functions for closure.
1311      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1312      *
1313      * Note: The old and new implementation never fully matched because
1314      * the old implementation turned out to not map U+0130 and U+0131 correctly
1315      * (dotted I and dotless i) and because the old implementation's data tables
1316      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1317      * new implementation. (So sigmas and some other characters were not handled
1318      * according to the newer Unicode version.)
1319      */
1320     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1321     UnicodeSetIterator si(sens);
1322     UnicodeString str, buf2;
1323     const UnicodeString *pStr;
1324     UChar32 c;
1325     while(si.next()) {
1326         if(!si.isString()) {
1327             c=si.getCodepoint();
1328             s.clear();
1329             s.add(c);
1330 
1331             str.setTo(c);
1332             str.foldCase();
1333             sens2.add(str);
1334 
1335             t=s;
1336             s.closeOver(USET_CASE);
1337             t.closeOver(0x100);
1338             if(s!=t) {
1339                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1340                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1341             }
1342         }
1343     }
1344     // remove all code points
1345     // should contain all full case folding mapping strings
1346     sens2.remove(0, 0x10ffff);
1347     si.reset(sens2);
1348     while(si.next()) {
1349         if(si.isString()) {
1350             pStr=&si.getString();
1351             s.clear();
1352             s.add(*pStr);
1353             t=s2=s;
1354             s.closeOver(USET_CASE);
1355             t.closeOver(0x100);
1356             if(s!=t) {
1357                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1358                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1359             }
1360         }
1361     }
1362 #endif
1363 
1364     // Test the pattern API
1365     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1366     if (U_FAILURE(ec)) {
1367         errln("FAIL: applyPattern failed");
1368     } else {
1369         expectContainment(s, "abcABC", "defDEF");
1370     }
1371     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1372     if (U_FAILURE(ec)) {
1373         errln("FAIL: constructor failed");
1374     } else {
1375         expectContainment(v, "defDEF", "abcABC");
1376     }
1377     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1378     if (U_FAILURE(ec)) {
1379         errln("FAIL: construct w/case mappings failed");
1380     } else {
1381         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1382     }
1383 }
1384 
TestEscapePattern()1385 void UnicodeSetTest::TestEscapePattern() {
1386     const char pattern[] =
1387         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1388     const char exp[] =
1389         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1390     // We test this with two passes; in the second pass we
1391     // pre-unescape the pattern.  Since U+200E is rule whitespace,
1392     // this fails -- which is what we expect.
1393     for (int32_t pass=1; pass<=2; ++pass) {
1394         UErrorCode ec = U_ZERO_ERROR;
1395         UnicodeString pat(pattern, -1, US_INV);
1396         if (pass==2) {
1397             pat = pat.unescape();
1398         }
1399         // Pattern is only good for pass 1
1400         UBool isPatternValid = (pass==1);
1401 
1402         UnicodeSet set(pat, ec);
1403         if (U_SUCCESS(ec) != isPatternValid){
1404             errln((UnicodeString)"FAIL: applyPattern(" +
1405                   escape(pat) + ") => " +
1406                   u_errorName(ec));
1407             continue;
1408         }
1409         if (U_FAILURE(ec)) {
1410             continue;
1411         }
1412         if (set.contains((UChar)0x0644)){
1413             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1414         }
1415 
1416         UnicodeString newpat;
1417         set.toPattern(newpat, TRUE);
1418         if (newpat == UnicodeString(exp, -1, US_INV)) {
1419             logln(escape(pat) + " => " + newpat);
1420         } else {
1421             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1422         }
1423 
1424         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1425             UnicodeString str("Range ");
1426             str.append((UChar)(0x30 + i))
1427                 .append(": ")
1428                 .append((UChar32)set.getRangeStart(i))
1429                 .append(" - ")
1430                 .append((UChar32)set.getRangeEnd(i));
1431             str = str + " (" + set.getRangeStart(i) + " - " +
1432                 set.getRangeEnd(i) + ")";
1433             if (set.getRangeStart(i) < 0) {
1434                 errln((UnicodeString)"FAIL: " + escape(str));
1435             } else {
1436                 logln(escape(str));
1437             }
1438         }
1439     }
1440 }
1441 
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1442 void UnicodeSetTest::expectRange(const UnicodeString& label,
1443                                  const UnicodeSet& set,
1444                                  UChar32 start, UChar32 end) {
1445     UnicodeSet exp(start, end);
1446     UnicodeString pat;
1447     if (set == exp) {
1448         logln(label + " => " + set.toPattern(pat, TRUE));
1449     } else {
1450         UnicodeString xpat;
1451         errln((UnicodeString)"FAIL: " + label + " => " +
1452               set.toPattern(pat, TRUE) +
1453               ", expected " + exp.toPattern(xpat, TRUE));
1454     }
1455 }
1456 
TestInvalidCodePoint()1457 void UnicodeSetTest::TestInvalidCodePoint() {
1458 
1459     const UChar32 DATA[] = {
1460         // Test range             Expected range
1461         0, 0x10FFFF,              0, 0x10FFFF,
1462         (UChar32)-1, 8,           0, 8,
1463         8, 0x110000,              8, 0x10FFFF
1464     };
1465     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1466 
1467     UnicodeString pat;
1468     int32_t i;
1469 
1470     for (i=0; i<DATA_LENGTH; i+=4) {
1471         UChar32 start  = DATA[i];
1472         UChar32 end    = DATA[i+1];
1473         UChar32 xstart = DATA[i+2];
1474         UChar32 xend   = DATA[i+3];
1475 
1476         // Try various API using the test code points
1477 
1478         UnicodeSet set(start, end);
1479         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1480                     set, xstart, xend);
1481 
1482         set.clear();
1483         set.set(start, end);
1484         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1485                     set, xstart, xend);
1486 
1487         UBool b = set.contains(start);
1488         b = set.contains(start, end);
1489         b = set.containsNone(start, end);
1490         b = set.containsSome(start, end);
1491 
1492         /*int32_t index = set.indexOf(start);*/
1493 
1494         set.clear();
1495         set.add(start);
1496         set.add(start, end);
1497         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1498                     set, xstart, xend);
1499 
1500         set.set(0, 0x10FFFF);
1501         set.retain(start, end);
1502         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1503                     set, xstart, xend);
1504         set.retain(start);
1505 
1506         set.set(0, 0x10FFFF);
1507         set.remove(start);
1508         set.remove(start, end);
1509         set.complement();
1510         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1511                     set, xstart, xend);
1512 
1513         set.set(0, 0x10FFFF);
1514         set.complement(start, end);
1515         set.complement();
1516         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1517                     set, xstart, xend);
1518         set.complement(start);
1519     }
1520 
1521     const UChar32 DATA2[] = {
1522         0,
1523         0x10FFFF,
1524         (UChar32)-1,
1525         0x110000
1526     };
1527     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1528 
1529     for (i=0; i<DATA2_LENGTH; ++i) {
1530         UChar32 c = DATA2[i], end = 0x10FFFF;
1531         UBool valid = (c >= 0 && c <= 0x10FFFF);
1532 
1533         UnicodeSet set(0, 0x10FFFF);
1534 
1535         // For single-codepoint contains, invalid codepoints are NOT contained
1536         UBool b = set.contains(c);
1537         if (b == valid) {
1538             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1539                   ") = " + b);
1540         } else {
1541             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1542                   ") = " + b);
1543         }
1544 
1545         // For codepoint range contains, containsNone, and containsSome,
1546         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1547         b = set.contains(c, end);
1548         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1549               "," + end + ") = " + b);
1550 
1551         b = set.containsNone(c, end);
1552         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1553               "," + end + ") = " + b);
1554 
1555         b = set.containsSome(c, end);
1556         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1557               "," + end + ") = " + b);
1558 
1559         int32_t index = set.indexOf(c);
1560         if ((index >= 0) == valid) {
1561             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1562                   ") = " + index);
1563         } else {
1564             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1565                   ") = " + index);
1566         }
1567     }
1568 }
1569 
1570 // Used by TestSymbolTable
1571 class TokenSymbolTable : public SymbolTable {
1572 public:
1573     Hashtable contents;
1574 
TokenSymbolTable(UErrorCode & ec)1575     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1576         contents.setValueDeleter(uhash_deleteUnicodeString);
1577     }
1578 
~TokenSymbolTable()1579     ~TokenSymbolTable() {}
1580 
1581     /**
1582      * (Non-SymbolTable API) Add the given variable and value to
1583      * the table.  Variable should NOT contain leading '$'.
1584      */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1585     void add(const UnicodeString& var, const UnicodeString& value,
1586              UErrorCode& ec) {
1587         if (U_SUCCESS(ec)) {
1588             contents.put(var, new UnicodeString(value), ec);
1589         }
1590     }
1591 
1592     /**
1593      * SymbolTable API
1594      */
lookup(const UnicodeString & s) const1595     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1596         return (const UnicodeString*) contents.get(s);
1597     }
1598 
1599     /**
1600      * SymbolTable API
1601      */
lookupMatcher(UChar32) const1602     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1603         return NULL;
1604     }
1605 
1606     /**
1607      * SymbolTable API
1608      */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1609     virtual UnicodeString parseReference(const UnicodeString& text,
1610                                          ParsePosition& pos, int32_t limit) const {
1611         int32_t start = pos.getIndex();
1612         int32_t i = start;
1613         UnicodeString result;
1614         while (i < limit) {
1615             UChar c = text.charAt(i);
1616             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1617                 break;
1618             }
1619             ++i;
1620         }
1621         if (i == start) { // No valid name chars
1622             return result; // Indicate failure with empty string
1623         }
1624         pos.setIndex(i);
1625         text.extractBetween(start, i, result);
1626         return result;
1627     }
1628 };
1629 
TestSymbolTable()1630 void UnicodeSetTest::TestSymbolTable() {
1631     // Multiple test cases can be set up here.  Each test case
1632     // is terminated by null:
1633     // var, value, var, value,..., input pat., exp. output pat., null
1634     const char* DATA[] = {
1635         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1636         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1637         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1638         NULL
1639     };
1640 
1641     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1642         UErrorCode ec = U_ZERO_ERROR;
1643         TokenSymbolTable sym(ec);
1644         if (U_FAILURE(ec)) {
1645             errln("FAIL: couldn't construct TokenSymbolTable");
1646             continue;
1647         }
1648 
1649         // Set up variables
1650         while (DATA[i+2] != NULL) {
1651             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1652             if (U_FAILURE(ec)) {
1653                 errln("FAIL: couldn't add to TokenSymbolTable");
1654                 continue;
1655             }
1656             i += 2;
1657         }
1658 
1659         // Input pattern and expected output pattern
1660         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1661         i += 2;
1662 
1663         ParsePosition pos(0);
1664         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1665         if (U_FAILURE(ec)) {
1666             errln("FAIL: couldn't construct UnicodeSet");
1667             continue;
1668         }
1669 
1670         // results
1671         if (pos.getIndex() != inpat.length()) {
1672             errln((UnicodeString)"Failed to read to end of string \""
1673                   + inpat + "\": read to "
1674                   + pos.getIndex() + ", length is "
1675                   + inpat.length());
1676         }
1677 
1678         UnicodeSet us2(exppat, ec);
1679         if (U_FAILURE(ec)) {
1680             errln("FAIL: couldn't construct expected UnicodeSet");
1681             continue;
1682         }
1683 
1684         UnicodeString a, b;
1685         if (us != us2) {
1686             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1687                   ", expected " + us2.toPattern(b, TRUE));
1688         } else {
1689             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1690         }
1691     }
1692 }
1693 
TestSurrogate()1694 void UnicodeSetTest::TestSurrogate() {
1695     const char* DATA[] = {
1696         // These should all behave identically
1697         "[abc\\uD800\\uDC00]",
1698         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1699         "[abc\\U00010000]",
1700         0
1701     };
1702     for (int i=0; DATA[i] != 0; ++i) {
1703         UErrorCode ec = U_ZERO_ERROR;
1704         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1705         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1706         UnicodeSet set(str, ec);
1707         if (U_FAILURE(ec)) {
1708             errln("FAIL: UnicodeSet constructor");
1709             continue;
1710         }
1711         expectContainment(set,
1712                           CharsToUnicodeString("abc\\U00010000"),
1713                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1714         if (set.size() != 4) {
1715             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1716                   set.size() + ", expected 4");
1717         }
1718     }
1719 }
1720 
TestExhaustive()1721 void UnicodeSetTest::TestExhaustive() {
1722     // exhaustive tests. Simulate UnicodeSets with integers.
1723     // That gives us very solid tests (except for large memory tests).
1724 
1725     int32_t limit = 128;
1726 
1727     UnicodeSet x, y, z, aa;
1728 
1729     for (int32_t i = 0; i < limit; ++i) {
1730         bitsToSet(i, x);
1731         logln((UnicodeString)"Testing " + i + ", " + x);
1732         _testComplement(i, x, y);
1733 
1734         // AS LONG AS WE ARE HERE, check roundtrip
1735         checkRoundTrip(bitsToSet(i, aa));
1736 
1737         for (int32_t j = 0; j < limit; ++j) {
1738             _testAdd(i,j,  x,y,z);
1739             _testXor(i,j,  x,y,z);
1740             _testRetain(i,j,  x,y,z);
1741             _testRemove(i,j,  x,y,z);
1742         }
1743     }
1744 }
1745 
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1746 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1747     bitsToSet(a, x);
1748     z = x;
1749     z.complement();
1750     int32_t c = setToBits(z);
1751     if (c != (~a)) {
1752         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1753         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1754     }
1755     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1756 }
1757 
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1758 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1759     bitsToSet(a, x);
1760     bitsToSet(b, y);
1761     z = x;
1762     z.addAll(y);
1763     int32_t c = setToBits(z);
1764     if (c != (a | b)) {
1765         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1766         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1767     }
1768     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1769 }
1770 
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1771 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1772     bitsToSet(a, x);
1773     bitsToSet(b, y);
1774     z = x;
1775     z.retainAll(y);
1776     int32_t c = setToBits(z);
1777     if (c != (a & b)) {
1778         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1779         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1780     }
1781     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1782 }
1783 
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1784 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1785     bitsToSet(a, x);
1786     bitsToSet(b, y);
1787     z = x;
1788     z.removeAll(y);
1789     int32_t c = setToBits(z);
1790     if (c != (a &~ b)) {
1791         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1792         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1793     }
1794     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1795 }
1796 
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1797 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1798     bitsToSet(a, x);
1799     bitsToSet(b, y);
1800     z = x;
1801     z.complementAll(y);
1802     int32_t c = setToBits(z);
1803     if (c != (a ^ b)) {
1804         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1805         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1806     }
1807     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1808 }
1809 
1810 /**
1811  * Check that ranges are monotonically increasing and non-
1812  * overlapping.
1813  */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1814 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1815     int32_t n = set.getRangeCount();
1816     if (n < 0) {
1817         errln((UnicodeString)"FAIL result of " + msg +
1818               ": range count should be >= 0 but is " +
1819               n /*+ " for " + set.toPattern())*/);
1820         return;
1821     }
1822     UChar32 last = 0;
1823     for (int32_t i=0; i<n; ++i) {
1824         UChar32 start = set.getRangeStart(i);
1825         UChar32 end = set.getRangeEnd(i);
1826         if (start > end) {
1827             errln((UnicodeString)"FAIL result of " + msg +
1828                   ": range " + (i+1) +
1829                   " start > end: " + (int)start + ", " + (int)end +
1830                   " for " + set);
1831         }
1832         if (i > 0 && start <= last) {
1833             errln((UnicodeString)"FAIL result of " + msg +
1834                   ": range " + (i+1) +
1835                   " overlaps previous range: " + (int)start + ", " + (int)end +
1836                   " for " + set);
1837         }
1838         last = end;
1839     }
1840 }
1841 
1842 /**
1843  * Convert a bitmask to a UnicodeSet.
1844  */
bitsToSet(int32_t a,UnicodeSet & result)1845 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1846     result.clear();
1847     for (UChar32 i = 0; i < 32; ++i) {
1848         if ((a & (1<<i)) != 0) {
1849             result.add(i);
1850         }
1851     }
1852     return result;
1853 }
1854 
1855 /**
1856  * Convert a UnicodeSet to a bitmask.  Only the characters
1857  * U+0000 to U+0020 are represented in the bitmask.
1858  */
setToBits(const UnicodeSet & x)1859 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1860     int32_t result = 0;
1861     for (int32_t i = 0; i < 32; ++i) {
1862         if (x.contains((UChar32)i)) {
1863             result |= (1<<i);
1864         }
1865     }
1866     return result;
1867 }
1868 
1869 /**
1870  * Return the representation of an inversion list based UnicodeSet
1871  * as a pairs list.  Ranges are listed in ascending Unicode order.
1872  * For example, the set [a-zA-M3] is represented as "33AMaz".
1873  */
getPairs(const UnicodeSet & set)1874 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1875     UnicodeString pairs;
1876     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1877         UChar32 start = set.getRangeStart(i);
1878         UChar32 end = set.getRangeEnd(i);
1879         if (end > 0xFFFF) {
1880             end = 0xFFFF;
1881             i = set.getRangeCount(); // Should be unnecessary
1882         }
1883         pairs.append((UChar)start).append((UChar)end);
1884     }
1885     return pairs;
1886 }
1887 
1888 /**
1889  * Basic consistency check for a few items.
1890  * That the iterator works, and that we can create a pattern and
1891  * get the same thing back
1892  */
checkRoundTrip(const UnicodeSet & s)1893 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1894     UErrorCode ec = U_ZERO_ERROR;
1895 
1896     UnicodeSet t(s);
1897     checkEqual(s, t, "copy ct");
1898 
1899     t = s;
1900     checkEqual(s, t, "operator=");
1901 
1902     copyWithIterator(t, s, FALSE);
1903     checkEqual(s, t, "iterator roundtrip");
1904 
1905     copyWithIterator(t, s, TRUE); // try range
1906     checkEqual(s, t, "iterator roundtrip");
1907 
1908     UnicodeString pat; s.toPattern(pat, FALSE);
1909     t.applyPattern(pat, ec);
1910     if (U_FAILURE(ec)) {
1911         errln("FAIL: applyPattern");
1912         return;
1913     } else {
1914         checkEqual(s, t, "toPattern(false)");
1915     }
1916 
1917     s.toPattern(pat, TRUE);
1918     t.applyPattern(pat, ec);
1919     if (U_FAILURE(ec)) {
1920         errln("FAIL: applyPattern");
1921         return;
1922     } else {
1923         checkEqual(s, t, "toPattern(true)");
1924     }
1925 }
1926 
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)1927 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1928     t.clear();
1929     UnicodeSetIterator it(s);
1930     if (withRange) {
1931         while (it.nextRange()) {
1932             if (it.isString()) {
1933                 t.add(it.getString());
1934             } else {
1935                 t.add(it.getCodepoint(), it.getCodepointEnd());
1936             }
1937         }
1938     } else {
1939         while (it.next()) {
1940             if (it.isString()) {
1941                 t.add(it.getString());
1942             } else {
1943                 t.add(it.getCodepoint());
1944             }
1945         }
1946     }
1947 }
1948 
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)1949 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1950     UnicodeString source; s.toPattern(source, TRUE);
1951     UnicodeString result; t.toPattern(result, TRUE);
1952     if (s != t) {
1953         errln((UnicodeString)"FAIL: " + message
1954               + "; source = " + source
1955               + "; result = " + result
1956               );
1957         return FALSE;
1958     } else {
1959         logln((UnicodeString)"Ok: " + message
1960               + "; source = " + source
1961               + "; result = " + result
1962               );
1963     }
1964     return TRUE;
1965 }
1966 
1967 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)1968 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1969                                   const UnicodeString& charsIn,
1970                                   const UnicodeString& charsOut) {
1971     UErrorCode ec = U_ZERO_ERROR;
1972     UnicodeSet set(pat, ec);
1973     if (U_FAILURE(ec)) {
1974         dataerrln((UnicodeString)"FAIL: pattern \"" +
1975               pat + "\" => " + u_errorName(ec));
1976         return;
1977     }
1978     expectContainment(set, pat, charsIn, charsOut);
1979 }
1980 
1981 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)1982 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1983                                   const UnicodeString& charsIn,
1984                                   const UnicodeString& charsOut) {
1985     UnicodeString pat;
1986     set.toPattern(pat);
1987     expectContainment(set, pat, charsIn, charsOut);
1988 }
1989 
1990 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)1991 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1992                                   const UnicodeString& setName,
1993                                   const UnicodeString& charsIn,
1994                                   const UnicodeString& charsOut) {
1995     UnicodeString bad;
1996     UChar32 c;
1997     int32_t i;
1998 
1999     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2000         c = charsIn.char32At(i);
2001         if (!set.contains(c)) {
2002             bad.append(c);
2003         }
2004     }
2005     if (bad.length() > 0) {
2006         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2007               ", expected containment of " + prettify(charsIn));
2008     } else {
2009         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2010     }
2011 
2012     bad.truncate(0);
2013     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2014         c = charsOut.char32At(i);
2015         if (set.contains(c)) {
2016             bad.append(c);
2017         }
2018     }
2019     if (bad.length() > 0) {
2020         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2021               ", expected non-containment of " + prettify(charsOut));
2022     } else {
2023         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2024     }
2025 }
2026 
2027 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2028 UnicodeSetTest::expectPattern(UnicodeSet& set,
2029                               const UnicodeString& pattern,
2030                               const UnicodeString& expectedPairs){
2031     UErrorCode status = U_ZERO_ERROR;
2032     set.applyPattern(pattern, status);
2033     if (U_FAILURE(status)) {
2034         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2035               "\") failed");
2036         return;
2037     } else {
2038         if (getPairs(set) != expectedPairs ) {
2039             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2040                   "\") => pairs \"" +
2041                   escape(getPairs(set)) + "\", expected \"" +
2042                   escape(expectedPairs) + "\"");
2043         } else {
2044             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2045                   "\") => pairs \"" +
2046                   escape(getPairs(set)) + "\"");
2047         }
2048     }
2049     // the result of calling set.toPattern(), which is the string representation of
2050     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2051     // will produce another set that is equal to this one.
2052     UnicodeString temppattern;
2053     set.toPattern(temppattern);
2054     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2055     if (U_FAILURE(status)) {
2056         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2057         return;
2058     }
2059     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2060         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2061             escape(getPairs(set)) + "\""));
2062     } else{
2063         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2064     }
2065 
2066     delete tempset;
2067 
2068 }
2069 
2070 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2071 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2072     if (getPairs(set) != expectedPairs) {
2073         errln(UnicodeString("FAIL: Expected pair list \"") +
2074               escape(expectedPairs) + "\", got \"" +
2075               escape(getPairs(set)) + "\"");
2076     }
2077 }
2078 
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2079 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2080                                      const UnicodeString& expPat,
2081                                      const char** expStrings) {
2082     UnicodeString pat;
2083     set.toPattern(pat, TRUE);
2084     if (pat == expPat) {
2085         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2086     } else {
2087         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2088         return;
2089     }
2090     if (expStrings == NULL) {
2091         return;
2092     }
2093     UBool in = TRUE;
2094     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2095         if (expStrings[i] == NOT) { // sic; pointer comparison
2096             in = FALSE;
2097             continue;
2098         }
2099         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2100         UBool contained = set.contains(s);
2101         if (contained == in) {
2102             logln((UnicodeString)"Ok: " + expPat +
2103                   (contained ? " contains {" : " does not contain {") +
2104                   escape(expStrings[i]) + "}");
2105         } else {
2106             errln((UnicodeString)"FAIL: " + expPat +
2107                   (contained ? " contains {" : " does not contain {") +
2108                   escape(expStrings[i]) + "}");
2109         }
2110     }
2111 }
2112 
toHexString(int32_t i)2113 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2114 
2115 void
doAssert(UBool condition,const char * message)2116 UnicodeSetTest::doAssert(UBool condition, const char *message)
2117 {
2118     if (!condition) {
2119         errln(UnicodeString("ERROR : ") + message);
2120     }
2121 }
2122 
2123 UnicodeString
escape(const UnicodeString & s)2124 UnicodeSetTest::escape(const UnicodeString& s) {
2125     UnicodeString buf;
2126     for (int32_t i=0; i<s.length(); )
2127     {
2128         UChar32 c = s.char32At(i);
2129         if (0x0020 <= c && c <= 0x007F) {
2130             buf += c;
2131         } else {
2132             if (c <= 0xFFFF) {
2133                 buf += (UChar)0x5c; buf += (UChar)0x75;
2134             } else {
2135                 buf += (UChar)0x5c; buf += (UChar)0x55;
2136                 buf += toHexString((c & 0xF0000000) >> 28);
2137                 buf += toHexString((c & 0x0F000000) >> 24);
2138                 buf += toHexString((c & 0x00F00000) >> 20);
2139                 buf += toHexString((c & 0x000F0000) >> 16);
2140             }
2141             buf += toHexString((c & 0xF000) >> 12);
2142             buf += toHexString((c & 0x0F00) >> 8);
2143             buf += toHexString((c & 0x00F0) >> 4);
2144             buf += toHexString(c & 0x000F);
2145         }
2146         i += U16_LENGTH(c);
2147     }
2148     return buf;
2149 }
2150 
TestFreezable()2151 void UnicodeSetTest::TestFreezable() {
2152     UErrorCode errorCode=U_ZERO_ERROR;
2153     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2154     UnicodeSet idSet(idPattern, errorCode);
2155     if(U_FAILURE(errorCode)) {
2156         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2157         return;
2158     }
2159 
2160     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2161     UnicodeSet wsSet(wsPattern, errorCode);
2162     if(U_FAILURE(errorCode)) {
2163         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2164         return;
2165     }
2166 
2167     idSet.add(idPattern);
2168     UnicodeSet frozen(idSet);
2169     frozen.freeze();
2170 
2171     if(idSet.isFrozen() || !frozen.isFrozen()) {
2172         errln("FAIL: isFrozen() is wrong");
2173     }
2174     if(frozen!=idSet || !(frozen==idSet)) {
2175         errln("FAIL: a copy-constructed frozen set differs from its original");
2176     }
2177 
2178     frozen=wsSet;
2179     if(frozen!=idSet || !(frozen==idSet)) {
2180         errln("FAIL: a frozen set was modified by operator=");
2181     }
2182 
2183     UnicodeSet frozen2(frozen);
2184     if(frozen2!=frozen || frozen2!=idSet) {
2185         errln("FAIL: a copied frozen set differs from its frozen original");
2186     }
2187     if(!frozen2.isFrozen()) {
2188         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2189     }
2190     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2191     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2192         errln("FAIL: UnicodeSet(5, 55) failed");
2193     }
2194     frozen3=frozen;
2195     if(!frozen3.isFrozen()) {
2196         errln("FAIL: copying a frozen set results in a thawed one");
2197     }
2198 
2199     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2200     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2201         errln("FAIL: clone() failed");
2202     }
2203     cloned->add(0xd802, 0xd805);
2204     if(cloned->containsSome(0xd802, 0xd805)) {
2205         errln("FAIL: unable to modify clone");
2206     }
2207     delete cloned;
2208 
2209     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2210     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2211         errln("FAIL: cloneAsThawed() failed");
2212     }
2213     thawed->add(0xd802, 0xd805);
2214     if(!thawed->contains(0xd802, 0xd805)) {
2215         errln("FAIL: unable to modify thawed clone");
2216     }
2217     delete thawed;
2218 
2219     frozen.set(5, 55);
2220     if(frozen!=idSet || !(frozen==idSet)) {
2221         errln("FAIL: UnicodeSet::set() modified a frozen set");
2222     }
2223 
2224     frozen.clear();
2225     if(frozen!=idSet || !(frozen==idSet)) {
2226         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2227     }
2228 
2229     frozen.closeOver(USET_CASE_INSENSITIVE);
2230     if(frozen!=idSet || !(frozen==idSet)) {
2231         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2232     }
2233 
2234     frozen.compact();
2235     if(frozen!=idSet || !(frozen==idSet)) {
2236         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2237     }
2238 
2239     ParsePosition pos;
2240     frozen.
2241         applyPattern(wsPattern, errorCode).
2242         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2243         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2244         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2245         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2246     if(frozen!=idSet || !(frozen==idSet)) {
2247         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2248     }
2249 
2250     frozen.
2251         add(0xd800).
2252         add(0xd802, 0xd805).
2253         add(wsPattern).
2254         addAll(idPattern).
2255         addAll(wsSet);
2256     if(frozen!=idSet || !(frozen==idSet)) {
2257         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2258     }
2259 
2260     frozen.
2261         retain(0x62).
2262         retain(0x64, 0x69).
2263         retainAll(wsPattern).
2264         retainAll(wsSet);
2265     if(frozen!=idSet || !(frozen==idSet)) {
2266         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2267     }
2268 
2269     frozen.
2270         remove(0x62).
2271         remove(0x64, 0x69).
2272         remove(idPattern).
2273         removeAll(idPattern).
2274         removeAll(idSet);
2275     if(frozen!=idSet || !(frozen==idSet)) {
2276         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2277     }
2278 
2279     frozen.
2280         complement().
2281         complement(0x62).
2282         complement(0x64, 0x69).
2283         complement(idPattern).
2284         complementAll(idPattern).
2285         complementAll(idSet);
2286     if(frozen!=idSet || !(frozen==idSet)) {
2287         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2288     }
2289 }
2290 
2291 // Test span() etc. -------------------------------------------------------- ***
2292 
2293 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2294 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2295 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2296     UErrorCode errorCode=U_ZERO_ERROR;
2297     int32_t length8=0;
2298     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2299     if(U_SUCCESS(errorCode)) {
2300         return length8;
2301     } else {
2302         // The string contains an unpaired surrogate.
2303         // Ignore this string.
2304         return 0;
2305     }
2306 }
2307 
2308 class UnicodeSetWithStringsIterator;
2309 
2310 // Make the strings in a UnicodeSet easily accessible.
2311 class UnicodeSetWithStrings {
2312 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2313     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2314             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2315         int32_t size=set.size();
2316         if(size>0 && set.charAt(size-1)<0) {
2317             // If a set's last element is not a code point, then it must contain strings.
2318             // Iterate over the set, skip all code point ranges, and cache the strings.
2319             // Convert them to UTF-8 for spanUTF8().
2320             UnicodeSetIterator iter(set);
2321             const UnicodeString *s;
2322             char *s8=utf8;
2323             int32_t length8, utf8Count=0;
2324             while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2325                 if(iter.isString()) {
2326                     // Store the pointer to the set's string element
2327                     // which we happen to know is a stable pointer.
2328                     strings[stringsLength]=s=&iter.getString();
2329                     utf8Count+=
2330                         utf8Lengths[stringsLength]=length8=
2331                         appendUTF8(s->getBuffer(), s->length(),
2332                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2333                     if(length8==0) {
2334                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2335                     }
2336                     s8+=length8;
2337                     ++stringsLength;
2338                 }
2339             }
2340         }
2341     }
2342 
getSet() const2343     const UnicodeSet &getSet() const {
2344         return set;
2345     }
2346 
hasStrings() const2347     UBool hasStrings() const {
2348         return (UBool)(stringsLength>0);
2349     }
2350 
hasStringsWithSurrogates() const2351     UBool hasStringsWithSurrogates() const {
2352         return hasSurrogates;
2353     }
2354 
2355 private:
2356     friend class UnicodeSetWithStringsIterator;
2357 
2358     const UnicodeSet &set;
2359 
2360     const UnicodeString *strings[20];
2361     int32_t stringsLength;
2362     UBool hasSurrogates;
2363 
2364     char utf8[1024];
2365     int32_t utf8Lengths[20];
2366 
2367     int32_t nextStringIndex;
2368     int32_t nextUTF8Start;
2369 };
2370 
2371 class UnicodeSetWithStringsIterator {
2372 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2373     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2374             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2375     }
2376 
reset()2377     void reset() {
2378         nextStringIndex=nextUTF8Start=0;
2379     }
2380 
nextString()2381     const UnicodeString *nextString() {
2382         if(nextStringIndex<fSet.stringsLength) {
2383             return fSet.strings[nextStringIndex++];
2384         } else {
2385             return NULL;
2386         }
2387     }
2388 
2389     // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2390     const char *nextUTF8(int32_t &length) {
2391         if(nextStringIndex<fSet.stringsLength) {
2392             const char *s8=fSet.utf8+nextUTF8Start;
2393             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2394             return s8;
2395         } else {
2396             length=0;
2397             return NULL;
2398         }
2399     }
2400 
2401 private:
2402     const UnicodeSetWithStrings &fSet;
2403     int32_t nextStringIndex;
2404     int32_t nextUTF8Start;
2405 };
2406 
2407 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2408 // at code point boundaries.
2409 // That is, each edge of a match must not be in the middle of a surrogate pair.
2410 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2411 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2412     s+=start;
2413     limit-=start;
2414     int32_t length=t.length();
2415     return 0==t.compare(s, length) &&
2416            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2417            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2418 }
2419 
2420 // Implement span() with contains() for comparison.
// Returns how many UTF-16 code units at the start of s[0..length) are spanned under
// spanCondition, using only UnicodeSet::contains() and explicit matching of the
// set's strings. Three cases are handled separately:
//   - the set has no strings: a plain per-code point contains() loop;
//   - USET_SPAN_NOT_CONTAINED with strings: stop at the first contained code point
//     or at the first position where a set string matches;
//   - USET_SPAN_CONTAINED / USET_SPAN_SIMPLE with strings: follow code point and
//     string matches, recursing for alternatives (CONTAINED) or taking the
//     longest match at each position (SIMPLE).
// length is used as an explicit count here; getSpans() resolves a negative length
// with slen() before calling this function.
containsSpanUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2421 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2422                                  USetSpanCondition spanCondition) {
2423     const UnicodeSet &realSet(set.getSet());
2424     if(!set.hasStrings()) {
2425         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2426             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2427         }
2428 
2429         UChar32 c;
2430         int32_t start=0, prev;
2431         while((prev=start)<length) {  // prev = start of the code point read next
2432             U16_NEXT(s, start, length, c);  // reads c and advances start past it
2433             if(realSet.contains(c)!=spanCondition) {
2434                 break;  // c ends the span; prev still points at its first unit
2435             }
2436         }
2437         return prev;
2438     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2439         UnicodeSetWithStringsIterator iter(set);
2440         UChar32 c;
2441         int32_t start, next;
2442         for(start=next=0; start<length;) {
2443             U16_NEXT(s, next, length, c);
2444             if(realSet.contains(c)) {
2445                 break;  // A contained code point ends the not-contained span.
2446             }
2447             const UnicodeString *str;
2448             iter.reset();
2449             while((str=iter.nextString())!=NULL) {
2450                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2451                     // spanNeedsStrings=TRUE;
2452                     return start;  // A set string matching at start also ends the span.
2453                 }
2454             }
2455             start=next;
2456         }
2457         return start;
2458     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2459         UnicodeSetWithStringsIterator iter(set);
2460         UChar32 c;
2461         int32_t start, next, maxSpanLimit=0;  // maxSpanLimit: best limit found via recursion
2462         for(start=next=0; start<length;) {
2463             U16_NEXT(s, next, length, c);
2464             if(!realSet.contains(c)) {
2465                 next=start;  // Do not span this single, not-contained code point.
2466             }
2467             const UnicodeString *str;
2468             iter.reset();
2469             while((str=iter.nextString())!=NULL) {
2470                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2471                     // spanNeedsStrings=TRUE;
2472                     int32_t matchLimit=start+str->length();
2473                     if(matchLimit==length) {
2474                         return length;  // The match reaches the end of the text.
2475                     }
2476                     if(spanCondition==USET_SPAN_CONTAINED) {
2477                         // Iterate for the shortest match at each position.
2478                         // Recurse for each but the shortest match.
2479                         if(next==start) {
2480                             next=matchLimit;  // First match from start.
2481                         } else {
2482                             if(matchLimit<next) {
2483                                 // Remember shortest match from start for iteration.
2484                                 int32_t temp=next;
2485                                 next=matchLimit;
2486                                 matchLimit=temp;
2487                             }
2488                             // Recurse for non-shortest match from start.
2489                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2490                                                                  USET_SPAN_CONTAINED);
2491                             if((matchLimit+spanLength)>maxSpanLimit) {
2492                                 maxSpanLimit=matchLimit+spanLength;
2493                                 if(maxSpanLimit==length) {
2494                                     return length;
2495                                 }
2496                             }
2497                         }
2498                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2499                         if(matchLimit>next) {
2500                             // Remember longest match from start.
2501                             next=matchLimit;
2502                         }
2503                     }
2504                 }
2505             }
2506             if(next==start) {
2507                 break;  // No match from start.
2508             }
2509             start=next;
2510         }
2511         if(start>maxSpanLimit) {  // Return the farther of the iterated and the recursive result.
2512             return start;
2513         } else {
2514             return maxSpanLimit;
2515         }
2516     }
2517 }
2518 
// Backward counterpart of containsSpanUTF16(): spans backward from s[length]
// using UnicodeSet::contains() and explicit matching of the set's strings.
// Returns the index in s at which the spanned suffix begins; returns 0
// immediately for length==0. The parameter length doubles as the moving
// cursor (U16_PREV decrements it), and prev holds the cursor value from
// before the current step.
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2519 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2520                                      USetSpanCondition spanCondition) {
2521     if(length==0) {
2522         return 0;
2523     }
2524     const UnicodeSet &realSet(set.getSet());
2525     if(!set.hasStrings()) {
2526         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2527             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2528         }
2529 
2530         UChar32 c;
2531         int32_t prev=length;
2532         do {
2533             U16_PREV(s, 0, length, c);  // reads c and moves length back to its first unit
2534             if(realSet.contains(c)!=spanCondition) {
2535                 break;  // c ends the span; prev is still the index just after c
2536             }
2537         } while((prev=length)>0);
2538         return prev;
2539     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2540         UnicodeSetWithStringsIterator iter(set);
2541         UChar32 c;
2542         int32_t prev=length, length0=length;  // length0: original limit, passed to matches16CPB()
2543         do {
2544             U16_PREV(s, 0, length, c);
2545             if(realSet.contains(c)) {
2546                 break;  // A contained code point ends the not-contained span.
2547             }
2548             const UnicodeString *str;
2549             iter.reset();
2550             while((str=iter.nextString())!=NULL) {
2551                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2552                     // spanNeedsStrings=TRUE;
2553                     return prev;  // A set string ending at prev also ends the span.
2554                 }
2555             }
2556         } while((prev=length)>0);
2557         return prev;
2558     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2559         UnicodeSetWithStringsIterator iter(set);
2560         UChar32 c;
2561         int32_t prev=length, minSpanStart=length, length0=length;  // minSpanStart: best start found via recursion
2562         do {
2563             U16_PREV(s, 0, length, c);
2564             if(!realSet.contains(c)) {
2565                 length=prev;  // Do not span this single, not-contained code point.
2566             }
2567             const UnicodeString *str;
2568             iter.reset();
2569             while((str=iter.nextString())!=NULL) {
2570                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2571                     // spanNeedsStrings=TRUE;
2572                     int32_t matchStart=prev-str->length();
2573                     if(matchStart==0) {
2574                         return 0;  // The match reaches the start of the text.
2575                     }
2576                     if(spanCondition==USET_SPAN_CONTAINED) {
2577                         // Iterate for the shortest match at each position.
2578                         // Recurse for each but the shortest match.
2579                         if(length==prev) {
2580                             length=matchStart;  // First match from prev.
2581                         } else {
2582                             if(matchStart>length) {
2583                                 // Remember shortest match from prev for iteration.
2584                                 int32_t temp=length;
2585                                 length=matchStart;
2586                                 matchStart=temp;
2587                             }
2588                             // Recurse for non-shortest match from prev.
2589                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2590                                                                     USET_SPAN_CONTAINED);
2591                             if(spanStart<minSpanStart) {
2592                                 minSpanStart=spanStart;
2593                                 if(minSpanStart==0) {
2594                                     return 0;
2595                                 }
2596                             }
2597                         }
2598                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2599                         if(matchStart<length) {
2600                             // Remember longest match from prev.
2601                             length=matchStart;
2602                         }
2603                     }
2604                 }
2605             }
2606             if(length==prev) {
2607                 break;  // No match from prev.
2608             }
2609         } while((prev=length)>0);
2610         if(prev<minSpanStart) {  // Return the smaller of the iterated and the recursive result.
2611             return prev;
2612         } else {
2613             return minSpanStart;
2614         }
2615     }
2616 }
2617 
// UTF-8 counterpart of containsSpanUTF16(): returns how many bytes at the start
// of s[0..length) are spanned under spanCondition, using UnicodeSet::contains()
// and byte-wise comparison (memcmp) against the UTF-8 forms of the set's strings.
// A negative c from U8_NEXT is replaced with U+FFFD before the contains() check.
// NOTE(review): the length8!=0 tests presumably skip set strings that have no
// UTF-8 form in the iterator - confirm against UnicodeSetWithStringsIterator::nextUTF8().
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2618 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2619                                 USetSpanCondition spanCondition) {
2620     const UnicodeSet &realSet(set.getSet());
2621     if(!set.hasStrings()) {
2622         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2623             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2624         }
2625 
2626         UChar32 c;
2627         int32_t start=0, prev;
2628         while((prev=start)<length) {  // prev = start of the sequence read next
2629             U8_NEXT(s, start, length, c);  // reads c and advances start past it
2630             if(c<0) {
2631                 c=0xfffd;  // Test U+FFFD in place of a sequence that U8_NEXT reported as c<0.
2632             }
2633             if(realSet.contains(c)!=spanCondition) {
2634                 break;
2635             }
2636         }
2637         return prev;
2638     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2639         UnicodeSetWithStringsIterator iter(set);
2640         UChar32 c;
2641         int32_t start, next;
2642         for(start=next=0; start<length;) {
2643             U8_NEXT(s, next, length, c);
2644             if(c<0) {
2645                 c=0xfffd;
2646             }
2647             if(realSet.contains(c)) {
2648                 break;  // A contained code point ends the not-contained span.
2649             }
2650             const char *s8;
2651             int32_t length8;
2652             iter.reset();
2653             while((s8=iter.nextUTF8(length8))!=NULL) {
2654                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2655                     // spanNeedsStrings=TRUE;
2656                     return start;  // A set string matching at start also ends the span.
2657                 }
2658             }
2659             start=next;
2660         }
2661         return start;
2662     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2663         UnicodeSetWithStringsIterator iter(set);
2664         UChar32 c;
2665         int32_t start, next, maxSpanLimit=0;  // maxSpanLimit: best limit found via recursion
2666         for(start=next=0; start<length;) {
2667             U8_NEXT(s, next, length, c);
2668             if(c<0) {
2669                 c=0xfffd;
2670             }
2671             if(!realSet.contains(c)) {
2672                 next=start;  // Do not span this single, not-contained code point.
2673             }
2674             const char *s8;
2675             int32_t length8;
2676             iter.reset();
2677             while((s8=iter.nextUTF8(length8))!=NULL) {
2678                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2679                     // spanNeedsStrings=TRUE;
2680                     int32_t matchLimit=start+length8;
2681                     if(matchLimit==length) {
2682                         return length;  // The match reaches the end of the text.
2683                     }
2684                     if(spanCondition==USET_SPAN_CONTAINED) {
2685                         // Iterate for the shortest match at each position.
2686                         // Recurse for each but the shortest match.
2687                         if(next==start) {
2688                             next=matchLimit;  // First match from start.
2689                         } else {
2690                             if(matchLimit<next) {
2691                                 // Remember shortest match from start for iteration.
2692                                 int32_t temp=next;
2693                                 next=matchLimit;
2694                                 matchLimit=temp;
2695                             }
2696                             // Recurse for non-shortest match from start.
2697                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2698                                                                 USET_SPAN_CONTAINED);
2699                             if((matchLimit+spanLength)>maxSpanLimit) {
2700                                 maxSpanLimit=matchLimit+spanLength;
2701                                 if(maxSpanLimit==length) {
2702                                     return length;
2703                                 }
2704                             }
2705                         }
2706                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2707                         if(matchLimit>next) {
2708                             // Remember longest match from start.
2709                             next=matchLimit;
2710                         }
2711                     }
2712                 }
2713             }
2714             if(next==start) {
2715                 break;  // No match from start.
2716             }
2717             start=next;
2718         }
2719         if(start>maxSpanLimit) {  // Return the farther of the iterated and the recursive result.
2720             return start;
2721         } else {
2722             return maxSpanLimit;
2723         }
2724     }
2725 }
2726 
// Backward UTF-8 counterpart of containsSpanUTF8(): spans backward from s[length]
// using UnicodeSet::contains() and memcmp() against the UTF-8 forms of the set's
// strings. Returns the byte index at which the spanned suffix begins; returns 0
// immediately for length==0. The parameter length doubles as the moving cursor
// (U8_PREV decrements it), and prev holds the cursor value from before the
// current step. A negative c from U8_PREV is replaced with U+FFFD.
// NOTE(review): the length8!=0 tests presumably skip set strings that have no
// UTF-8 form in the iterator - confirm against UnicodeSetWithStringsIterator::nextUTF8().
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2727 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2728                                     USetSpanCondition spanCondition) {
2729     if(length==0) {
2730         return 0;
2731     }
2732     const UnicodeSet &realSet(set.getSet());
2733     if(!set.hasStrings()) {
2734         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2735             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2736         }
2737 
2738         UChar32 c;
2739         int32_t prev=length;
2740         do {
2741             U8_PREV(s, 0, length, c);  // reads c and moves length back to its first byte
2742             if(c<0) {
2743                 c=0xfffd;  // Test U+FFFD in place of a sequence that U8_PREV reported as c<0.
2744             }
2745             if(realSet.contains(c)!=spanCondition) {
2746                 break;  // c ends the span; prev is still the index just after c
2747             }
2748         } while((prev=length)>0);
2749         return prev;
2750     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2751         UnicodeSetWithStringsIterator iter(set);
2752         UChar32 c;
2753         int32_t prev=length;
2754         do {
2755             U8_PREV(s, 0, length, c);
2756             if(c<0) {
2757                 c=0xfffd;
2758             }
2759             if(realSet.contains(c)) {
2760                 break;  // A contained code point ends the not-contained span.
2761             }
2762             const char *s8;
2763             int32_t length8;
2764             iter.reset();
2765             while((s8=iter.nextUTF8(length8))!=NULL) {
2766                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2767                     // spanNeedsStrings=TRUE;
2768                     return prev;  // A set string ending at prev also ends the span.
2769                 }
2770             }
2771         } while((prev=length)>0);
2772         return prev;
2773     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2774         UnicodeSetWithStringsIterator iter(set);
2775         UChar32 c;
2776         int32_t prev=length, minSpanStart=length;  // minSpanStart: best start found via recursion
2777         do {
2778             U8_PREV(s, 0, length, c);
2779             if(c<0) {
2780                 c=0xfffd;
2781             }
2782             if(!realSet.contains(c)) {
2783                 length=prev;  // Do not span this single, not-contained code point.
2784             }
2785             const char *s8;
2786             int32_t length8;
2787             iter.reset();
2788             while((s8=iter.nextUTF8(length8))!=NULL) {
2789                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2790                     // spanNeedsStrings=TRUE;
2791                     int32_t matchStart=prev-length8;
2792                     if(matchStart==0) {
2793                         return 0;  // The match reaches the start of the text.
2794                     }
2795                     if(spanCondition==USET_SPAN_CONTAINED) {
2796                         // Iterate for the shortest match at each position.
2797                         // Recurse for each but the shortest match.
2798                         if(length==prev) {
2799                             length=matchStart;  // First match from prev.
2800                         } else {
2801                             if(matchStart>length) {
2802                                 // Remember shortest match from prev for iteration.
2803                                 int32_t temp=length;
2804                                 length=matchStart;
2805                                 matchStart=temp;
2806                             }
2807                             // Recurse for non-shortest match from prev.
2808                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2809                                                                    USET_SPAN_CONTAINED);
2810                             if(spanStart<minSpanStart) {
2811                                 minSpanStart=spanStart;
2812                                 if(minSpanStart==0) {
2813                                     return 0;
2814                                 }
2815                             }
2816                         }
2817                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2818                         if(matchStart<length) {
2819                             // Remember longest match from prev.
2820                             length=matchStart;
2821                         }
2822                     }
2823                 }
2824             }
2825             if(length==prev) {
2826                 break;  // No match from prev.
2827             }
2828         } while((prev=length)>0);
2829         if(prev<minSpanStart) {  // Return the smaller of the iterated and the recursive result.
2830             return prev;
2831         } else {
2832             return minSpanStart;
2833         }
2834     }
2835 }
2836 
// Bit flags that select which spans are performed and compared.
// Each group is followed by a mask that combines the group's flags.
enum {
    // Text encodings to test.
    SPAN_UTF16          =0x1,
    SPAN_UTF8           =0x2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original set and/or its complement.
    SPAN_SET            =0x4,
    SPAN_COMPLEMENT     =0x8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Span directions.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Which "contained" span condition to use.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Everything.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2857 
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2858 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2859     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2860 }
2861 
slen(const void * s,UBool isUTF16)2862 static inline int32_t slen(const void *s, UBool isUTF16) {
2863     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2864 }
2865 
2866 /*
2867  * Count spans on a string with the method according to type and set the span limits.
2868  * The set may be the complement of the original.
2869  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2870  * according to the expected number of spans.
2871  * Sets typeName to an empty string if there is no such type.
2872  * Returns -1 if the span option is filtered out.
2873  */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2874 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2875                         const void *s, int32_t length, UBool isUTF16,
2876                         uint32_t whichSpans,
2877                         int type, const char *&typeName,
2878                         int32_t limits[], int32_t limitsCapacity,
2879                         int32_t expectCount) {
2880     const UnicodeSet &realSet(set.getSet());
2881     int32_t start, count;
2882     USetSpanCondition spanCondition, firstSpanCondition, contained;
2883     UBool isForward;
2884 
2885     if(type<0 || 7<type) {
2886         typeName="";
2887         return 0;
2888     }
2889 
2890     static const char *const typeNames16[]={
2891         "contains", "contains(LM)",
2892         "span", "span(LM)",
2893         "containsBack", "containsBack(LM)",
2894         "spanBack", "spanBack(LM)"
2895     };
2896 
2897     static const char *const typeNames8[]={
2898         "containsUTF8", "containsUTF8(LM)",
2899         "spanUTF8", "spanUTF8(LM)",
2900         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2901         "spanBackUTF8", "spanBackUTF8(LM)"
2902     };
2903 
2904     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2905 
2906     // filter span options
2907     if(type<=3) {
2908         // span forward
2909         if((whichSpans&SPAN_FWD)==0) {
2910             return -1;
2911         }
2912         isForward=TRUE;
2913     } else {
2914         // span backward
2915         if((whichSpans&SPAN_BACK)==0) {
2916             return -1;
2917         }
2918         isForward=FALSE;
2919     }
2920     if((type&1)==0) {
2921         // use USET_SPAN_CONTAINED
2922         if((whichSpans&SPAN_CONTAINED)==0) {
2923             return -1;
2924         }
2925         contained=USET_SPAN_CONTAINED;
2926     } else {
2927         // use USET_SPAN_SIMPLE
2928         if((whichSpans&SPAN_SIMPLE)==0) {
2929             return -1;
2930         }
2931         contained=USET_SPAN_SIMPLE;
2932     }
2933 
2934     // Default first span condition for going forward with an uncomplemented set.
2935     spanCondition=USET_SPAN_NOT_CONTAINED;
2936     if(isComplement) {
2937         spanCondition=invertSpanCondition(spanCondition, contained);
2938     }
2939 
2940     // First span condition for span(), used to terminate the spanBack() iteration.
2941     firstSpanCondition=spanCondition;
2942 
2943     // spanBack(): Its initial span condition is span()'s last span condition,
2944     // which is the opposite of span()'s first span condition
2945     // if we expect an even number of spans.
2946     // (The loop inverts spanCondition (expectCount-1) times
2947     // before the expectCount'th span() call.)
2948     // If we do not compare forward and backward directions, then we do not have an
2949     // expectCount and just start with firstSpanCondition.
2950     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2951         spanCondition=invertSpanCondition(spanCondition, contained);
2952     }
2953 
2954     count=0;
2955     switch(type) {
2956     case 0:
2957     case 1:
2958         start=0;
2959         if(length<0) {
2960             length=slen(s, isUTF16);
2961         }
2962         for(;;) {
2963             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2964                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2965             if(count<limitsCapacity) {
2966                 limits[count]=start;
2967             }
2968             ++count;
2969             if(start>=length) {
2970                 break;
2971             }
2972             spanCondition=invertSpanCondition(spanCondition, contained);
2973         }
2974         break;
2975     case 2:
2976     case 3:
2977         start=0;
2978         for(;;) {
2979             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2980                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2981             if(count<limitsCapacity) {
2982                 limits[count]=start;
2983             }
2984             ++count;
2985             if(length>=0 ? start>=length :
2986                            isUTF16 ? ((const UChar *)s)[start]==0 :
2987                                      ((const char *)s)[start]==0
2988             ) {
2989                 break;
2990             }
2991             spanCondition=invertSpanCondition(spanCondition, contained);
2992         }
2993         break;
2994     case 4:
2995     case 5:
2996         if(length<0) {
2997             length=slen(s, isUTF16);
2998         }
2999         for(;;) {
3000             ++count;
3001             if(count<=limitsCapacity) {
3002                 limits[limitsCapacity-count]=length;
3003             }
3004             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3005                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3006             if(length==0 && spanCondition==firstSpanCondition) {
3007                 break;
3008             }
3009             spanCondition=invertSpanCondition(spanCondition, contained);
3010         }
3011         if(count<limitsCapacity) {
3012             memmove(limits, limits+(limitsCapacity-count), count*4);
3013         }
3014         break;
3015     case 6:
3016     case 7:
3017         for(;;) {
3018             ++count;
3019             if(count<=limitsCapacity) {
3020                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3021             }
3022             // Note: Length<0 is tested only for the first spanBack().
3023             // If we wanted to keep length<0 for all spanBack()s, we would have to
3024             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3025             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3026                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
3027             if(length==0 && spanCondition==firstSpanCondition) {
3028                 break;
3029             }
3030             spanCondition=invertSpanCondition(spanCondition, contained);
3031         }
3032         if(count<limitsCapacity) {
3033             memmove(limits, limits+(limitsCapacity-count), count*4);
3034         }
3035         break;
3036     default:
3037         typeName="";
3038         return -1;
3039     }
3040 
3041     return count;
3042 }
3043 
// sets to be tested; odd index=isComplement
enum {
    SLOW      =0,
    SLOW_NOT  =1,
    FAST      =2,
    FAST_NOT  =3,
    SET_COUNT =4
};

// Display names for the sets, indexed by the enum constants above.
static const char *const setNames[SET_COUNT]={ "slow", "slow.not", "fast", "fast.not" };
3059 
3060 /*
3061  * Verify that we get the same results whether we look at text with contains(),
3062  * span() or spanBack(), using unfrozen or frozen versions of the set,
3063  * and using the set or its complement (switching the spanConditions accordingly).
3064  * The latter verifies that
3065  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3066  *
3067  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3068  * or returned to the caller (with an input expectCount<0).
3069  */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3070 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3071                               const void *s, int32_t length, UBool isUTF16,
3072                               uint32_t whichSpans,
3073                               int32_t expectLimits[], int32_t &expectCount,
3074                               const char *testName, int32_t index) {
3075     int32_t limits[500];
3076     int32_t limitsCount;
3077     int i, j;
3078 
3079     const char *typeName;
3080     int type;
3081 
3082     for(i=0; i<SET_COUNT; ++i) {
3083         if((i&1)==0) {
3084             // Even-numbered sets are original, uncomplemented sets.
3085             if((whichSpans&SPAN_SET)==0) {
3086                 continue;
3087             }
3088         } else {
3089             // Odd-numbered sets are complemented.
3090             if((whichSpans&SPAN_COMPLEMENT)==0) {
3091                 continue;
3092             }
3093         }
3094         for(type=0;; ++type) {
3095             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3096                                  s, length, isUTF16,
3097                                  whichSpans,
3098                                  type, typeName,
3099                                  limits, LENGTHOF(limits), expectCount);
3100             if(typeName[0]==0) {
3101                 break; // All types tried.
3102             }
3103             if(limitsCount<0) {
3104                 continue; // Span option filtered out.
3105             }
3106             if(expectCount<0) {
3107                 expectCount=limitsCount;
3108                 if(limitsCount>LENGTHOF(limits)) {
3109                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3110                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
3111                     return;
3112                 }
3113                 memcpy(expectLimits, limits, limitsCount*4);
3114             } else if(limitsCount!=expectCount) {
3115                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3116                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3117             } else {
3118                 for(j=0; j<limitsCount; ++j) {
3119                     if(limits[j]!=expectLimits[j]) {
3120                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3121                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3122                               j, (long)limits[j], (long)expectLimits[j]);
3123                         break;
3124                     }
3125                 }
3126             }
3127         }
3128     }
3129 
3130     // Compare span() with containsAll()/containsNone(),
3131     // but only if we have expectLimits[] from the uncomplemented set.
3132     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3133         const UChar *s16=(const UChar *)s;
3134         UnicodeString string;
3135         int32_t prev=0, limit, length;
3136         for(i=0; i<expectCount; ++i) {
3137             limit=expectLimits[i];
3138             length=limit-prev;
3139             if(length>0) {
3140                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3141                 if(i&1) {
3142                     if(!sets[SLOW]->getSet().containsAll(string)) {
3143                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3144                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3145                         return;
3146                     }
3147                     if(!sets[FAST]->getSet().containsAll(string)) {
3148                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3149                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3150                         return;
3151                     }
3152                 } else {
3153                     if(!sets[SLOW]->getSet().containsNone(string)) {
3154                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3155                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3156                         return;
3157                     }
3158                     if(!sets[FAST]->getSet().containsNone(string)) {
3159                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3160                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3161                         return;
3162                     }
3163                 }
3164             }
3165             prev=limit;
3166         }
3167     }
3168 }
3169 
3170 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3171 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3172                               const void *s, int32_t length, UBool isUTF16,
3173                               uint32_t whichSpans,
3174                               const char *testName, int32_t index) {
3175     int32_t expectLimits[500];
3176     int32_t expectCount=-1;
3177     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3178 }
3179 
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3180 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3181     UChar c, c2;
3182 
3183     if(length>=0) {
3184         while(length>0) {
3185             c=*s++;
3186             --length;
3187             if(0xd800<=c && c<0xe000) {
3188                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3189                     return TRUE;
3190                 }
3191                 --length;
3192             }
3193         }
3194     } else {
3195         while((c=*s++)!=0) {
3196             if(0xd800<=c && c<0xe000) {
3197                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3198                     return TRUE;
3199                 }
3200             }
3201         }
3202     }
3203     return FALSE;
3204 }
3205 
3206 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3207 // unless either UTF is turned off in whichSpans.
3208 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3209 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3210 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3211                                       const UChar *s16, int32_t length16,
3212                                       uint32_t whichSpans,
3213                                       const char *testName, int32_t index) {
3214     int32_t expectLimits[500];
3215     int32_t expectCount;
3216 
3217     expectCount=-1;  // Get expectLimits[] from testSpan().
3218 
3219     if((whichSpans&SPAN_UTF16)!=0) {
3220         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3221     }
3222     if((whichSpans&SPAN_UTF8)==0) {
3223         return;
3224     }
3225 
3226     // Convert s16[] and expectLimits[] to UTF-8.
3227     uint8_t s8[3000];
3228     int32_t offsets[3000];
3229 
3230     const UChar *s16Limit=s16+length16;
3231     char *t=(char *)s8;
3232     char *tLimit=t+sizeof(s8);
3233     int32_t *o=offsets;
3234     UErrorCode errorCode=U_ZERO_ERROR;
3235 
3236     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3237     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3238     if(U_FAILURE(errorCode)) {
3239         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3240               testName, (long)index, u_errorName(errorCode));
3241         ucnv_resetFromUnicode(utf8Cnv);
3242         return;
3243     }
3244     int32_t length8=(int32_t)(t-(char *)s8);
3245 
3246     // Convert expectLimits[].
3247     int32_t i, j, expect;
3248     for(i=j=0; i<expectCount; ++i) {
3249         expect=expectLimits[i];
3250         if(expect==length16) {
3251             expectLimits[i]=length8;
3252         } else {
3253             while(offsets[j]<expect) {
3254                 ++j;
3255             }
3256             expectLimits[i]=j;
3257         }
3258     }
3259 
3260     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3261 }
3262 
nextCodePoint(UChar32 c)3263 static UChar32 nextCodePoint(UChar32 c) {
3264     // Skip some large and boring ranges.
3265     switch(c) {
3266     case 0x3441:
3267         return 0x4d7f;
3268     case 0x5100:
3269         return 0x9f00;
3270     case 0xb040:
3271         return 0xd780;
3272     case 0xe041:
3273         return 0xf8fe;
3274     case 0x10100:
3275         return 0x20000;
3276     case 0x20041:
3277         return 0xe0000;
3278     case 0xe0101:
3279         return 0x10fffd;
3280     default:
3281         return c+1;
3282     }
3283 }
3284 
3285 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3286 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3287     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3288     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3289     // Skip the UTF-8 part of the test - if the string contains surrogates -
3290     // because it is likely to produce a different result.
3291     UBool inconsistentSurrogates=
3292             (!(sets[0]->getSet().contains(0xfffd) ?
3293                sets[0]->getSet().contains(0xd800, 0xdfff) :
3294                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3295              sets[0]->hasStringsWithSurrogates());
3296 
3297     UChar s[1000];
3298     int32_t length=0;
3299     uint32_t localWhichSpans;
3300 
3301     UChar32 c, first;
3302     for(first=c=0;; c=nextCodePoint(c)) {
3303         if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3304             localWhichSpans=whichSpans;
3305             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3306                 localWhichSpans&=~SPAN_UTF8;
3307             }
3308             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3309             if(c>0x10ffff) {
3310                 break;
3311             }
3312             length=0;
3313             first=c;
3314         }
3315         U16_APPEND_UNSAFE(s, length, c);
3316     }
3317 }
3318 
3319 // Test with a particular, interesting string.
3320 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3321 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3322     static const UChar s[]={
3323         0x61, 0x62, 0x20,                       // Latin, space
3324         0x3b1, 0x3b2, 0x3b3,                    // Greek
3325         0xd900,                                 // lead surrogate
3326         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3327         0xdc05,                                 // trail surrogate
3328         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3329         0xd900, 0xdc05,                         // unassigned supplementary
3330         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3331         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3332         0                                       // NUL
3333     };
3334 
3335     if((whichSpans&SPAN_UTF16)==0) {
3336         return;
3337     }
3338     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3339     testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3340 }
3341 
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3342 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3343     static const char s[]={
3344         "abc"                                   // Latin
3345 
3346         /* trail byte in lead position */
3347         "\x80"
3348 
3349         " "                                     // space
3350 
3351         /* truncated multi-byte sequences */
3352         "\xd0"
3353         "\xe0"
3354         "\xe1"
3355         "\xed"
3356         "\xee"
3357         "\xf0"
3358         "\xf1"
3359         "\xf4"
3360         "\xf8"
3361         "\xfc"
3362 
3363         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3364 
3365         /* trail byte in lead position */
3366         "\x80"
3367 
3368         "\xe0\x80"
3369         "\xe0\xa0"
3370         "\xe1\x80"
3371         "\xed\x80"
3372         "\xed\xa0"
3373         "\xee\x80"
3374         "\xf0\x80"
3375         "\xf0\x90"
3376         "\xf1\x80"
3377         "\xf4\x80"
3378         "\xf4\x90"
3379         "\xf8\x80"
3380         "\xfc\x80"
3381 
3382         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3383 
3384         /* trail byte in lead position */
3385         "\x80"
3386 
3387         "\xf0\x80\x80"
3388         "\xf0\x90\x80"
3389         "\xf1\x80\x80"
3390         "\xf4\x80\x80"
3391         "\xf4\x90\x80"
3392         "\xf8\x80\x80"
3393         "\xfc\x80\x80"
3394 
3395         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3396 
3397         /* trail byte in lead position */
3398         "\x80"
3399 
3400         "\xf8\x80\x80\x80"
3401         "\xfc\x80\x80\x80"
3402 
3403         "\xF1\x90\x80\x85"                      // unassigned supplementary
3404 
3405         /* trail byte in lead position */
3406         "\x80"
3407 
3408         "\xfc\x80\x80\x80\x80"
3409 
3410         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3411 
3412         /* trail byte in lead position */
3413         "\x80"
3414 
3415         /* complete sequences but non-shortest forms or out of range etc. */
3416         "\xc0\x80"
3417         "\xe0\x80\x80"
3418         "\xed\xa0\x80"
3419         "\xf0\x80\x80\x80"
3420         "\xf4\x90\x80\x80"
3421         "\xf8\x80\x80\x80\x80"
3422         "\xfc\x80\x80\x80\x80\x80"
3423         "\xfe"
3424         "\xff"
3425 
3426         /* trail byte in lead position */
3427         "\x80"
3428 
3429         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3430     };
3431 
3432     if((whichSpans&SPAN_UTF8)==0) {
3433         return;
3434     }
3435     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3436     testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3437 }
3438 
// Multiply a list of span option sets so that each resulting entry
// carries exactly one of the alternatives a, b and c.
//
// Every existing entry is first reduced with mask. Then
//   - the entry itself becomes (entry&mask)|a,
//   - if b!=0, a copy with |b is stored whichSpansCount slots further on,
//   - if b!=0 and c!=0, a copy with |c is stored 2*whichSpansCount slots further on.
// c is ignored when b==0.
//
// The caller must provide room for the added entries.
// Returns the new number of entries: 1x, 2x or 3x whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many variants each original entry turns into.
    int32_t variantCount;
    if(b==0) {
        variantCount=1;
    } else if(c==0) {
        variantCount=2;
    } else {
        variantCount=3;
    }
    const uint32_t variants[3]={ a, b, c };

    for(int32_t entry=0; entry<whichSpansCount; ++entry) {
        // Read the original value before slot [entry] is overwritten (k==0).
        const uint32_t base=whichSpans[entry]&mask;
        for(int32_t k=0; k<variantCount; ++k) {
            whichSpans[k*whichSpansCount+entry]=base|variants[k];
        }
    }
    return variantCount*whichSpansCount;
}
3461 
// String-literal building blocks for TestSpan(): runs of 63 and 64 copies of
// 'a' and of 'b'. They are meant to be used via adjacent string literal
// concatenation. Each 64-run is the matching 63-run plus one more character.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a _63_a "a"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b _63_b "b"
3466 
TestSpan()3467 void UnicodeSetTest::TestSpan() {
3468     // "[...]" is a UnicodeSet pattern.
3469     // "*" performs tests on all Unicode code points and on a selection of
3470     //   malformed UTF-8/16 strings.
3471     // "-options" limits the scope of testing for the current set.
3472     //   By default, the test verifies that equivalent boundaries are found
3473     //   for UTF-16 and UTF-8, going forward and backward,
3474     //   alternating USET_SPAN_NOT_CONTAINED with
3475     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3476     //   Single-character options:
3477     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3478     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3479     //          or the set contains strings with unpaired surrogates
3480     //          which do not translate to valid UTF-8.
3481     //     c -- set.span() and set.complement().span() boundaries may differ.
3482     //          Cause: Set strings are not complemented.
3483     //     b -- span() and spanBack() boundaries may differ.
3484     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3485     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3486     //          match with non-overlapping substrings.
3487     //          For example, with a set containing "ab" and "ba",
3488     //          span() of "aba" yields boundaries { 0, 2, 3 }
3489     //          because the initial "ab" matches from 0 to 2,
3490     //          while spanBack() yields boundaries { 0, 1, 3 }
3491     //          because the final "ba" matches from 1 to 3.
3492     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3493     //          Cause: Strings in the set overlap, and a longer match may
3494     //          require a sequence including non-longest substrings.
3495     //          For example, with a set containing "ab", "abc" and "cd",
3496     //          span(contained) of "abcd" spans the entire string
3497     //          but span(longest match) only spans the first 3 characters.
3498     //   Each "-options" first resets all options and then applies the specified options.
3499     //   A "-" without options resets the options.
3500     //   The options are also reset for each new set.
3501     // Other strings will be spanned.
3502     static const char *const testdata[]={
3503         "[:ID_Continue:]",
3504         "*",
3505         "[:White_Space:]",
3506         "*",
3507         "[]",
3508         "*",
3509         "[\\u0000-\\U0010FFFF]",
3510         "*",
3511         "[\\u0000\\u0080\\u0800\\U00010000]",
3512         "*",
3513         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3514         "*",
3515         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3516         "-c",
3517         "*",
3518         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3519         "-c",
3520         "*",
3521 
3522         // Overlapping strings cause overlapping attempts to match.
3523         "[x{xy}{xya}{axy}{ax}]",
3524         "-cl",
3525 
3526         // More repetitions of "xya" would take too long with the recursive
3527         // reference implementation.
3528         // containsAll()=FALSE
3529         // test_string 0x14
3530         "xx"
3531         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3532         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3533         "xyaxyaxyaxya"
3534         "xx"
3535         "xyaxyaxyaxya"  // span() ends here.
3536         "aaa",
3537 
3538         // containsAll()=TRUE
3539         // test_string 0x15
3540         "xx"
3541         "xyaxyaxyaxya"
3542         "xx"
3543         "xyaxyaxyaxya"
3544         "xx"
3545         "xyaxyaxyaxy",
3546 
3547         "-bc",
3548         // test_string 0x17
3549         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3550         "-c",
3551         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3552         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3553         "-",
3554         "byaya",     // span() -> { 5 }
3555         "byay",      // span() -> { 4 }
3556         "bya",       // span() -> { 3 }
3557 
3558         // span(longest match) will not span the whole string.
3559         "[a{ab}{bc}]",
3560         "-cl",
3561         // test_string 0x21
3562         "abc",
3563 
3564         "[a{ab}{abc}{cd}]",
3565         "-cl",
3566         "acdabcdabccd",
3567 
3568         // spanBack(longest match) will not span the whole string.
3569         "[c{ab}{bc}]",
3570         "-cl",
3571         "abc",
3572 
3573         "[d{cd}{bcd}{ab}]",
3574         "-cl",
3575         "abbcdabcdabd",
3576 
3577         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3578         // and UTF-8 trail bytes.
3579         // Copies of above test sets and strings, but transliterated to have
3580         // different code points with similar trail units.
3581         // Previous: a      b         c            d
3582         // Unicode:  042B   30AB      200AB        204AB
3583         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3584         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3585         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3586         "-cl",
3587         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3588 
3589         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3590         "-cl",
3591         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3592 
3593         // Stress bookkeeping and recursion.
3594         // The following strings are barely doable with the recursive
3595         // reference implementation.
3596         // The not-contained character at the end prevents an early exit from the span().
3597         "[b{bb}]",
3598         "-c",
3599         // test_string 0x33
3600         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3601         // On complement sets, span() and spanBack() get different results
3602         // because b is not in the complement set and there is an odd number of b's
3603         // in the test string.
3604         "-bc",
3605         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3606 
3607         // Test with set strings with an initial or final code point span
3608         // longer than 254.
3609         "[a{" _64_a _64_a _64_a _64_a "b}"
3610           "{a" _64_b _64_b _64_b _64_b "}]",
3611         "-c",
3612         _64_a _64_a _64_a _63_a "b",
3613         _64_a _64_a _64_a _64_a "b",
3614         _64_a _64_a _64_a _64_a "aaaabbbb",
3615         "a" _64_b _64_b _64_b _63_b,
3616         "a" _64_b _64_b _64_b _64_b,
3617         "aaaabbbb" _64_b _64_b _64_b _64_b,
3618 
3619         // Test with strings containing unpaired surrogates.
3620         // They are not representable in UTF-8, and a leading trail surrogate
3621         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3622         // U+20001 == \\uD840\\uDC01
3623         // U+20400 == \\uD841\\uDC00
3624         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3625         "-8cl",
3626         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3627     };
3628     uint32_t whichSpans[96]={ SPAN_ALL };
3629     int32_t whichSpansCount=1;
3630 
3631     UnicodeSet *sets[SET_COUNT]={ NULL };
3632     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3633 
3634     char testName[1024];
3635     char *testNameLimit=testName;
3636 
3637     int32_t i, j;
3638     for(i=0; i<LENGTHOF(testdata); ++i) {
3639         const char *s=testdata[i];
3640         if(s[0]=='[') {
3641             // Create new test sets from this pattern.
3642             for(j=0; j<SET_COUNT; ++j) {
3643                 delete sets_with_str[j];
3644                 delete sets[j];
3645             }
3646             UErrorCode errorCode=U_ZERO_ERROR;
3647             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3648             if(U_FAILURE(errorCode)) {
3649                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3650                 break;
3651             }
3652             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3653             sets[SLOW_NOT]->complement();
3654             // Intermediate set: Test cloning of a frozen set.
3655             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3656             fast->freeze();
3657             sets[FAST]=(UnicodeSet *)fast->clone();
3658             delete fast;
3659             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3660             fastNot->freeze();
3661             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3662             delete fastNot;
3663 
3664             for(j=0; j<SET_COUNT; ++j) {
3665                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3666             }
3667 
3668             strcpy(testName, s);
3669             testNameLimit=strchr(testName, 0);
3670             *testNameLimit++=':';
3671             *testNameLimit=0;
3672 
3673             whichSpans[0]=SPAN_ALL;
3674             whichSpansCount=1;
3675         } else if(s[0]=='-') {
3676             whichSpans[0]=SPAN_ALL;
3677             whichSpansCount=1;
3678 
3679             while(*++s!=0) {
3680                 switch(*s) {
3681                 case 'c':
3682                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3683                                                    ~SPAN_POLARITY,
3684                                                    SPAN_SET,
3685                                                    SPAN_COMPLEMENT,
3686                                                    0);
3687                     break;
3688                 case 'b':
3689                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3690                                                    ~SPAN_DIRS,
3691                                                    SPAN_FWD,
3692                                                    SPAN_BACK,
3693                                                    0);
3694                     break;
3695                 case 'l':
3696                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3697                     // USET_SPAN_SIMPLE only FWD, and separately
3698                     // USET_SPAN_SIMPLE only BACK
3699                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3700                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3701                                                    SPAN_DIRS|SPAN_CONTAINED,
3702                                                    SPAN_FWD|SPAN_SIMPLE,
3703                                                    SPAN_BACK|SPAN_SIMPLE);
3704                     break;
3705                 case '8':
3706                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3707                                                    ~SPAN_UTFS,
3708                                                    SPAN_UTF16,
3709                                                    SPAN_UTF8,
3710                                                    0);
3711                     break;
3712                 default:
3713                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3714                     break;
3715                 }
3716             }
3717         } else if(0==strcmp(s, "*")) {
3718             strcpy(testNameLimit, "bad_string");
3719             for(j=0; j<whichSpansCount; ++j) {
3720                 if(whichSpansCount>1) {
3721                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3722                             "%%0x%3x",
3723                             whichSpans[j]);
3724                 }
3725                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3726                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3727             }
3728 
3729             strcpy(testNameLimit, "contents");
3730             for(j=0; j<whichSpansCount; ++j) {
3731                 if(whichSpansCount>1) {
3732                     sprintf(testNameLimit+8 /* strlen("contents") */,
3733                             "%%0x%3x",
3734                             whichSpans[j]);
3735                 }
3736                 testSpanContents(sets_with_str, whichSpans[j], testName);
3737             }
3738         } else {
3739             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3740             strcpy(testNameLimit, "test_string");
3741             for(j=0; j<whichSpansCount; ++j) {
3742                 if(whichSpansCount>1) {
3743                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3744                             "%%0x%3x",
3745                             whichSpans[j]);
3746                 }
3747                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3748             }
3749         }
3750     }
3751     for(j=0; j<SET_COUNT; ++j) {
3752         delete sets_with_str[j];
3753         delete sets[j];
3754     }
3755 }
3756 
3757 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3758 void UnicodeSetTest::TestStringSpan() {
3759     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3760     static const char *const string=
3761         "xx"
3762         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3763         "xx"
3764         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3765         "xx"
3766         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3767         "aaaa";
3768 
3769     UErrorCode errorCode=U_ZERO_ERROR;
3770     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3771     UnicodeSet set(pattern16, errorCode);
3772     if(U_FAILURE(errorCode)) {
3773         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3774         return;
3775     }
3776 
3777     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3778 
3779     if(set.containsAll(string16)) {
3780         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3781     }
3782 
3783     // Remove trailing "aaaa".
3784     string16.truncate(string16.length()-4);
3785     if(!set.containsAll(string16)) {
3786         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3787     }
3788 
3789     string16=UNICODE_STRING_SIMPLE("byayaxya");
3790     const UChar *s16=string16.getBuffer();
3791     int32_t length16=string16.length();
3792     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3793         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3794         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3795         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3796         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3797         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3798     ) {
3799         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3800     }
3801 
3802     pattern="[a{ab}{abc}{cd}]";
3803     pattern16=UnicodeString(pattern, -1, US_INV);
3804     set.applyPattern(pattern16, errorCode);
3805     if(U_FAILURE(errorCode)) {
3806         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3807         return;
3808     }
3809     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3810     s16=string16.getBuffer();
3811     length16=string16.length();
3812     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3813         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3814         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3815     ) {
3816         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3817     }
3818 
3819     pattern="[d{cd}{bcd}{ab}]";
3820     pattern16=UnicodeString(pattern, -1, US_INV);
3821     set.applyPattern(pattern16, errorCode).freeze();
3822     if(U_FAILURE(errorCode)) {
3823         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3824         return;
3825     }
3826     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3827     s16=string16.getBuffer();
3828     length16=string16.length();
3829     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3830         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3831         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3832     ) {
3833         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3834     }
3835 }
3836