• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ********************************************************************************
3 *   Copyright (C) 1999-2007 International Business Machines Corporation and
4 *   others. All Rights Reserved.
5 ********************************************************************************
6 *   Date        Name        Description
7 *   10/20/99    alan        Creation.
8 *   03/22/2000  Madhu       Added additional tests
9 ********************************************************************************
10 */
11 
12 #include <stdio.h>
13 
14 #include <string.h>
15 #include "unicode/utypes.h"
16 #include "usettest.h"
17 #include "unicode/ucnv.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/usetiter.h"
21 #include "unicode/ustring.h"
22 #include "unicode/parsepos.h"
23 #include "unicode/symtable.h"
24 #include "unicode/uversion.h"
25 #include "hash.h"
26 
/** Number of elements in a true C array (not a pointer), as an int32_t. */
#define LENGTHOF(array) ((int32_t)(sizeof(array) / sizeof((array)[0])))
28 
/**
 * Reports a failure through errln(), naming the file, the line and the
 * ICU error, when status holds a failure code. Does nothing otherwise.
 * Calls errln() unqualified, so it must be expanded where errln is in scope.
 */
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        errln("fail in file \"%s\", line %d: \"%s\"", \
              __FILE__, __LINE__, u_errorName(status)); \
    } \
}
32 
/**
 * Reports a failure through errln(), naming the file and the line, when
 * expr evaluates to false. Calls errln() unqualified, so it must be
 * expanded where errln is in scope.
 */
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        errln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
35 
/**
 * Appends the escaped pattern of a set to a string, so that sets can be
 * concatenated directly into log and error messages.
 *
 * @param left text to which the pattern is appended
 * @param set  set whose toPattern() output is appended
 * @return left followed by the escaped pattern of set
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
41 
/**
 * Expands to one switch case of runIndexedTest(): stores the test's name
 * in 'name' and, when 'exec' is set, logs a header and runs the test.
 * The expansion ends in 'break' without a semicolon; the caller supplies it.
 */
#define CASE(id,test) \
    case id: \
        name = #test; \
        if (exec) { \
            logln(#test "---"); \
            logln(); \
            test(); \
        } \
        break
50 
UnicodeSetTest()51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52 }
53 
openUTF8Converter()54 UConverter *UnicodeSetTest::openUTF8Converter() {
55     if(utf8Cnv==NULL) {
56         UErrorCode errorCode=U_ZERO_ERROR;
57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
58     }
59     return utf8Cnv;
60 }
61 
~UnicodeSetTest()62 UnicodeSetTest::~UnicodeSetTest() {
63     ucnv_close(utf8Cnv);
64 }
65 
66 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68                                const char* &name, char* /*par*/) {
69     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
70     switch (index) {
71         CASE(0,TestPatterns);
72         CASE(1,TestAddRemove);
73         CASE(2,TestCategories);
74         CASE(3,TestCloneEqualHash);
75         CASE(4,TestMinimalRep);
76         CASE(5,TestAPI);
77         CASE(6,TestScriptSet);
78         CASE(7,TestPropertySet);
79         CASE(8,TestClone);
80         CASE(9,TestExhaustive);
81         CASE(10,TestToPattern);
82         CASE(11,TestIndexOf);
83         CASE(12,TestStrings);
84         CASE(13,Testj2268);
85         CASE(14,TestCloseOver);
86         CASE(15,TestEscapePattern);
87         CASE(16,TestInvalidCodePoint);
88         CASE(17,TestSymbolTable);
89         CASE(18,TestSurrogate);
90         CASE(19,TestPosixClasses);
91         CASE(20,TestIteration);
92         CASE(21,TestFreezable);
93         CASE(22,TestSpan);
94         CASE(23,TestStringSpan);
95         default: name = ""; break;
96     }
97 }
98 
// Marker placed inside the expected-string lists passed to
// expectToPattern() (see TestToPattern).
static const char NOT[] = "%%%%";
100 
101 /**
102  * UVector was improperly copying contents
103  * This code will crash this is still true
104  */
Testj2268()105 void UnicodeSetTest::Testj2268() {
106   UnicodeSet t;
107   t.add(UnicodeString("abc"));
108   UnicodeSet test(t);
109   UnicodeString ustrPat;
110   test.toPattern(ustrPat, TRUE);
111 }
112 
113 /**
114  * Test toPattern().
115  */
TestToPattern()116 void UnicodeSetTest::TestToPattern() {
117     UErrorCode ec = U_ZERO_ERROR;
118 
119     // Test that toPattern() round trips with syntax characters and
120     // whitespace.
121     {
122         static const char* OTHER_TOPATTERN_TESTS[] = {
123             "[[:latin:]&[:greek:]]",
124             "[[:latin:]-[:greek:]]",
125             "[:nonspacing mark:]",
126             NULL
127         };
128 
129         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
130             ec = U_ZERO_ERROR;
131             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
132             if (U_FAILURE(ec)) {
133                 errln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j]);
134                 continue;
135             }
136             checkPat(OTHER_TOPATTERN_TESTS[j], s);
137         }
138 
139         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
140             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
141 
142                 // check various combinations to make sure they all work.
143                 if (i != 0 && !toPatternAux(i, i)){
144                     continue;
145                 }
146                 if (!toPatternAux(0, i)){
147                     continue;
148                 }
149                 if (!toPatternAux(i, 0xFFFF)){
150                     continue;
151                 }
152             }
153         }
154     }
155 
156     // Test pattern behavior of multicharacter strings.
157     {
158         ec = U_ZERO_ERROR;
159         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
160 
161         // This loop isn't a loop.  It's here to make the compiler happy.
162         // If you're curious, try removing it and changing the 'break'
163         // statements (except for the last) to goto's.
164         for (;;) {
165             if (U_FAILURE(ec)) break;
166             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
167             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
168 
169             s->add("ac");
170             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
171             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
172 
173             s->applyPattern("[a-z {\\{l} {r\\}}]", ec);
174             if (U_FAILURE(ec)) break;
175             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
176             expectToPattern(*s, "[a-z{r\\}}{\\{l}]", exp3);
177 
178             s->add("[]");
179             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
180             expectToPattern(*s, "[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
181 
182             s->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
183             if (U_FAILURE(ec)) break;
184             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
185             expectToPattern(*s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
186 
187             // j2189
188             s->clear();
189             s->add(UnicodeString("abc", ""));
190             s->add(UnicodeString("abc", ""));
191             const char* exp6[] = {"abc", NOT, "ab", NULL};
192             expectToPattern(*s, "[{abc}]", exp6);
193 
194             break;
195         }
196 
197         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
198         delete s;
199     }
200 
201     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
202     UnicodeSet s;
203     s.add((UChar)97, (UChar)98); // 'a', 'b'
204     expectToPattern(s, "[ab]", NULL);
205 }
206 
toPatternAux(UChar32 start,UChar32 end)207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
208 
209     // use Integer.toString because Utility.hex doesn't handle ints
210     UnicodeString pat = "";
211     // TODO do these in hex
212     //String source = "0x" + Integer.toString(start,16).toUpperCase();
213     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
214     UnicodeString source;
215     source = source + (uint32_t)start;
216     if (start != end)
217         source = source + ".." + (uint32_t)end;
218     UnicodeSet testSet;
219     testSet.add(start, end);
220     return checkPat(source, testSet);
221 }
222 
checkPat(const UnicodeString & source,const UnicodeSet & testSet)223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
224                                const UnicodeSet& testSet) {
225     // What we want to make sure of is that a pattern generated
226     // by toPattern(), with or without escaped unprintables, can
227     // be passed back into the UnicodeSet constructor.
228     UnicodeString pat0;
229 
230     testSet.toPattern(pat0, TRUE);
231 
232     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
233 
234     //String pat1 = unescapeLeniently(pat0);
235     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
236 
237     UnicodeString pat2;
238     testSet.toPattern(pat2, FALSE);
239     if (!checkPat(source, testSet, pat2)) return FALSE;
240 
241     //String pat3 = unescapeLeniently(pat2);
242     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
243 
244     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
245     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
246     return TRUE;
247 }
248 
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
250                                const UnicodeSet& testSet,
251                                const UnicodeString& pat) {
252     UErrorCode ec = U_ZERO_ERROR;
253     UnicodeSet testSet2(pat, ec);
254     if (testSet2 != testSet) {
255         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
256         return FALSE;
257     }
258     return TRUE;
259 }
260 
261 void
TestPatterns(void)262 UnicodeSetTest::TestPatterns(void) {
263     UnicodeSet set;
264     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
265     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
266     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
267     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
268     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
269     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
270 
271     // Throw in a test of complement
272     set.complement();
273     UnicodeString exp;
274     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
275     expectPairs(set, exp);
276 }
277 
278 void
TestCategories(void)279 UnicodeSetTest::TestCategories(void) {
280     UErrorCode status = U_ZERO_ERROR;
281     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
282     UnicodeSet set(pat, status);
283     if (U_FAILURE(status)) {
284         errln((UnicodeString)"Fail: Can't construct set with " + pat);
285     } else {
286         expectContainment(set, pat, "ABC", "abc");
287     }
288 
289     UChar32 i;
290     int32_t failures = 0;
291     // Make sure generation of L doesn't pollute cached Lu set
292     // First generate L, then Lu
293     set.applyPattern("[:L:]", status);
294     if (U_FAILURE(status)) { errln("FAIL"); return; }
295     for (i=0; i<0x200; ++i) {
296         UBool l = u_isalpha((UChar)i);
297         if (l != set.contains(i)) {
298             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
299                   set.contains(i));
300             if (++failures == 10) break;
301         }
302     }
303 
304     set.applyPattern("[:Lu:]", status);
305     if (U_FAILURE(status)) { errln("FAIL"); return; }
306     for (i=0; i<0x200; ++i) {
307         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
308         if (lu != set.contains(i)) {
309             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
310                   set.contains(i));
311             if (++failures == 20) break;
312         }
313     }
314 }
315 void
TestCloneEqualHash(void)316 UnicodeSetTest::TestCloneEqualHash(void) {
317     UErrorCode status = U_ZERO_ERROR;
318     // set1 and set2 used to be built with the obsolete constructor taking
319     // UCharCategory values; replaced with pattern constructors
320     // markus 20030502
321     UnicodeSet *set1=new UnicodeSet("\\p{Lowercase Letter}", status); //  :Ll: Letter, lowercase
322     UnicodeSet *set1a=new UnicodeSet("[:Ll:]", status); //  Letter, lowercase
323     if (U_FAILURE(status)){
324         errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
325         return;
326     }
327     UnicodeSet *set2=new UnicodeSet("\\p{Decimal Number}", status);   //Number, Decimal digit
328     UnicodeSet *set2a=new UnicodeSet("[:Nd:]", status);   //Number, Decimal digit
329     if (U_FAILURE(status)){
330         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
331         return;
332     }
333 
334     if (*set1 != *set1a) {
335         errln("FAIL: category constructor for Ll broken");
336     }
337     if (*set2 != *set2a) {
338         errln("FAIL: category constructor for Nd broken");
339     }
340     delete set1a;
341     delete set2a;
342 
343     logln("Testing copy construction");
344     UnicodeSet *set1copy=new UnicodeSet(*set1);
345     if(*set1 != *set1copy || *set1 == *set2 ||
346         getPairs(*set1) != getPairs(*set1copy) ||
347         set1->hashCode() != set1copy->hashCode()){
348         errln("FAIL : Error in copy construction");
349         return;
350     }
351 
352     logln("Testing =operator");
353     UnicodeSet set1equal=*set1;
354     UnicodeSet set2equal=*set2;
355     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
356         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
357         errln("FAIL: Error in =operator");
358     }
359 
360     logln("Testing clone()");
361     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
362     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
363     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
364         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
365         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
366         errln("FAIL: Error in clone");
367     }
368 
369     logln("Testing hashcode");
370     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
371         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
372         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
373         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
374         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
375         errln("FAIL: Error in hashCode()");
376     }
377 
378     delete set1;
379     delete set1copy;
380     delete set2;
381     delete set1clone;
382     delete set2clone;
383 
384 
385 }
386 void
TestAddRemove(void)387 UnicodeSetTest::TestAddRemove(void) {
388     UnicodeSet set; // Construct empty set
389     doAssert(set.isEmpty() == TRUE, "set should be empty");
390     doAssert(set.size() == 0, "size should be 0");
391     set.complement();
392     doAssert(set.size() == 0x110000, "size should be 0x110000");
393     set.clear();
394     set.add(0x0061, 0x007a);
395     expectPairs(set, "az");
396     doAssert(set.isEmpty() == FALSE, "set should not be empty");
397     doAssert(set.size() != 0, "size should not be equal to 0");
398     doAssert(set.size() == 26, "size should be equal to 26");
399     set.remove(0x006d, 0x0070);
400     expectPairs(set, "alqz");
401     doAssert(set.size() == 22, "size should be equal to 22");
402     set.remove(0x0065, 0x0067);
403     expectPairs(set, "adhlqz");
404     doAssert(set.size() == 19, "size should be equal to 19");
405     set.remove(0x0064, 0x0069);
406     expectPairs(set, "acjlqz");
407     doAssert(set.size() == 16, "size should be equal to 16");
408     set.remove(0x0063, 0x0072);
409     expectPairs(set, "absz");
410     doAssert(set.size() == 10, "size should be equal to 10");
411     set.add(0x0066, 0x0071);
412     expectPairs(set, "abfqsz");
413     doAssert(set.size() == 22, "size should be equal to 22");
414     set.remove(0x0061, 0x0067);
415     expectPairs(set, "hqsz");
416     set.remove(0x0061, 0x007a);
417     expectPairs(set, "");
418     doAssert(set.isEmpty() == TRUE, "set should be empty");
419     doAssert(set.size() == 0, "size should be 0");
420     set.add(0x0061);
421     doAssert(set.isEmpty() == FALSE, "set should not be empty");
422     doAssert(set.size() == 1, "size should not be equal to 1");
423     set.add(0x0062);
424     set.add(0x0063);
425     expectPairs(set, "ac");
426     doAssert(set.size() == 3, "size should not be equal to 3");
427     set.add(0x0070);
428     set.add(0x0071);
429     expectPairs(set, "acpq");
430     doAssert(set.size() == 5, "size should not be equal to 5");
431     set.clear();
432     expectPairs(set, "");
433     doAssert(set.isEmpty() == TRUE, "set should be empty");
434     doAssert(set.size() == 0, "size should be 0");
435 
436     // Try removing an entire set from another set
437     expectPattern(set, "[c-x]", "cx");
438     UnicodeSet set2;
439     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
440     set.removeAll(set2);
441     expectPairs(set, "deluxx");
442 
443     // Try adding an entire set to another set
444     expectPattern(set, "[jackiemclean]", "aacceein");
445     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
446     set.addAll(set2);
447     expectPairs(set, "aacehort");
448     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
449 
450     // Try retaining an set of elements contained in another set (intersection)
451     UnicodeSet set3;
452     expectPattern(set3, "[a-c]", "ac");
453     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
454     set3.remove(0x0062);
455     expectPairs(set3, "aacc");
456     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
457     set.retainAll(set3);
458     expectPairs(set, "aacc");
459     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
460     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
461     set.clear();
462     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
463 
464     // Test commutativity
465     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
466     expectPattern(set2, "[jackiemclean]", "aacceein");
467     set.addAll(set2);
468     expectPairs(set, "aacehort");
469     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
470 
471 
472 
473 
474 }
475 
476 /**
477  * Make sure minimal representation is maintained.
478  */
TestMinimalRep()479 void UnicodeSetTest::TestMinimalRep() {
480     UErrorCode status = U_ZERO_ERROR;
481     // This is pretty thoroughly tested by checkCanonicalRep()
482     // run against the exhaustive operation results.  Use the code
483     // here for debugging specific spot problems.
484 
485     // 1 overlap against 2
486     UnicodeSet set("[h-km-q]", status);
487     if (U_FAILURE(status)) { errln("FAIL"); return; }
488     UnicodeSet set2("[i-o]", status);
489     if (U_FAILURE(status)) { errln("FAIL"); return; }
490     set.addAll(set2);
491     expectPairs(set, "hq");
492     // right
493     set.applyPattern("[a-m]", status);
494     if (U_FAILURE(status)) { errln("FAIL"); return; }
495     set2.applyPattern("[e-o]", status);
496     if (U_FAILURE(status)) { errln("FAIL"); return; }
497     set.addAll(set2);
498     expectPairs(set, "ao");
499     // left
500     set.applyPattern("[e-o]", status);
501     if (U_FAILURE(status)) { errln("FAIL"); return; }
502     set2.applyPattern("[a-m]", status);
503     if (U_FAILURE(status)) { errln("FAIL"); return; }
504     set.addAll(set2);
505     expectPairs(set, "ao");
506     // 1 overlap against 3
507     set.applyPattern("[a-eg-mo-w]", status);
508     if (U_FAILURE(status)) { errln("FAIL"); return; }
509     set2.applyPattern("[d-q]", status);
510     if (U_FAILURE(status)) { errln("FAIL"); return; }
511     set.addAll(set2);
512     expectPairs(set, "aw");
513 }
514 
TestAPI()515 void UnicodeSetTest::TestAPI() {
516     UErrorCode status = U_ZERO_ERROR;
517     // default ct
518     UnicodeSet set;
519     if (!set.isEmpty() || set.getRangeCount() != 0) {
520         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
521               set);
522     }
523 
524     // clear(), isEmpty()
525     set.add(0x0061);
526     if (set.isEmpty()) {
527         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
528               set);
529     }
530     set.clear();
531     if (!set.isEmpty()) {
532         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
533               set);
534     }
535 
536     // size()
537     set.clear();
538     if (set.size() != 0) {
539         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
540               ": " + set);
541     }
542     set.add(0x0061);
543     if (set.size() != 1) {
544         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
545               ": " + set);
546     }
547     set.add(0x0031, 0x0039);
548     if (set.size() != 10) {
549         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
550               ": " + set);
551     }
552 
553     // contains(first, last)
554     set.clear();
555     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
556     if (U_FAILURE(status)) { errln("FAIL"); return; }
557     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
558         UChar32 a = set.getRangeStart(i);
559         UChar32 b = set.getRangeEnd(i);
560         if (!set.contains(a, b)) {
561             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
562                   " but doesn't: " + set);
563         }
564         if (set.contains((UChar32)(a-1), b)) {
565             errln((UnicodeString)"FAIL, shouldn't contain " +
566                   (unsigned short)(a-1) + '-' + (unsigned short)b +
567                   " but does: " + set);
568         }
569         if (set.contains(a, (UChar32)(b+1))) {
570             errln((UnicodeString)"FAIL, shouldn't contain " +
571                   (unsigned short)a + '-' + (unsigned short)(b+1) +
572                   " but does: " + set);
573         }
574     }
575 
576     // Ported InversionList test.
577     UnicodeSet a((UChar32)3,(UChar32)10);
578     UnicodeSet b((UChar32)7,(UChar32)15);
579     UnicodeSet c;
580 
581     logln((UnicodeString)"a [3-10]: " + a);
582     logln((UnicodeString)"b [7-15]: " + b);
583     c = a;
584     c.addAll(b);
585     UnicodeSet exp((UChar32)3,(UChar32)15);
586     if (c == exp) {
587         logln((UnicodeString)"c.set(a).add(b): " + c);
588     } else {
589         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
590     }
591     c.complement();
592     exp.set((UChar32)0, (UChar32)2);
593     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
594     if (c == exp) {
595         logln((UnicodeString)"c.complement(): " + c);
596     } else {
597         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
598     }
599     c.complement();
600     exp.set((UChar32)3, (UChar32)15);
601     if (c == exp) {
602         logln((UnicodeString)"c.complement(): " + c);
603     } else {
604         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
605     }
606     c = a;
607     c.complementAll(b);
608     exp.set((UChar32)3,(UChar32)6);
609     exp.add((UChar32)11,(UChar32) 15);
610     if (c == exp) {
611         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
612     } else {
613         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
614     }
615 
616     exp = c;
617     bitsToSet(setToBits(c), c);
618     if (c == exp) {
619         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
620     } else {
621         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
622     }
623 
624     // Additional tests for coverage JB#2118
625     //UnicodeSet::complement(class UnicodeString const &)
626     //UnicodeSet::complementAll(class UnicodeString const &)
627     //UnicodeSet::containsNone(class UnicodeSet const &)
628     //UnicodeSet::containsNone(long,long)
629     //UnicodeSet::containsSome(class UnicodeSet const &)
630     //UnicodeSet::containsSome(long,long)
631     //UnicodeSet::removeAll(class UnicodeString const &)
632     //UnicodeSet::retain(long)
633     //UnicodeSet::retainAll(class UnicodeString const &)
634     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
635     //UnicodeSetIterator::getString(void)
636     set.clear();
637     set.complement("ab");
638     exp.applyPattern("[{ab}]", status);
639     if (U_FAILURE(status)) { errln("FAIL"); return; }
640     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
641 
642     UnicodeSetIterator iset(set);
643     if (!iset.next() || !iset.isString()) {
644         errln("FAIL: UnicodeSetIterator::next/isString");
645     } else if (iset.getString() != "ab") {
646         errln("FAIL: UnicodeSetIterator::getString");
647     }
648 
649     set.add((UChar32)0x61, (UChar32)0x7A);
650     set.complementAll("alan");
651     exp.applyPattern("[{ab}b-kmo-z]", status);
652     if (U_FAILURE(status)) { errln("FAIL"); return; }
653     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
654 
655     exp.applyPattern("[a-z]", status);
656     if (U_FAILURE(status)) { errln("FAIL"); return; }
657     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
658     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
659     exp.applyPattern("[aln]", status);
660     if (U_FAILURE(status)) { errln("FAIL"); return; }
661     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
662     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
663 
664     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
665         errln("FAIL: containsNone(UChar32, UChar32)");
666     }
667     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
668         errln("FAIL: containsSome(UChar32, UChar32)");
669     }
670     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
671         errln("FAIL: containsNone(UChar32, UChar32)");
672     }
673     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
674         errln("FAIL: containsSome(UChar32, UChar32)");
675     }
676 
677     set.removeAll("liu");
678     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
679     if (U_FAILURE(status)) { errln("FAIL"); return; }
680     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
681 
682     set.retainAll("star");
683     exp.applyPattern("[rst]", status);
684     if (U_FAILURE(status)) { errln("FAIL"); return; }
685     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
686 
687     set.retain((UChar32)0x73);
688     exp.applyPattern("[s]", status);
689     if (U_FAILURE(status)) { errln("FAIL"); return; }
690     if (set != exp) { errln("FAIL: retain('s')"); return; }
691 
692     uint16_t buf[32];
693     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
694     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
695     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
696         errln("FAIL: serialize");
697         return;
698     }
699 }
700 
TestIteration()701 void UnicodeSetTest::TestIteration() {
702     UErrorCode ec = U_ZERO_ERROR;
703     int i = 0;
704     int outerLoop;
705 
706     // 6 code points, 3 ranges, 2 strings, 8 total elements
707     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
708     UnicodeSet set("[zabyc\\U0001abcd{str1}{str2}]", ec);
709     TEST_ASSERT_SUCCESS(ec);
710     UnicodeSetIterator it(set);
711 
712     for (outerLoop=0; outerLoop<3; outerLoop++) {
713         // Run the test multiple times, to check that iterator.reset() is working.
714         for (i=0; i<10; i++) {
715             UBool         nextv        = it.next();
716             UBool         isString     = it.isString();
717             int32_t       codePoint    = it.getCodepoint();
718             //int32_t       codePointEnd = it.getCodepointEnd();
719             UnicodeString s   = it.getString();
720             switch (i) {
721             case 0:
722                 TEST_ASSERT(nextv == TRUE);
723                 TEST_ASSERT(isString == FALSE);
724                 TEST_ASSERT(codePoint==0x61);
725                 TEST_ASSERT(s == "a");
726                 break;
727             case 1:
728                 TEST_ASSERT(nextv == TRUE);
729                 TEST_ASSERT(isString == FALSE);
730                 TEST_ASSERT(codePoint==0x62);
731                 TEST_ASSERT(s == "b");
732                 break;
733             case 2:
734                 TEST_ASSERT(nextv == TRUE);
735                 TEST_ASSERT(isString == FALSE);
736                 TEST_ASSERT(codePoint==0x63);
737                 TEST_ASSERT(s == "c");
738                 break;
739             case 3:
740                 TEST_ASSERT(nextv == TRUE);
741                 TEST_ASSERT(isString == FALSE);
742                 TEST_ASSERT(codePoint==0x79);
743                 TEST_ASSERT(s == "y");
744                 break;
745             case 4:
746                 TEST_ASSERT(nextv == TRUE);
747                 TEST_ASSERT(isString == FALSE);
748                 TEST_ASSERT(codePoint==0x7a);
749                 TEST_ASSERT(s == "z");
750                 break;
751             case 5:
752                 TEST_ASSERT(nextv == TRUE);
753                 TEST_ASSERT(isString == FALSE);
754                 TEST_ASSERT(codePoint==0x1abcd);
755                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
756                 break;
757             case 6:
758                 TEST_ASSERT(nextv == TRUE);
759                 TEST_ASSERT(isString == TRUE);
760                 TEST_ASSERT(s == "str1");
761                 break;
762             case 7:
763                 TEST_ASSERT(nextv == TRUE);
764                 TEST_ASSERT(isString == TRUE);
765                 TEST_ASSERT(s == "str2");
766                 break;
767             case 8:
768                 TEST_ASSERT(nextv == FALSE);
769                 break;
770             case 9:
771                 TEST_ASSERT(nextv == FALSE);
772                 break;
773             }
774         }
775         it.reset();  // prepare to run the iteration again.
776     }
777 }
778 
779 
780 
781 
TestStrings()782 void UnicodeSetTest::TestStrings() {
783     UErrorCode ec = U_ZERO_ERROR;
784 
785     UnicodeSet* testList[] = {
786         UnicodeSet::createFromAll("abc"),
787         new UnicodeSet("[a-c]", ec),
788 
789         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
790         new UnicodeSet("[{ll}{ch}a-z]", ec),
791 
792         UnicodeSet::createFrom("ab}c"),
793         new UnicodeSet("[{ab\\}c}]", ec),
794 
795         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
796         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
797 
798         NULL
799     };
800 
801     if (U_FAILURE(ec)) {
802         errln("FAIL: couldn't construct test sets");
803     }
804 
805     for (int32_t i = 0; testList[i] != NULL; i+=2) {
806         if (U_SUCCESS(ec)) {
807             UnicodeString pat0, pat1;
808             testList[i]->toPattern(pat0, TRUE);
809             testList[i+1]->toPattern(pat1, TRUE);
810             if (*testList[i] == *testList[i+1]) {
811                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
812             } else {
813                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
814             }
815         }
816         delete testList[i];
817         delete testList[i+1];
818     }
819 }
820 
821 /**
822  * Test the [:Latin:] syntax.
823  */
TestScriptSet()824 void UnicodeSetTest::TestScriptSet() {
825     expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
826 
827     expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
828 
829     /* Jitterbug 1423 */
830     expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
831 
832 }
833 
834 /**
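 * Test property-based set patterns ([:prop:], \p{prop}, \P{prop}, \N{name})
 * together with the regex-compatible handling of a literal '-' and of
 * isolated surrogates inside a pattern.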
836  */
TestPropertySet()837 void UnicodeSetTest::TestPropertySet() {
838     static const char* const DATA[] = {
839         // Pattern, Chars IN, Chars NOT in
840 
841         "[:Latin:]",
842         "aA",
843         "\\u0391\\u03B1",
844 
845         "[\\p{Greek}]",
846         "\\u0391\\u03B1",
847         "aA",
848 
849         "\\P{ GENERAL Category = upper case letter }",
850         "abc",
851         "ABC",
852 
853         // Combining class: @since ICU 2.2
854         // Check both symbolic and numeric
855         "\\p{ccc=Nukta}",
856         "\\u0ABC",
857         "abc",
858 
859         "\\p{Canonical Combining Class = 11}",
860         "\\u05B1",
861         "\\u05B2",
862 
863         "[:c c c = iota subscript :]",
864         "\\u0345",
865         "xyz",
866 
867         // Bidi class: @since ICU 2.2
868         "\\p{bidiclass=lefttoright}",
869         "abc",
870         "\\u0671\\u0672",
871 
872         // Binary properties: @since ICU 2.2
873         "\\p{ideographic}",
874         "\\u4E0A",
875         "x",
876 
877         "[:math=false:]",
878         "q)*(",
879         // weiv: )(and * were removed from math in Unicode 4.0.1
880         //"(*+)",
881         "+<>^",
882 
883         // JB#1767 \N{}, \p{ASCII}
884         "[:Ascii:]",
885         "abc\\u0000\\u007F",
886         "\\u0080\\u4E00",
887 
888         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
889         "az",
890         "qrs",
891 
892         // JB#2015
893         "[:any:]",
894         "a\\U0010FFFF",
895         "",
896 
897         "[:nv=0.5:]",
898         "\\u00BD\\u0F2A",
899         "\\u00BC",
900 
901         // JB#2653: Age
902         "[:Age=1.1:]",
903         "\\u03D6", // 1.1
904         "\\u03D8\\u03D9", // 3.2
905 
906         "[:Age=3.1:]",
907         "\\u1800\\u3400\\U0002f800",
908         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
909 
910         // JB#2350: Case_Sensitive
911         "[:Case Sensitive:]",
912         "A\\u1FFC\\U00010410",
913         ";\\u00B4\\U00010500",
914 
915         // JB#2832: C99-compatibility props
916         "[:blank:]",
917         " \\u0009",
918         "1-9A-Z",
919 
920         "[:graph:]",
921         "19AZ",
922         " \\u0003\\u0007\\u0009\\u000A\\u000D",
923 
924         "[:punct:]",
925         "!@#%&*()[]{}-_\\/;:,.?'\"",
926         "09azAZ",
927 
928         "[:xdigit:]",
929         "09afAF",
930         "gG!",
931 
932         // Regex compatibility test
933         "[-b]", // leading '-' is literal
934         "-b",
935         "ac",
936 
937         "[^-b]", // leading '-' is literal
938         "ac",
939         "-b",
940 
941         "[b-]", // trailing '-' is literal
942         "-b",
943         "ac",
944 
945         "[^b-]", // trailing '-' is literal
946         "ac",
947         "-b",
948 
949         "[a-b-]", // trailing '-' is literal
950         "ab-",
951         "c=",
952 
953         "[[a-q]&[p-z]-]", // trailing '-' is literal
954         "pq-",
955         "or=",
956 
957         "[\\s|\\)|:|$|\\>]", // from regex tests
958         "s|):$>",
959         "abc",
960 
961         "[\\uDC00cd]", // JB#2906: isolated trail at start
962         "cd\\uDC00",
963         "ab\\uD800\\U00010000",
964 
965         "[ab\\uD800]", // JB#2906: isolated trail at start
966         "ab\\uD800",
967         "cd\\uDC00\\U00010000",
968 
969         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
970         "abcd\\uD800",
971         "ef\\uDC00\\U00010000",
972 
973         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
974         "abcd\\uDC00",
975         "ef\\uD800\\U00010000",
976 
977         "[:^lccc=0:]", // Lead canonical class
978         "\\u0300\\u0301",
979         "abcd\\u00c0\\u00c5",
980 
981         "[:^tccc=0:]", // Trail canonical class
982         "\\u0300\\u0301\\u00c0\\u00c5",
983         "abcd",
984 
985         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
986         "\\u0300\\u0301\\u00c0\\u00c5",
987         "abcd",
988 
989         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
990         "",
991         "abcd\\u0300\\u0301\\u00c0\\u00c5",
992 
993         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
994         "\\u0F73\\u0F75\\u0F81",
995         "abcd\\u0300\\u0301\\u00c0\\u00c5",
996 
997         "[:Assigned:]",
998         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
999         "\\u0888\\uFDD3\\uFFFE\\U00050005"
1000     };
1001 
1002     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1003 
1004     for (int32_t i=0; i<DATA_LEN; i+=3) {
1005         expectContainment(DATA[i], CharsToUnicodeString(DATA[i+1]),
1006                           CharsToUnicodeString(DATA[i+2]));
1007     }
1008 }
1009 
1010 /**
1011   * Test that Posix style character classes [:digit:], etc.
1012   *   have the Unicode definitions from TR 18.
1013   */
TestPosixClasses()1014 void UnicodeSetTest::TestPosixClasses() {
1015     {
1016         UErrorCode status = U_ZERO_ERROR;
1017         UnicodeSet s1("[:alpha:]", status);
1018         UnicodeSet s2("\\p{Alphabetic}", status);
1019         TEST_ASSERT_SUCCESS(status);
1020         TEST_ASSERT(s1==s2);
1021     }
1022     {
1023         UErrorCode status = U_ZERO_ERROR;
1024         UnicodeSet s1("[:lower:]", status);
1025         UnicodeSet s2("\\p{lowercase}", status);
1026         TEST_ASSERT_SUCCESS(status);
1027         TEST_ASSERT(s1==s2);
1028     }
1029     {
1030         UErrorCode status = U_ZERO_ERROR;
1031         UnicodeSet s1("[:upper:]", status);
1032         UnicodeSet s2("\\p{Uppercase}", status);
1033         TEST_ASSERT_SUCCESS(status);
1034         TEST_ASSERT(s1==s2);
1035     }
1036     {
1037         UErrorCode status = U_ZERO_ERROR;
1038         UnicodeSet s1("[:punct:]", status);
1039         UnicodeSet s2("\\p{gc=Punctuation}", status);
1040         TEST_ASSERT_SUCCESS(status);
1041         TEST_ASSERT(s1==s2);
1042     }
1043     {
1044         UErrorCode status = U_ZERO_ERROR;
1045         UnicodeSet s1("[:digit:]", status);
1046         UnicodeSet s2("\\p{gc=DecimalNumber}", status);
1047         TEST_ASSERT_SUCCESS(status);
1048         TEST_ASSERT(s1==s2);
1049     }
1050     {
1051         UErrorCode status = U_ZERO_ERROR;
1052         UnicodeSet s1("[:xdigit:]", status);
1053         UnicodeSet s2("[\\p{DecimalNumber}\\p{HexDigit}]", status);
1054         TEST_ASSERT_SUCCESS(status);
1055         TEST_ASSERT(s1==s2);
1056     }
1057     {
1058         UErrorCode status = U_ZERO_ERROR;
1059         UnicodeSet s1("[:alnum:]", status);
1060         UnicodeSet s2("[\\p{Alphabetic}\\p{DecimalNumber}]", status);
1061         TEST_ASSERT_SUCCESS(status);
1062         TEST_ASSERT(s1==s2);
1063     }
1064     {
1065         UErrorCode status = U_ZERO_ERROR;
1066         UnicodeSet s1("[:space:]", status);
1067         UnicodeSet s2("\\p{Whitespace}", status);
1068         TEST_ASSERT_SUCCESS(status);
1069         TEST_ASSERT(s1==s2);
1070     }
1071     {
1072         UErrorCode status = U_ZERO_ERROR;
1073         UnicodeSet s1("[:blank:]", status);
1074         TEST_ASSERT_SUCCESS(status);
1075         UnicodeSet s2("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1076             status);
1077         TEST_ASSERT_SUCCESS(status);
1078         TEST_ASSERT(s1==s2);
1079     }
1080     {
1081         UErrorCode status = U_ZERO_ERROR;
1082         UnicodeSet s1("[:cntrl:]", status);
1083         TEST_ASSERT_SUCCESS(status);
1084         UnicodeSet s2("\\p{Control}", status);
1085         TEST_ASSERT_SUCCESS(status);
1086         TEST_ASSERT(s1==s2);
1087     }
1088     {
1089         UErrorCode status = U_ZERO_ERROR;
1090         UnicodeSet s1("[:graph:]", status);
1091         TEST_ASSERT_SUCCESS(status);
1092         UnicodeSet s2("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
1093         TEST_ASSERT_SUCCESS(status);
1094         TEST_ASSERT(s1==s2);
1095     }
1096     {
1097         UErrorCode status = U_ZERO_ERROR;
1098         UnicodeSet s1("[:print:]", status);
1099         TEST_ASSERT_SUCCESS(status);
1100         UnicodeSet s2("[[:graph:][:blank:]-[\\p{Control}]]" ,status);
1101         TEST_ASSERT_SUCCESS(status);
1102         TEST_ASSERT(s1==s2);
1103     }
1104 }
1105 /**
1106  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1107  */
TestClone()1108 void UnicodeSetTest::TestClone() {
1109     UErrorCode ec = U_ZERO_ERROR;
1110     UnicodeSet s("[abcxyz]", ec);
1111     UnicodeSet t(s);
1112     expectContainment(t, "abc", "def");
1113 }
1114 
1115 /**
1116  * Test the indexOf() and charAt() methods.
1117  */
TestIndexOf()1118 void UnicodeSetTest::TestIndexOf() {
1119     UErrorCode ec = U_ZERO_ERROR;
1120     UnicodeSet set("[a-cx-y3578]", ec);
1121     if (U_FAILURE(ec)) {
1122         errln("FAIL: UnicodeSet constructor");
1123         return;
1124     }
1125     for (int32_t i=0; i<set.size(); ++i) {
1126         UChar32 c = set.charAt(i);
1127         if (set.indexOf(c) != i) {
1128             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1129                 i, c, set.indexOf(c));
1130         }
1131     }
1132     UChar32 c = set.charAt(set.size());
1133     if (c != -1) {
1134         errln("FAIL: charAt(<out of range>) = %X", c);
1135     }
1136     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1137     if (j != -1) {
1138         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1139     }
1140 }
1141 
1142 /**
1143  * Test closure API.
1144  */
TestCloseOver()1145 void UnicodeSetTest::TestCloseOver() {
1146     UErrorCode ec = U_ZERO_ERROR;
1147 
1148     char CASE[] = {(char)USET_CASE_INSENSITIVE};
1149     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1150     const char* DATA[] = {
1151         // selector, input, output
1152         CASE,
1153         "[aq\\u00DF{Bc}{bC}{Fi}]",
1154         "[aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]",
1155 
1156         CASE,
1157         "[\\u01F1]", // 'DZ'
1158         "[\\u01F1\\u01F2\\u01F3]",
1159 
1160         CASE,
1161         "[\\u1FB4]",
1162         "[\\u1FB4{\\u03AC\\u03B9}]",
1163 
1164         CASE,
1165         "[{F\\uFB01}]",
1166         "[\\uFB03{ffi}]",
1167 
1168         CASE, // make sure binary search finds limits
1169         "[a\\uFF3A]",
1170         "[aA\\uFF3A\\uFF5A]",
1171 
1172         CASE,
1173         "[a-z]","[A-Za-z\\u017F\\u212A]",
1174         CASE,
1175         "[abc]","[A-Ca-c]",
1176         CASE,
1177         "[ABC]","[A-Ca-c]",
1178 
1179         CASE, "[i]", "[iI]",
1180 
1181         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1182         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1183 
1184         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1185 
1186         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1187 
1188         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1189 
1190         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1191 
1192         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1193 
1194         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1195 
1196         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1197         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1198 
1199         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1200 
1201         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1202 
1203         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1204 
1205         CASE_MAPPINGS,
1206         "[aq\\u00DF{Bc}{bC}{Fi}]",
1207         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1208 
1209         CASE_MAPPINGS,
1210         "[\\u01F1]", // 'DZ'
1211         "[\\u01F1\\u01F2\\u01F3]",
1212 
1213         CASE_MAPPINGS,
1214         "[a-z]",
1215         "[A-Za-z]",
1216 
1217         NULL
1218     };
1219 
1220     UnicodeSet s;
1221     UnicodeSet t;
1222     UnicodeString buf;
1223     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1224         int32_t selector = DATA[i][0];
1225         UnicodeString pat(DATA[i+1]);
1226         UnicodeString exp(DATA[i+2]);
1227         s.applyPattern(pat, ec);
1228         s.closeOver(selector);
1229         t.applyPattern(exp, ec);
1230         if (U_FAILURE(ec)) {
1231             errln("FAIL: applyPattern failed");
1232             continue;
1233         }
1234         if (s == t) {
1235             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1236         } else {
1237             errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1238                   s.toPattern(buf, TRUE) + ", expected " + exp);
1239         }
1240     }
1241 
1242 #if 0
1243     /*
1244      * Unused test code.
1245      * This was used to compare the old implementation (using USET_CASE)
1246      * with the new one (using 0x100 temporarily)
1247      * while transitioning from hardcoded case closure tables in uniset.cpp
1248      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1249      * and using ucase.c functions for closure.
1250      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1251      *
1252      * Note: The old and new implementation never fully matched because
1253      * the old implementation turned out to not map U+0130 and U+0131 correctly
1254      * (dotted I and dotless i) and because the old implementation's data tables
1255      * were outdated compared to Unicode 4.0.1 at the time of the change to the
1256      * new implementation. (So sigmas and some other characters were not handled
1257      * according to the newer Unicode version.)
1258      */
1259     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1260     UnicodeSetIterator si(sens);
1261     UnicodeString str, buf2;
1262     const UnicodeString *pStr;
1263     UChar32 c;
1264     while(si.next()) {
1265         if(!si.isString()) {
1266             c=si.getCodepoint();
1267             s.clear();
1268             s.add(c);
1269 
1270             str.setTo(c);
1271             str.foldCase();
1272             sens2.add(str);
1273 
1274             t=s;
1275             s.closeOver(USET_CASE);
1276             t.closeOver(0x100);
1277             if(s!=t) {
1278                 errln("FAIL: closeOver(U+%04x) differs: ", c);
1279                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1280             }
1281         }
1282     }
1283     // remove all code points
1284     // should contain all full case folding mapping strings
1285     sens2.remove(0, 0x10ffff);
1286     si.reset(sens2);
1287     while(si.next()) {
1288         if(si.isString()) {
1289             pStr=&si.getString();
1290             s.clear();
1291             s.add(*pStr);
1292             t=s2=s;
1293             s.closeOver(USET_CASE);
1294             t.closeOver(0x100);
1295             if(s!=t) {
1296                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1297                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1298             }
1299         }
1300     }
1301 #endif
1302 
1303     // Test the pattern API
1304     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1305     if (U_FAILURE(ec)) {
1306         errln("FAIL: applyPattern failed");
1307     } else {
1308         expectContainment(s, "abcABC", "defDEF");
1309     }
1310     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1311     if (U_FAILURE(ec)) {
1312         errln("FAIL: constructor failed");
1313     } else {
1314         expectContainment(v, "defDEF", "abcABC");
1315     }
1316     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1317     if (U_FAILURE(ec)) {
1318         errln("FAIL: construct w/case mappings failed");
1319     } else {
1320         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1321     }
1322 }
1323 
TestEscapePattern()1324 void UnicodeSetTest::TestEscapePattern() {
1325     const char pattern[] =
1326         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1327     const char exp[] =
1328         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1329     // We test this with two passes; in the second pass we
1330     // pre-unescape the pattern.  Since U+200E is rule whitespace,
1331     // this fails -- which is what we expect.
1332     for (int32_t pass=1; pass<=2; ++pass) {
1333         UErrorCode ec = U_ZERO_ERROR;
1334         UnicodeString pat(pattern);
1335         if (pass==2) {
1336             pat = pat.unescape();
1337         }
1338         // Pattern is only good for pass 1
1339         UBool isPatternValid = (pass==1);
1340 
1341         UnicodeSet set(pat, ec);
1342         if (U_SUCCESS(ec) != isPatternValid){
1343             errln((UnicodeString)"FAIL: applyPattern(" +
1344                   escape(pat) + ") => " +
1345                   u_errorName(ec));
1346             continue;
1347         }
1348         if (U_FAILURE(ec)) {
1349             continue;
1350         }
1351         if (set.contains((UChar)0x0644)){
1352             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1353         }
1354 
1355         UnicodeString newpat;
1356         set.toPattern(newpat, TRUE);
1357         if (newpat == exp) {
1358             logln(escape(pat) + " => " + newpat);
1359         } else {
1360             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1361         }
1362 
1363         for (int32_t i=0; i<set.getRangeCount(); ++i) {
1364             UnicodeString str("Range ");
1365             str.append((UChar)(0x30 + i))
1366                 .append(": ")
1367                 .append((UChar32)set.getRangeStart(i))
1368                 .append(" - ")
1369                 .append((UChar32)set.getRangeEnd(i));
1370             str = str + " (" + set.getRangeStart(i) + " - " +
1371                 set.getRangeEnd(i) + ")";
1372             if (set.getRangeStart(i) < 0) {
1373                 errln((UnicodeString)"FAIL: " + escape(str));
1374             } else {
1375                 logln(escape(str));
1376             }
1377         }
1378     }
1379 }
1380 
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1381 void UnicodeSetTest::expectRange(const UnicodeString& label,
1382                                  const UnicodeSet& set,
1383                                  UChar32 start, UChar32 end) {
1384     UnicodeSet exp(start, end);
1385     UnicodeString pat;
1386     if (set == exp) {
1387         logln(label + " => " + set.toPattern(pat, TRUE));
1388     } else {
1389         UnicodeString xpat;
1390         errln((UnicodeString)"FAIL: " + label + " => " +
1391               set.toPattern(pat, TRUE) +
1392               ", expected " + exp.toPattern(xpat, TRUE));
1393     }
1394 }
1395 
TestInvalidCodePoint()1396 void UnicodeSetTest::TestInvalidCodePoint() {
1397 
1398     const UChar32 DATA[] = {
1399         // Test range             Expected range
1400         0, 0x10FFFF,              0, 0x10FFFF,
1401         (UChar32)-1, 8,           0, 8,
1402         8, 0x110000,              8, 0x10FFFF
1403     };
1404     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1405 
1406     UnicodeString pat;
1407     int32_t i;
1408 
1409     for (i=0; i<DATA_LENGTH; i+=4) {
1410         UChar32 start  = DATA[i];
1411         UChar32 end    = DATA[i+1];
1412         UChar32 xstart = DATA[i+2];
1413         UChar32 xend   = DATA[i+3];
1414 
1415         // Try various API using the test code points
1416 
1417         UnicodeSet set(start, end);
1418         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1419                     set, xstart, xend);
1420 
1421         set.clear();
1422         set.set(start, end);
1423         expectRange((UnicodeString)"set(" + start + "," + end + ")",
1424                     set, xstart, xend);
1425 
1426         UBool b = set.contains(start);
1427         b = set.contains(start, end);
1428         b = set.containsNone(start, end);
1429         b = set.containsSome(start, end);
1430 
1431         /*int32_t index = set.indexOf(start);*/
1432 
1433         set.clear();
1434         set.add(start);
1435         set.add(start, end);
1436         expectRange((UnicodeString)"add(" + start + "," + end + ")",
1437                     set, xstart, xend);
1438 
1439         set.set(0, 0x10FFFF);
1440         set.retain(start, end);
1441         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1442                     set, xstart, xend);
1443         set.retain(start);
1444 
1445         set.set(0, 0x10FFFF);
1446         set.remove(start);
1447         set.remove(start, end);
1448         set.complement();
1449         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1450                     set, xstart, xend);
1451 
1452         set.set(0, 0x10FFFF);
1453         set.complement(start, end);
1454         set.complement();
1455         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1456                     set, xstart, xend);
1457         set.complement(start);
1458     }
1459 
1460     const UChar32 DATA2[] = {
1461         0,
1462         0x10FFFF,
1463         (UChar32)-1,
1464         0x110000
1465     };
1466     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1467 
1468     for (i=0; i<DATA2_LENGTH; ++i) {
1469         UChar32 c = DATA2[i], end = 0x10FFFF;
1470         UBool valid = (c >= 0 && c <= 0x10FFFF);
1471 
1472         UnicodeSet set(0, 0x10FFFF);
1473 
1474         // For single-codepoint contains, invalid codepoints are NOT contained
1475         UBool b = set.contains(c);
1476         if (b == valid) {
1477             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1478                   ") = " + b);
1479         } else {
1480             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1481                   ") = " + b);
1482         }
1483 
1484         // For codepoint range contains, containsNone, and containsSome,
1485         // invalid or empty (start > end) ranges have UNDEFINED behavior.
1486         b = set.contains(c, end);
1487         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1488               "," + end + ") = " + b);
1489 
1490         b = set.containsNone(c, end);
1491         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1492               "," + end + ") = " + b);
1493 
1494         b = set.containsSome(c, end);
1495         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1496               "," + end + ") = " + b);
1497 
1498         int32_t index = set.indexOf(c);
1499         if ((index >= 0) == valid) {
1500             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1501                   ") = " + index);
1502         } else {
1503             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1504                   ") = " + index);
1505         }
1506     }
1507 }
1508 
1509 // Used by TestSymbolTable
1510 class TokenSymbolTable : public SymbolTable {
1511 public:
1512     Hashtable contents;
1513 
TokenSymbolTable(UErrorCode & ec)1514     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1515         contents.setValueDeleter(uhash_deleteUnicodeString);
1516     }
1517 
~TokenSymbolTable()1518     ~TokenSymbolTable() {}
1519 
1520     /**
1521      * (Non-SymbolTable API) Add the given variable and value to
1522      * the table.  Variable should NOT contain leading '$'.
1523      */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1524     void add(const UnicodeString& var, const UnicodeString& value,
1525              UErrorCode& ec) {
1526         if (U_SUCCESS(ec)) {
1527             contents.put(var, new UnicodeString(value), ec);
1528         }
1529     }
1530 
1531     /**
1532      * SymbolTable API
1533      */
lookup(const UnicodeString & s) const1534     virtual const UnicodeString* lookup(const UnicodeString& s) const {
1535         return (const UnicodeString*) contents.get(s);
1536     }
1537 
1538     /**
1539      * SymbolTable API
1540      */
lookupMatcher(UChar32) const1541     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1542         return NULL;
1543     }
1544 
1545     /**
1546      * SymbolTable API
1547      */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1548     virtual UnicodeString parseReference(const UnicodeString& text,
1549                                          ParsePosition& pos, int32_t limit) const {
1550         int32_t start = pos.getIndex();
1551         int32_t i = start;
1552         UnicodeString result;
1553         while (i < limit) {
1554             UChar c = text.charAt(i);
1555             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1556                 break;
1557             }
1558             ++i;
1559         }
1560         if (i == start) { // No valid name chars
1561             return result; // Indicate failure with empty string
1562         }
1563         pos.setIndex(i);
1564         text.extractBetween(start, i, result);
1565         return result;
1566     }
1567 };
1568 
TestSymbolTable()1569 void UnicodeSetTest::TestSymbolTable() {
1570     // Multiple test cases can be set up here.  Each test case
1571     // is terminated by null:
1572     // var, value, var, value,..., input pat., exp. output pat., null
1573     const char* DATA[] = {
1574         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1575         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1576         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1577         NULL
1578     };
1579 
1580     for (int32_t i=0; DATA[i]!=NULL; ++i) {
1581         UErrorCode ec = U_ZERO_ERROR;
1582         TokenSymbolTable sym(ec);
1583         if (U_FAILURE(ec)) {
1584             errln("FAIL: couldn't construct TokenSymbolTable");
1585             continue;
1586         }
1587 
1588         // Set up variables
1589         while (DATA[i+2] != NULL) {
1590             sym.add(DATA[i], DATA[i+1], ec);
1591             if (U_FAILURE(ec)) {
1592                 errln("FAIL: couldn't add to TokenSymbolTable");
1593                 continue;
1594             }
1595             i += 2;
1596         }
1597 
1598         // Input pattern and expected output pattern
1599         UnicodeString inpat = DATA[i], exppat = DATA[i+1];
1600         i += 2;
1601 
1602         ParsePosition pos(0);
1603         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1604         if (U_FAILURE(ec)) {
1605             errln("FAIL: couldn't construct UnicodeSet");
1606             continue;
1607         }
1608 
1609         // results
1610         if (pos.getIndex() != inpat.length()) {
1611             errln((UnicodeString)"Failed to read to end of string \""
1612                   + inpat + "\": read to "
1613                   + pos.getIndex() + ", length is "
1614                   + inpat.length());
1615         }
1616 
1617         UnicodeSet us2(exppat, ec);
1618         if (U_FAILURE(ec)) {
1619             errln("FAIL: couldn't construct expected UnicodeSet");
1620             continue;
1621         }
1622 
1623         UnicodeString a, b;
1624         if (us != us2) {
1625             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1626                   ", expected " + us2.toPattern(b, TRUE));
1627         } else {
1628             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1629         }
1630     }
1631 }
1632 
TestSurrogate()1633 void UnicodeSetTest::TestSurrogate() {
1634     const char* DATA[] = {
1635         // These should all behave identically
1636         "[abc\\uD800\\uDC00]",
1637         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1638         "[abc\\U00010000]",
1639         0
1640     };
1641     for (int i=0; DATA[i] != 0; ++i) {
1642         UErrorCode ec = U_ZERO_ERROR;
1643         logln((UnicodeString)"Test pattern " + i + " :" + DATA[i]);
1644         UnicodeSet set(DATA[i], ec);
1645         if (U_FAILURE(ec)) {
1646             errln("FAIL: UnicodeSet constructor");
1647             continue;
1648         }
1649         expectContainment(set,
1650                           CharsToUnicodeString("abc\\U00010000"),
1651                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1652         if (set.size() != 4) {
1653             errln((UnicodeString)"FAIL: " + DATA[i] + ".size() == " +
1654                   set.size() + ", expected 4");
1655         }
1656     }
1657 }
1658 
TestExhaustive()1659 void UnicodeSetTest::TestExhaustive() {
1660     // exhaustive tests. Simulate UnicodeSets with integers.
1661     // That gives us very solid tests (except for large memory tests).
1662 
1663     int32_t limit = 128;
1664 
1665     UnicodeSet x, y, z, aa;
1666 
1667     for (int32_t i = 0; i < limit; ++i) {
1668         bitsToSet(i, x);
1669         logln((UnicodeString)"Testing " + i + ", " + x);
1670         _testComplement(i, x, y);
1671 
1672         // AS LONG AS WE ARE HERE, check roundtrip
1673         checkRoundTrip(bitsToSet(i, aa));
1674 
1675         for (int32_t j = 0; j < limit; ++j) {
1676             _testAdd(i,j,  x,y,z);
1677             _testXor(i,j,  x,y,z);
1678             _testRetain(i,j,  x,y,z);
1679             _testRemove(i,j,  x,y,z);
1680         }
1681     }
1682 }
1683 
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1684 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1685     bitsToSet(a, x);
1686     z = x;
1687     z.complement();
1688     int32_t c = setToBits(z);
1689     if (c != (~a)) {
1690         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1691         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1692     }
1693     checkCanonicalRep(z, (UnicodeString)"complement " + a);
1694 }
1695 
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1696 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1697     bitsToSet(a, x);
1698     bitsToSet(b, y);
1699     z = x;
1700     z.addAll(y);
1701     int32_t c = setToBits(z);
1702     if (c != (a | b)) {
1703         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1704         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1705     }
1706     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1707 }
1708 
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1709 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1710     bitsToSet(a, x);
1711     bitsToSet(b, y);
1712     z = x;
1713     z.retainAll(y);
1714     int32_t c = setToBits(z);
1715     if (c != (a & b)) {
1716         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1717         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1718     }
1719     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1720 }
1721 
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1722 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1723     bitsToSet(a, x);
1724     bitsToSet(b, y);
1725     z = x;
1726     z.removeAll(y);
1727     int32_t c = setToBits(z);
1728     if (c != (a &~ b)) {
1729         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1730         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1731     }
1732     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1733 }
1734 
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1735 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1736     bitsToSet(a, x);
1737     bitsToSet(b, y);
1738     z = x;
1739     z.complementAll(y);
1740     int32_t c = setToBits(z);
1741     if (c != (a ^ b)) {
1742         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1743         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1744     }
1745     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1746 }
1747 
1748 /**
1749  * Check that ranges are monotonically increasing and non-
1750  * overlapping.
1751  */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1752 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1753     int32_t n = set.getRangeCount();
1754     if (n < 0) {
1755         errln((UnicodeString)"FAIL result of " + msg +
1756               ": range count should be >= 0 but is " +
1757               n /*+ " for " + set.toPattern())*/);
1758         return;
1759     }
1760     UChar32 last = 0;
1761     for (int32_t i=0; i<n; ++i) {
1762         UChar32 start = set.getRangeStart(i);
1763         UChar32 end = set.getRangeEnd(i);
1764         if (start > end) {
1765             errln((UnicodeString)"FAIL result of " + msg +
1766                   ": range " + (i+1) +
1767                   " start > end: " + (int)start + ", " + (int)end +
1768                   " for " + set);
1769         }
1770         if (i > 0 && start <= last) {
1771             errln((UnicodeString)"FAIL result of " + msg +
1772                   ": range " + (i+1) +
1773                   " overlaps previous range: " + (int)start + ", " + (int)end +
1774                   " for " + set);
1775         }
1776         last = end;
1777     }
1778 }
1779 
1780 /**
1781  * Convert a bitmask to a UnicodeSet.
1782  */
bitsToSet(int32_t a,UnicodeSet & result)1783 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1784     result.clear();
1785     for (UChar32 i = 0; i < 32; ++i) {
1786         if ((a & (1<<i)) != 0) {
1787             result.add(i);
1788         }
1789     }
1790     return result;
1791 }
1792 
1793 /**
1794  * Convert a UnicodeSet to a bitmask.  Only the characters
1795  * U+0000 to U+0020 are represented in the bitmask.
1796  */
setToBits(const UnicodeSet & x)1797 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1798     int32_t result = 0;
1799     for (int32_t i = 0; i < 32; ++i) {
1800         if (x.contains((UChar32)i)) {
1801             result |= (1<<i);
1802         }
1803     }
1804     return result;
1805 }
1806 
1807 /**
1808  * Return the representation of an inversion list based UnicodeSet
1809  * as a pairs list.  Ranges are listed in ascending Unicode order.
1810  * For example, the set [a-zA-M3] is represented as "33AMaz".
1811  */
getPairs(const UnicodeSet & set)1812 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1813     UnicodeString pairs;
1814     for (int32_t i=0; i<set.getRangeCount(); ++i) {
1815         UChar32 start = set.getRangeStart(i);
1816         UChar32 end = set.getRangeEnd(i);
1817         if (end > 0xFFFF) {
1818             end = 0xFFFF;
1819             i = set.getRangeCount(); // Should be unnecessary
1820         }
1821         pairs.append((UChar)start).append((UChar)end);
1822     }
1823     return pairs;
1824 }
1825 
1826 /**
1827  * Basic consistency check for a few items.
1828  * That the iterator works, and that we can create a pattern and
1829  * get the same thing back
1830  */
checkRoundTrip(const UnicodeSet & s)1831 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1832     UErrorCode ec = U_ZERO_ERROR;
1833 
1834     UnicodeSet t(s);
1835     checkEqual(s, t, "copy ct");
1836 
1837     t = s;
1838     checkEqual(s, t, "operator=");
1839 
1840     copyWithIterator(t, s, FALSE);
1841     checkEqual(s, t, "iterator roundtrip");
1842 
1843     copyWithIterator(t, s, TRUE); // try range
1844     checkEqual(s, t, "iterator roundtrip");
1845 
1846     UnicodeString pat; s.toPattern(pat, FALSE);
1847     t.applyPattern(pat, ec);
1848     if (U_FAILURE(ec)) {
1849         errln("FAIL: applyPattern");
1850         return;
1851     } else {
1852         checkEqual(s, t, "toPattern(false)");
1853     }
1854 
1855     s.toPattern(pat, TRUE);
1856     t.applyPattern(pat, ec);
1857     if (U_FAILURE(ec)) {
1858         errln("FAIL: applyPattern");
1859         return;
1860     } else {
1861         checkEqual(s, t, "toPattern(true)");
1862     }
1863 }
1864 
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)1865 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1866     t.clear();
1867     UnicodeSetIterator it(s);
1868     if (withRange) {
1869         while (it.nextRange()) {
1870             if (it.isString()) {
1871                 t.add(it.getString());
1872             } else {
1873                 t.add(it.getCodepoint(), it.getCodepointEnd());
1874             }
1875         }
1876     } else {
1877         while (it.next()) {
1878             if (it.isString()) {
1879                 t.add(it.getString());
1880             } else {
1881                 t.add(it.getCodepoint());
1882             }
1883         }
1884     }
1885 }
1886 
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)1887 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1888     UnicodeString source; s.toPattern(source, TRUE);
1889     UnicodeString result; t.toPattern(result, TRUE);
1890     if (s != t) {
1891         errln((UnicodeString)"FAIL: " + message
1892               + "; source = " + source
1893               + "; result = " + result
1894               );
1895         return FALSE;
1896     } else {
1897         logln((UnicodeString)"Ok: " + message
1898               + "; source = " + source
1899               + "; result = " + result
1900               );
1901     }
1902     return TRUE;
1903 }
1904 
1905 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)1906 UnicodeSetTest::expectContainment(const UnicodeString& pat,
1907                                   const UnicodeString& charsIn,
1908                                   const UnicodeString& charsOut) {
1909     UErrorCode ec = U_ZERO_ERROR;
1910     UnicodeSet set(pat, ec);
1911     if (U_FAILURE(ec)) {
1912         errln((UnicodeString)"FAIL: pattern \"" +
1913               pat + "\" => " + u_errorName(ec));
1914         return;
1915     }
1916     expectContainment(set, pat, charsIn, charsOut);
1917 }
1918 
1919 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)1920 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1921                                   const UnicodeString& charsIn,
1922                                   const UnicodeString& charsOut) {
1923     UnicodeString pat;
1924     set.toPattern(pat);
1925     expectContainment(set, pat, charsIn, charsOut);
1926 }
1927 
1928 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)1929 UnicodeSetTest::expectContainment(const UnicodeSet& set,
1930                                   const UnicodeString& setName,
1931                                   const UnicodeString& charsIn,
1932                                   const UnicodeString& charsOut) {
1933     UnicodeString bad;
1934     UChar32 c;
1935     int32_t i;
1936 
1937     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1938         c = charsIn.char32At(i);
1939         if (!set.contains(c)) {
1940             bad.append(c);
1941         }
1942     }
1943     if (bad.length() > 0) {
1944         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
1945               ", expected containment of " + prettify(charsIn));
1946     } else {
1947         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
1948     }
1949 
1950     bad.truncate(0);
1951     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
1952         c = charsOut.char32At(i);
1953         if (set.contains(c)) {
1954             bad.append(c);
1955         }
1956     }
1957     if (bad.length() > 0) {
1958         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
1959               ", expected non-containment of " + prettify(charsOut));
1960     } else {
1961         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
1962     }
1963 }
1964 
1965 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)1966 UnicodeSetTest::expectPattern(UnicodeSet& set,
1967                               const UnicodeString& pattern,
1968                               const UnicodeString& expectedPairs){
1969     UErrorCode status = U_ZERO_ERROR;
1970     set.applyPattern(pattern, status);
1971     if (U_FAILURE(status)) {
1972         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1973               "\") failed");
1974         return;
1975     } else {
1976         if (getPairs(set) != expectedPairs ) {
1977             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
1978                   "\") => pairs \"" +
1979                   escape(getPairs(set)) + "\", expected \"" +
1980                   escape(expectedPairs) + "\"");
1981         } else {
1982             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
1983                   "\") => pairs \"" +
1984                   escape(getPairs(set)) + "\"");
1985         }
1986     }
1987     // the result of calling set.toPattern(), which is the string representation of
1988     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
1989     // will produce another set that is equal to this one.
1990     UnicodeString temppattern;
1991     set.toPattern(temppattern);
1992     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
1993     if (U_FAILURE(status)) {
1994         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
1995         return;
1996     }
1997     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
1998         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
1999             escape(getPairs(set)) + "\""));
2000     } else{
2001         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2002     }
2003 
2004     delete tempset;
2005 
2006 }
2007 
2008 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2009 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2010     if (getPairs(set) != expectedPairs) {
2011         errln(UnicodeString("FAIL: Expected pair list \"") +
2012               escape(expectedPairs) + "\", got \"" +
2013               escape(getPairs(set)) + "\"");
2014     }
2015 }
2016 
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2017 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2018                                      const UnicodeString& expPat,
2019                                      const char** expStrings) {
2020     UnicodeString pat;
2021     set.toPattern(pat, TRUE);
2022     if (pat == expPat) {
2023         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2024     } else {
2025         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2026         return;
2027     }
2028     if (expStrings == NULL) {
2029         return;
2030     }
2031     UBool in = TRUE;
2032     for (int32_t i=0; expStrings[i] != NULL; ++i) {
2033         if (expStrings[i] == NOT) { // sic; pointer comparison
2034             in = FALSE;
2035             continue;
2036         }
2037         UnicodeString s = CharsToUnicodeString(expStrings[i]);
2038         UBool contained = set.contains(s);
2039         if (contained == in) {
2040             logln((UnicodeString)"Ok: " + expPat +
2041                   (contained ? " contains {" : " does not contain {") +
2042                   escape(expStrings[i]) + "}");
2043         } else {
2044             errln((UnicodeString)"FAIL: " + expPat +
2045                   (contained ? " contains {" : " does not contain {") +
2046                   escape(expStrings[i]) + "}");
2047         }
2048     }
2049 }
2050 
toHexString(int32_t i)2051 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2052 
2053 void
doAssert(UBool condition,const char * message)2054 UnicodeSetTest::doAssert(UBool condition, const char *message)
2055 {
2056     if (!condition) {
2057         errln(UnicodeString("ERROR : ") + message);
2058     }
2059 }
2060 
2061 UnicodeString
escape(const UnicodeString & s)2062 UnicodeSetTest::escape(const UnicodeString& s) {
2063     UnicodeString buf;
2064     for (int32_t i=0; i<s.length(); )
2065     {
2066         UChar32 c = s.char32At(i);
2067         if (0x0020 <= c && c <= 0x007F) {
2068             buf += c;
2069         } else {
2070             if (c <= 0xFFFF) {
2071                 buf += (UChar)0x5c; buf += (UChar)0x75;
2072             } else {
2073                 buf += (UChar)0x5c; buf += (UChar)0x55;
2074                 buf += toHexString((c & 0xF0000000) >> 28);
2075                 buf += toHexString((c & 0x0F000000) >> 24);
2076                 buf += toHexString((c & 0x00F00000) >> 20);
2077                 buf += toHexString((c & 0x000F0000) >> 16);
2078             }
2079             buf += toHexString((c & 0xF000) >> 12);
2080             buf += toHexString((c & 0x0F00) >> 8);
2081             buf += toHexString((c & 0x00F0) >> 4);
2082             buf += toHexString(c & 0x000F);
2083         }
2084         i += U16_LENGTH(c);
2085     }
2086     return buf;
2087 }
2088 
TestFreezable()2089 void UnicodeSetTest::TestFreezable() {
2090     UErrorCode errorCode=U_ZERO_ERROR;
2091     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2092     UnicodeSet idSet(idPattern, errorCode);
2093     if(U_FAILURE(errorCode)) {
2094         errln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2095         return;
2096     }
2097 
2098     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2099     UnicodeSet wsSet(wsPattern, errorCode);
2100     if(U_FAILURE(errorCode)) {
2101         errln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2102         return;
2103     }
2104 
2105     idSet.add(idPattern);
2106     UnicodeSet frozen(idSet);
2107     frozen.freeze();
2108 
2109     if(idSet.isFrozen() || !frozen.isFrozen()) {
2110         errln("FAIL: isFrozen() is wrong");
2111     }
2112     if(frozen!=idSet || !(frozen==idSet)) {
2113         errln("FAIL: a copy-constructed frozen set differs from its original");
2114     }
2115 
2116     frozen=wsSet;
2117     if(frozen!=idSet || !(frozen==idSet)) {
2118         errln("FAIL: a frozen set was modified by operator=");
2119     }
2120 
2121     UnicodeSet frozen2(frozen);
2122     if(frozen2!=frozen || frozen2!=idSet) {
2123         errln("FAIL: a copied frozen set differs from its frozen original");
2124     }
2125     if(!frozen2.isFrozen()) {
2126         errln("FAIL: copy-constructing a frozen set results in a thawed one");
2127     }
2128     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2129     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2130         errln("FAIL: UnicodeSet(5, 55) failed");
2131     }
2132     frozen3=frozen;
2133     if(!frozen3.isFrozen()) {
2134         errln("FAIL: copying a frozen set results in a thawed one");
2135     }
2136 
2137     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2138     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2139         errln("FAIL: clone() failed");
2140     }
2141     cloned->add(0xd802, 0xd805);
2142     if(cloned->containsSome(0xd802, 0xd805)) {
2143         errln("FAIL: unable to modify clone");
2144     }
2145     delete cloned;
2146 
2147     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2148     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2149         errln("FAIL: cloneAsThawed() failed");
2150     }
2151     thawed->add(0xd802, 0xd805);
2152     if(!thawed->contains(0xd802, 0xd805)) {
2153         errln("FAIL: unable to modify thawed clone");
2154     }
2155     delete thawed;
2156 
2157     frozen.set(5, 55);
2158     if(frozen!=idSet || !(frozen==idSet)) {
2159         errln("FAIL: UnicodeSet::set() modified a frozen set");
2160     }
2161 
2162     frozen.clear();
2163     if(frozen!=idSet || !(frozen==idSet)) {
2164         errln("FAIL: UnicodeSet::clear() modified a frozen set");
2165     }
2166 
2167     frozen.closeOver(USET_CASE_INSENSITIVE);
2168     if(frozen!=idSet || !(frozen==idSet)) {
2169         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2170     }
2171 
2172     frozen.compact();
2173     if(frozen!=idSet || !(frozen==idSet)) {
2174         errln("FAIL: UnicodeSet::compact() modified a frozen set");
2175     }
2176 
2177     ParsePosition pos;
2178     frozen.
2179         applyPattern(wsPattern, errorCode).
2180         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2181         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2182         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2183         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2184     if(frozen!=idSet || !(frozen==idSet)) {
2185         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2186     }
2187 
2188     frozen.
2189         add(0xd800).
2190         add(0xd802, 0xd805).
2191         add(wsPattern).
2192         addAll(idPattern).
2193         addAll(wsSet);
2194     if(frozen!=idSet || !(frozen==idSet)) {
2195         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2196     }
2197 
2198     frozen.
2199         retain(0x62).
2200         retain(0x64, 0x69).
2201         retainAll(wsPattern).
2202         retainAll(wsSet);
2203     if(frozen!=idSet || !(frozen==idSet)) {
2204         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2205     }
2206 
2207     frozen.
2208         remove(0x62).
2209         remove(0x64, 0x69).
2210         remove(idPattern).
2211         removeAll(idPattern).
2212         removeAll(idSet);
2213     if(frozen!=idSet || !(frozen==idSet)) {
2214         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2215     }
2216 
2217     frozen.
2218         complement().
2219         complement(0x62).
2220         complement(0x64, 0x69).
2221         complement(idPattern).
2222         complementAll(idPattern).
2223         complementAll(idSet);
2224     if(frozen!=idSet || !(frozen==idSet)) {
2225         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2226     }
2227 }
2228 
2229 // Test span() etc. -------------------------------------------------------- ***
2230 
2231 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2232 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2233 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2234     UErrorCode errorCode=U_ZERO_ERROR;
2235     int32_t length8=0;
2236     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2237     if(U_SUCCESS(errorCode)) {
2238         return length8;
2239     } else {
2240         // The string contains an unpaired surrogate.
2241         // Ignore this string.
2242         return 0;
2243     }
2244 }
2245 
2246 class UnicodeSetWithStringsIterator;
2247 
2248 // Make the strings in a UnicodeSet easily accessible.
2249 class UnicodeSetWithStrings {
2250 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2251     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2252             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2253         int32_t size=set.size();
2254         if(size>0 && set.charAt(size-1)<0) {
2255             // If a set's last element is not a code point, then it must contain strings.
2256             // Iterate over the set, skip all code point ranges, and cache the strings.
2257             // Convert them to UTF-8 for spanUTF8().
2258             UnicodeSetIterator iter(set);
2259             const UnicodeString *s;
2260             char *s8=utf8;
2261             int32_t length8, utf8Count=0;
2262             while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2263                 if(iter.isString()) {
2264                     // Store the pointer to the set's string element
2265                     // which we happen to know is a stable pointer.
2266                     strings[stringsLength]=s=&iter.getString();
2267                     utf8Count+=
2268                         utf8Lengths[stringsLength]=length8=
2269                         appendUTF8(s->getBuffer(), s->length(),
2270                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
2271                     if(length8==0) {
2272                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
2273                     }
2274                     s8+=length8;
2275                     ++stringsLength;
2276                 }
2277             }
2278         }
2279     }
2280 
getSet() const2281     const UnicodeSet &getSet() const {
2282         return set;
2283     }
2284 
hasStrings() const2285     UBool hasStrings() const {
2286         return (UBool)(stringsLength>0);
2287     }
2288 
hasStringsWithSurrogates() const2289     UBool hasStringsWithSurrogates() const {
2290         return hasSurrogates;
2291     }
2292 
2293 private:
2294     friend class UnicodeSetWithStringsIterator;
2295 
2296     const UnicodeSet &set;
2297 
2298     const UnicodeString *strings[20];
2299     int32_t stringsLength;
2300     UBool hasSurrogates;
2301 
2302     char utf8[1024];
2303     int32_t utf8Lengths[20];
2304 
2305     int32_t nextStringIndex;
2306     int32_t nextUTF8Start;
2307 };
2308 
2309 class UnicodeSetWithStringsIterator {
2310 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2311     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2312             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2313     }
2314 
reset()2315     void reset() {
2316         nextStringIndex=nextUTF8Start=0;
2317     }
2318 
nextString()2319     const UnicodeString *nextString() {
2320         if(nextStringIndex<fSet.stringsLength) {
2321             return fSet.strings[nextStringIndex++];
2322         } else {
2323             return NULL;
2324         }
2325     }
2326 
2327     // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2328     const char *nextUTF8(int32_t &length) {
2329         if(nextStringIndex<fSet.stringsLength) {
2330             const char *s8=fSet.utf8+nextUTF8Start;
2331             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2332             return s8;
2333         } else {
2334             length=0;
2335             return NULL;
2336         }
2337     }
2338 
2339 private:
2340     const UnicodeSetWithStrings &fSet;
2341     int32_t nextStringIndex;
2342     int32_t nextUTF8Start;
2343 };
2344 
2345 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2346 // at code point boundaries.
2347 // That is, each edge of a match must not be in the middle of a surrogate pair.
2348 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2349 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2350     s+=start;
2351     limit-=start;
2352     int32_t length=t.length();
2353     return 0==t.compare(s, length) &&
2354            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2355            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2356 }
2357 
2358 // Implement span() with contains() for comparison.
containsSpanUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2359 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2360                                  USetSpanCondition spanCondition) {
2361     const UnicodeSet &realSet(set.getSet());
2362     if(!set.hasStrings()) {
2363         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2364             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2365         }
2366 
2367         UChar32 c;
2368         int32_t start=0, prev;
2369         while((prev=start)<length) {
2370             U16_NEXT(s, start, length, c);
2371             if(realSet.contains(c)!=spanCondition) {
2372                 break;
2373             }
2374         }
2375         return prev;
2376     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2377         UnicodeSetWithStringsIterator iter(set);
2378         UChar32 c;
2379         int32_t start, next;
2380         for(start=next=0; start<length;) {
2381             U16_NEXT(s, next, length, c);
2382             if(realSet.contains(c)) {
2383                 break;
2384             }
2385             const UnicodeString *str;
2386             iter.reset();
2387             while((str=iter.nextString())!=NULL) {
2388                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2389                     // spanNeedsStrings=TRUE;
2390                     return start;
2391                 }
2392             }
2393             start=next;
2394         }
2395         return start;
2396     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2397         UnicodeSetWithStringsIterator iter(set);
2398         UChar32 c;
2399         int32_t start, next, maxSpanLimit=0;
2400         for(start=next=0; start<length;) {
2401             U16_NEXT(s, next, length, c);
2402             if(!realSet.contains(c)) {
2403                 next=start;  // Do not span this single, not-contained code point.
2404             }
2405             const UnicodeString *str;
2406             iter.reset();
2407             while((str=iter.nextString())!=NULL) {
2408                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2409                     // spanNeedsStrings=TRUE;
2410                     int32_t matchLimit=start+str->length();
2411                     if(matchLimit==length) {
2412                         return length;
2413                     }
2414                     if(spanCondition==USET_SPAN_CONTAINED) {
2415                         // Iterate for the shortest match at each position.
2416                         // Recurse for each but the shortest match.
2417                         if(next==start) {
2418                             next=matchLimit;  // First match from start.
2419                         } else {
2420                             if(matchLimit<next) {
2421                                 // Remember shortest match from start for iteration.
2422                                 int32_t temp=next;
2423                                 next=matchLimit;
2424                                 matchLimit=temp;
2425                             }
2426                             // Recurse for non-shortest match from start.
2427                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2428                                                                  USET_SPAN_CONTAINED);
2429                             if((matchLimit+spanLength)>maxSpanLimit) {
2430                                 maxSpanLimit=matchLimit+spanLength;
2431                                 if(maxSpanLimit==length) {
2432                                     return length;
2433                                 }
2434                             }
2435                         }
2436                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2437                         if(matchLimit>next) {
2438                             // Remember longest match from start.
2439                             next=matchLimit;
2440                         }
2441                     }
2442                 }
2443             }
2444             if(next==start) {
2445                 break;  // No match from start.
2446             }
2447             start=next;
2448         }
2449         if(start>maxSpanLimit) {
2450             return start;
2451         } else {
2452             return maxSpanLimit;
2453         }
2454     }
2455 }
2456 
containsSpanBackUTF16(const UnicodeSetWithStrings & set,const UChar * s,int32_t length,USetSpanCondition spanCondition)2457 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2458                                      USetSpanCondition spanCondition) {
2459     if(length==0) {
2460         return 0;
2461     }
2462     const UnicodeSet &realSet(set.getSet());
2463     if(!set.hasStrings()) {
2464         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2465             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2466         }
2467 
2468         UChar32 c;
2469         int32_t prev=length;
2470         do {
2471             U16_PREV(s, 0, length, c);
2472             if(realSet.contains(c)!=spanCondition) {
2473                 break;
2474             }
2475         } while((prev=length)>0);
2476         return prev;
2477     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2478         UnicodeSetWithStringsIterator iter(set);
2479         UChar32 c;
2480         int32_t prev=length, length0=length;
2481         do {
2482             U16_PREV(s, 0, length, c);
2483             if(realSet.contains(c)) {
2484                 break;
2485             }
2486             const UnicodeString *str;
2487             iter.reset();
2488             while((str=iter.nextString())!=NULL) {
2489                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2490                     // spanNeedsStrings=TRUE;
2491                     return prev;
2492                 }
2493             }
2494         } while((prev=length)>0);
2495         return prev;
2496     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2497         UnicodeSetWithStringsIterator iter(set);
2498         UChar32 c;
2499         int32_t prev=length, minSpanStart=length, length0=length;
2500         do {
2501             U16_PREV(s, 0, length, c);
2502             if(!realSet.contains(c)) {
2503                 length=prev;  // Do not span this single, not-contained code point.
2504             }
2505             const UnicodeString *str;
2506             iter.reset();
2507             while((str=iter.nextString())!=NULL) {
2508                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2509                     // spanNeedsStrings=TRUE;
2510                     int32_t matchStart=prev-str->length();
2511                     if(matchStart==0) {
2512                         return 0;
2513                     }
2514                     if(spanCondition==USET_SPAN_CONTAINED) {
2515                         // Iterate for the shortest match at each position.
2516                         // Recurse for each but the shortest match.
2517                         if(length==prev) {
2518                             length=matchStart;  // First match from prev.
2519                         } else {
2520                             if(matchStart>length) {
2521                                 // Remember shortest match from prev for iteration.
2522                                 int32_t temp=length;
2523                                 length=matchStart;
2524                                 matchStart=temp;
2525                             }
2526                             // Recurse for non-shortest match from prev.
2527                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2528                                                                     USET_SPAN_CONTAINED);
2529                             if(spanStart<minSpanStart) {
2530                                 minSpanStart=spanStart;
2531                                 if(minSpanStart==0) {
2532                                     return 0;
2533                                 }
2534                             }
2535                         }
2536                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2537                         if(matchStart<length) {
2538                             // Remember longest match from prev.
2539                             length=matchStart;
2540                         }
2541                     }
2542                 }
2543             }
2544             if(length==prev) {
2545                 break;  // No match from prev.
2546             }
2547         } while((prev=length)>0);
2548         if(prev<minSpanStart) {
2549             return prev;
2550         } else {
2551             return minSpanStart;
2552         }
2553     }
2554 }
2555 
containsSpanUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2556 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2557                                 USetSpanCondition spanCondition) {
2558     const UnicodeSet &realSet(set.getSet());
2559     if(!set.hasStrings()) {
2560         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2561             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2562         }
2563 
2564         UChar32 c;
2565         int32_t start=0, prev;
2566         while((prev=start)<length) {
2567             U8_NEXT(s, start, length, c);
2568             if(c<0) {
2569                 c=0xfffd;
2570             }
2571             if(realSet.contains(c)!=spanCondition) {
2572                 break;
2573             }
2574         }
2575         return prev;
2576     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2577         UnicodeSetWithStringsIterator iter(set);
2578         UChar32 c;
2579         int32_t start, next;
2580         for(start=next=0; start<length;) {
2581             U8_NEXT(s, next, length, c);
2582             if(c<0) {
2583                 c=0xfffd;
2584             }
2585             if(realSet.contains(c)) {
2586                 break;
2587             }
2588             const char *s8;
2589             int32_t length8;
2590             iter.reset();
2591             while((s8=iter.nextUTF8(length8))!=NULL) {
2592                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2593                     // spanNeedsStrings=TRUE;
2594                     return start;
2595                 }
2596             }
2597             start=next;
2598         }
2599         return start;
2600     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2601         UnicodeSetWithStringsIterator iter(set);
2602         UChar32 c;
2603         int32_t start, next, maxSpanLimit=0;
2604         for(start=next=0; start<length;) {
2605             U8_NEXT(s, next, length, c);
2606             if(c<0) {
2607                 c=0xfffd;
2608             }
2609             if(!realSet.contains(c)) {
2610                 next=start;  // Do not span this single, not-contained code point.
2611             }
2612             const char *s8;
2613             int32_t length8;
2614             iter.reset();
2615             while((s8=iter.nextUTF8(length8))!=NULL) {
2616                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
2617                     // spanNeedsStrings=TRUE;
2618                     int32_t matchLimit=start+length8;
2619                     if(matchLimit==length) {
2620                         return length;
2621                     }
2622                     if(spanCondition==USET_SPAN_CONTAINED) {
2623                         // Iterate for the shortest match at each position.
2624                         // Recurse for each but the shortest match.
2625                         if(next==start) {
2626                             next=matchLimit;  // First match from start.
2627                         } else {
2628                             if(matchLimit<next) {
2629                                 // Remember shortest match from start for iteration.
2630                                 int32_t temp=next;
2631                                 next=matchLimit;
2632                                 matchLimit=temp;
2633                             }
2634                             // Recurse for non-shortest match from start.
2635                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
2636                                                                 USET_SPAN_CONTAINED);
2637                             if((matchLimit+spanLength)>maxSpanLimit) {
2638                                 maxSpanLimit=matchLimit+spanLength;
2639                                 if(maxSpanLimit==length) {
2640                                     return length;
2641                                 }
2642                             }
2643                         }
2644                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2645                         if(matchLimit>next) {
2646                             // Remember longest match from start.
2647                             next=matchLimit;
2648                         }
2649                     }
2650                 }
2651             }
2652             if(next==start) {
2653                 break;  // No match from start.
2654             }
2655             start=next;
2656         }
2657         if(start>maxSpanLimit) {
2658             return start;
2659         } else {
2660             return maxSpanLimit;
2661         }
2662     }
2663 }
2664 
containsSpanBackUTF8(const UnicodeSetWithStrings & set,const char * s,int32_t length,USetSpanCondition spanCondition)2665 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
2666                                     USetSpanCondition spanCondition) {
2667     if(length==0) {
2668         return 0;
2669     }
2670     const UnicodeSet &realSet(set.getSet());
2671     if(!set.hasStrings()) {
2672         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2673             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2674         }
2675 
2676         UChar32 c;
2677         int32_t prev=length;
2678         do {
2679             U8_PREV(s, 0, length, c);
2680             if(c<0) {
2681                 c=0xfffd;
2682             }
2683             if(realSet.contains(c)!=spanCondition) {
2684                 break;
2685             }
2686         } while((prev=length)>0);
2687         return prev;
2688     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2689         UnicodeSetWithStringsIterator iter(set);
2690         UChar32 c;
2691         int32_t prev=length;
2692         do {
2693             U8_PREV(s, 0, length, c);
2694             if(c<0) {
2695                 c=0xfffd;
2696             }
2697             if(realSet.contains(c)) {
2698                 break;
2699             }
2700             const char *s8;
2701             int32_t length8;
2702             iter.reset();
2703             while((s8=iter.nextUTF8(length8))!=NULL) {
2704                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2705                     // spanNeedsStrings=TRUE;
2706                     return prev;
2707                 }
2708             }
2709         } while((prev=length)>0);
2710         return prev;
2711     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2712         UnicodeSetWithStringsIterator iter(set);
2713         UChar32 c;
2714         int32_t prev=length, minSpanStart=length;
2715         do {
2716             U8_PREV(s, 0, length, c);
2717             if(c<0) {
2718                 c=0xfffd;
2719             }
2720             if(!realSet.contains(c)) {
2721                 length=prev;  // Do not span this single, not-contained code point.
2722             }
2723             const char *s8;
2724             int32_t length8;
2725             iter.reset();
2726             while((s8=iter.nextUTF8(length8))!=NULL) {
2727                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
2728                     // spanNeedsStrings=TRUE;
2729                     int32_t matchStart=prev-length8;
2730                     if(matchStart==0) {
2731                         return 0;
2732                     }
2733                     if(spanCondition==USET_SPAN_CONTAINED) {
2734                         // Iterate for the shortest match at each position.
2735                         // Recurse for each but the shortest match.
2736                         if(length==prev) {
2737                             length=matchStart;  // First match from prev.
2738                         } else {
2739                             if(matchStart>length) {
2740                                 // Remember shortest match from prev for iteration.
2741                                 int32_t temp=length;
2742                                 length=matchStart;
2743                                 matchStart=temp;
2744                             }
2745                             // Recurse for non-shortest match from prev.
2746                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
2747                                                                    USET_SPAN_CONTAINED);
2748                             if(spanStart<minSpanStart) {
2749                                 minSpanStart=spanStart;
2750                                 if(minSpanStart==0) {
2751                                     return 0;
2752                                 }
2753                             }
2754                         }
2755                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
2756                         if(matchStart<length) {
2757                             // Remember longest match from prev.
2758                             length=matchStart;
2759                         }
2760                     }
2761                 }
2762             }
2763             if(length==prev) {
2764                 break;  // No match from prev.
2765             }
2766         } while((prev=length)>0);
2767         if(prev<minSpanStart) {
2768             return prev;
2769         } else {
2770             return minSpanStart;
2771         }
2772     }
2773 }
2774 
// Bit flags that select which spans are performed and compared.
// Each group has one bit per choice plus a mask that combines the group's bits.
enum {
    // Encoding forms of the text.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original (uncomplemented) sets and/or their complements.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Span directions.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Which USetSpanCondition is used for "contained" spans.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Everything.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2795 
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2796 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2797     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2798 }
2799 
slen(const void * s,UBool isUTF16)2800 static inline int32_t slen(const void *s, UBool isUTF16) {
2801     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2802 }
2803 
2804 /*
2805  * Count spans on a string with the method according to type and set the span limits.
2806  * The set may be the complement of the original.
2807  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2808  * according to the expected number of spans.
2809  * Sets typeName to an empty string if there is no such type.
2810  * Returns -1 if the span option is filtered out.
2811  */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)2812 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2813                         const void *s, int32_t length, UBool isUTF16,
2814                         uint32_t whichSpans,
2815                         int type, const char *&typeName,
2816                         int32_t limits[], int32_t limitsCapacity,
2817                         int32_t expectCount) {
2818     const UnicodeSet &realSet(set.getSet());
2819     int32_t start, count;
2820     USetSpanCondition spanCondition, firstSpanCondition, contained;
2821     UBool isForward;
2822 
2823     if(type<0 || 7<type) {
2824         typeName="";
2825         return 0;
2826     }
2827 
2828     static const char *const typeNames16[]={
2829         "contains", "contains(LM)",
2830         "span", "span(LM)",
2831         "containsBack", "containsBack(LM)",
2832         "spanBack", "spanBack(LM)"
2833     };
2834 
2835     static const char *const typeNames8[]={
2836         "containsUTF8", "containsUTF8(LM)",
2837         "spanUTF8", "spanUTF8(LM)",
2838         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2839         "spanBackUTF8", "spanBackUTF8(LM)"
2840     };
2841 
2842     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2843 
2844     // filter span options
2845     if(type<=3) {
2846         // span forward
2847         if((whichSpans&SPAN_FWD)==0) {
2848             return -1;
2849         }
2850         isForward=TRUE;
2851     } else {
2852         // span backward
2853         if((whichSpans&SPAN_BACK)==0) {
2854             return -1;
2855         }
2856         isForward=FALSE;
2857     }
2858     if((type&1)==0) {
2859         // use USET_SPAN_CONTAINED
2860         if((whichSpans&SPAN_CONTAINED)==0) {
2861             return -1;
2862         }
2863         contained=USET_SPAN_CONTAINED;
2864     } else {
2865         // use USET_SPAN_SIMPLE
2866         if((whichSpans&SPAN_SIMPLE)==0) {
2867             return -1;
2868         }
2869         contained=USET_SPAN_SIMPLE;
2870     }
2871 
2872     // Default first span condition for going forward with an uncomplemented set.
2873     spanCondition=USET_SPAN_NOT_CONTAINED;
2874     if(isComplement) {
2875         spanCondition=invertSpanCondition(spanCondition, contained);
2876     }
2877 
2878     // First span condition for span(), used to terminate the spanBack() iteration.
2879     firstSpanCondition=spanCondition;
2880 
2881     // spanBack(): Its initial span condition is span()'s last span condition,
2882     // which is the opposite of span()'s first span condition
2883     // if we expect an even number of spans.
2884     // (The loop inverts spanCondition (expectCount-1) times
2885     // before the expectCount'th span() call.)
2886     // If we do not compare forward and backward directions, then we do not have an
2887     // expectCount and just start with firstSpanCondition.
2888     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2889         spanCondition=invertSpanCondition(spanCondition, contained);
2890     }
2891 
2892     count=0;
2893     switch(type) {
2894     case 0:
2895     case 1:
2896         start=0;
2897         if(length<0) {
2898             length=slen(s, isUTF16);
2899         }
2900         for(;;) {
2901             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2902                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2903             if(count<limitsCapacity) {
2904                 limits[count]=start;
2905             }
2906             ++count;
2907             if(start>=length) {
2908                 break;
2909             }
2910             spanCondition=invertSpanCondition(spanCondition, contained);
2911         }
2912         break;
2913     case 2:
2914     case 3:
2915         start=0;
2916         for(;;) {
2917             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2918                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2919             if(count<limitsCapacity) {
2920                 limits[count]=start;
2921             }
2922             ++count;
2923             if(length>=0 ? start>=length :
2924                            isUTF16 ? ((const UChar *)s)[start]==0 :
2925                                      ((const char *)s)[start]==0
2926             ) {
2927                 break;
2928             }
2929             spanCondition=invertSpanCondition(spanCondition, contained);
2930         }
2931         break;
2932     case 4:
2933     case 5:
2934         if(length<0) {
2935             length=slen(s, isUTF16);
2936         }
2937         for(;;) {
2938             ++count;
2939             if(count<=limitsCapacity) {
2940                 limits[limitsCapacity-count]=length;
2941             }
2942             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
2943                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
2944             if(length==0 && spanCondition==firstSpanCondition) {
2945                 break;
2946             }
2947             spanCondition=invertSpanCondition(spanCondition, contained);
2948         }
2949         if(count<limitsCapacity) {
2950             memmove(limits, limits+(limitsCapacity-count), count*4);
2951         }
2952         break;
2953     case 6:
2954     case 7:
2955         for(;;) {
2956             ++count;
2957             if(count<=limitsCapacity) {
2958                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
2959             }
2960             // Note: Length<0 is tested only for the first spanBack().
2961             // If we wanted to keep length<0 for all spanBack()s, we would have to
2962             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
2963             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
2964                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
2965             if(length==0 && spanCondition==firstSpanCondition) {
2966                 break;
2967             }
2968             spanCondition=invertSpanCondition(spanCondition, contained);
2969         }
2970         if(count<limitsCapacity) {
2971             memmove(limits, limits+(limitsCapacity-count), count*4);
2972         }
2973         break;
2974     default:
2975         typeName="";
2976         return -1;
2977     }
2978 
2979     return count;
2980 }
2981 
// Indexes of the sets to be tested.
// An odd index means that the set is a complement (isComplement).
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};

// Names of the sets for failure messages, indexed by the constants above.
static const char *const setNames[SET_COUNT]={
    /* SLOW     */ "slow",
    /* SLOW_NOT */ "slow.not",
    /* FAST     */ "fast",
    /* FAST_NOT */ "fast.not"
};
2997 
2998 /*
2999  * Verify that we get the same results whether we look at text with contains(),
3000  * span() or spanBack(), using unfrozen or frozen versions of the set,
3001  * and using the set or its complement (switching the spanConditions accordingly).
3002  * The latter verifies that
3003  *   set.span(spanCondition) == set.complement().span(!spanCondition).
3004  *
3005  * The expectLimits[] are either provided by the caller (with expectCount>=0)
3006  * or returned to the caller (with an input expectCount<0).
3007  */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3008 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3009                               const void *s, int32_t length, UBool isUTF16,
3010                               uint32_t whichSpans,
3011                               int32_t expectLimits[], int32_t &expectCount,
3012                               const char *testName, int32_t index) {
3013     int32_t limits[500];
3014     int32_t limitsCount;
3015     int i, j;
3016 
3017     const char *typeName;
3018     int type;
3019 
3020     for(i=0; i<SET_COUNT; ++i) {
3021         if((i&1)==0) {
3022             // Even-numbered sets are original, uncomplemented sets.
3023             if((whichSpans&SPAN_SET)==0) {
3024                 continue;
3025             }
3026         } else {
3027             // Odd-numbered sets are complemented.
3028             if((whichSpans&SPAN_COMPLEMENT)==0) {
3029                 continue;
3030             }
3031         }
3032         for(type=0;; ++type) {
3033             limitsCount=getSpans(*sets[i], (UBool)(i&1),
3034                                  s, length, isUTF16,
3035                                  whichSpans,
3036                                  type, typeName,
3037                                  limits, LENGTHOF(limits), expectCount);
3038             if(typeName[0]==0) {
3039                 break; // All types tried.
3040             }
3041             if(limitsCount<0) {
3042                 continue; // Span option filtered out.
3043             }
3044             if(expectCount<0) {
3045                 expectCount=limitsCount;
3046                 if(limitsCount>LENGTHOF(limits)) {
3047                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3048                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
3049                     return;
3050                 }
3051                 memcpy(expectLimits, limits, limitsCount*4);
3052             } else if(limitsCount!=expectCount) {
3053                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3054                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3055             } else {
3056                 for(j=0; j<limitsCount; ++j) {
3057                     if(limits[j]!=expectLimits[j]) {
3058                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3059                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
3060                               j, (long)limits[j], (long)expectLimits[j]);
3061                         break;
3062                     }
3063                 }
3064             }
3065         }
3066     }
3067 
3068     // Compare span() with containsAll()/containsNone(),
3069     // but only if we have expectLimits[] from the uncomplemented set.
3070     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3071         const UChar *s16=(const UChar *)s;
3072         UnicodeString string;
3073         int32_t prev=0, limit, length;
3074         for(i=0; i<expectCount; ++i) {
3075             limit=expectLimits[i];
3076             length=limit-prev;
3077             if(length>0) {
3078                 string.setTo(FALSE, s16+prev, length);  // read-only alias
3079                 if(i&1) {
3080                     if(!sets[SLOW]->getSet().containsAll(string)) {
3081                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3082                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3083                         return;
3084                     }
3085                     if(!sets[FAST]->getSet().containsAll(string)) {
3086                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3087                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3088                         return;
3089                     }
3090                 } else {
3091                     if(!sets[SLOW]->getSet().containsNone(string)) {
3092                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3093                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3094                         return;
3095                     }
3096                     if(!sets[FAST]->getSet().containsNone(string)) {
3097                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3098                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3099                         return;
3100                     }
3101                 }
3102             }
3103             prev=limit;
3104         }
3105     }
3106 }
3107 
3108 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3109 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3110                               const void *s, int32_t length, UBool isUTF16,
3111                               uint32_t whichSpans,
3112                               const char *testName, int32_t index) {
3113     int32_t expectLimits[500];
3114     int32_t expectCount=-1;
3115     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3116 }
3117 
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3118 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3119     UChar c, c2;
3120 
3121     if(length>=0) {
3122         while(length>0) {
3123             c=*s++;
3124             --length;
3125             if(0xd800<=c && c<0xe000) {
3126                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3127                     return TRUE;
3128                 }
3129                 --length;
3130             }
3131         }
3132     } else {
3133         while((c=*s++)!=0) {
3134             if(0xd800<=c && c<0xe000) {
3135                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3136                     return TRUE;
3137                 }
3138             }
3139         }
3140     }
3141     return FALSE;
3142 }
3143 
3144 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3145 // unless either UTF is turned off in whichSpans.
3146 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3147 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3148 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3149                                       const UChar *s16, int32_t length16,
3150                                       uint32_t whichSpans,
3151                                       const char *testName, int32_t index) {
3152     int32_t expectLimits[500];
3153     int32_t expectCount;
3154 
3155     expectCount=-1;  // Get expectLimits[] from testSpan().
3156 
3157     if((whichSpans&SPAN_UTF16)!=0) {
3158         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3159     }
3160     if((whichSpans&SPAN_UTF8)==0) {
3161         return;
3162     }
3163 
3164     // Convert s16[] and expectLimits[] to UTF-8.
3165     uint8_t s8[3000];
3166     int32_t offsets[3000];
3167 
3168     const UChar *s16Limit=s16+length16;
3169     char *t=(char *)s8;
3170     char *tLimit=t+sizeof(s8);
3171     int32_t *o=offsets;
3172     UErrorCode errorCode=U_ZERO_ERROR;
3173 
3174     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3175     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3176     if(U_FAILURE(errorCode)) {
3177         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3178               testName, (long)index, u_errorName(errorCode));
3179         ucnv_resetFromUnicode(utf8Cnv);
3180         return;
3181     }
3182     int32_t length8=(int32_t)(t-(char *)s8);
3183 
3184     // Convert expectLimits[].
3185     int32_t i, j, expect;
3186     for(i=j=0; i<expectCount; ++i) {
3187         expect=expectLimits[i];
3188         if(expect==length16) {
3189             expectLimits[i]=length8;
3190         } else {
3191             while(offsets[j]<expect) {
3192                 ++j;
3193             }
3194             expectLimits[i]=j;
3195         }
3196     }
3197 
3198     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3199 }
3200 
nextCodePoint(UChar32 c)3201 static UChar32 nextCodePoint(UChar32 c) {
3202     // Skip some large and boring ranges.
3203     switch(c) {
3204     case 0x3441:
3205         return 0x4d7f;
3206     case 0x5100:
3207         return 0x9f00;
3208     case 0xb040:
3209         return 0xd780;
3210     case 0xe041:
3211         return 0xf8fe;
3212     case 0x10100:
3213         return 0x20000;
3214     case 0x20041:
3215         return 0xe0000;
3216     case 0xe0101:
3217         return 0x10fffd;
3218     default:
3219         return c+1;
3220     }
3221 }
3222 
3223 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3224 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3225     // contains(U+FFFD) is inconsistent with contains(some surrogates),
3226     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3227     // Skip the UTF-8 part of the test - if the string contains surrogates -
3228     // because it is likely to produce a different result.
3229     UBool inconsistentSurrogates=
3230             (!(sets[0]->getSet().contains(0xfffd) ?
3231                sets[0]->getSet().contains(0xd800, 0xdfff) :
3232                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3233              sets[0]->hasStringsWithSurrogates());
3234 
3235     UChar s[1000];
3236     int32_t length=0;
3237     uint32_t localWhichSpans;
3238 
3239     UChar32 c, first;
3240     for(first=c=0;; c=nextCodePoint(c)) {
3241         if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3242             localWhichSpans=whichSpans;
3243             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3244                 localWhichSpans&=~SPAN_UTF8;
3245             }
3246             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3247             if(c>0x10ffff) {
3248                 break;
3249             }
3250             length=0;
3251             first=c;
3252         }
3253         U16_APPEND_UNSAFE(s, length, c);
3254     }
3255 }
3256 
3257 // Test with a particular, interesting string.
3258 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3259 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3260     static const UChar s[]={
3261         0x61, 0x62, 0x20,                       // Latin, space
3262         0x3b1, 0x3b2, 0x3b3,                    // Greek
3263         0xd900,                                 // lead surrogate
3264         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3265         0xdc05,                                 // trail surrogate
3266         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3267         0xd900, 0xdc05,                         // unassigned supplementary
3268         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3269         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3270         0                                       // NUL
3271     };
3272 
3273     if((whichSpans&SPAN_UTF16)==0) {
3274         return;
3275     }
3276     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3277     testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3278 }
3279 
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3280 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3281     static const char s[]={
3282         "abc"                                   // Latin
3283 
3284         /* trail byte in lead position */
3285         "\x80"
3286 
3287         " "                                     // space
3288 
3289         /* truncated multi-byte sequences */
3290         "\xd0"
3291         "\xe0"
3292         "\xe1"
3293         "\xed"
3294         "\xee"
3295         "\xf0"
3296         "\xf1"
3297         "\xf4"
3298         "\xf8"
3299         "\xfc"
3300 
3301         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3302 
3303         /* trail byte in lead position */
3304         "\x80"
3305 
3306         "\xe0\x80"
3307         "\xe0\xa0"
3308         "\xe1\x80"
3309         "\xed\x80"
3310         "\xed\xa0"
3311         "\xee\x80"
3312         "\xf0\x80"
3313         "\xf0\x90"
3314         "\xf1\x80"
3315         "\xf4\x80"
3316         "\xf4\x90"
3317         "\xf8\x80"
3318         "\xfc\x80"
3319 
3320         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3321 
3322         /* trail byte in lead position */
3323         "\x80"
3324 
3325         "\xf0\x80\x80"
3326         "\xf0\x90\x80"
3327         "\xf1\x80\x80"
3328         "\xf4\x80\x80"
3329         "\xf4\x90\x80"
3330         "\xf8\x80\x80"
3331         "\xfc\x80\x80"
3332 
3333         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3334 
3335         /* trail byte in lead position */
3336         "\x80"
3337 
3338         "\xf8\x80\x80\x80"
3339         "\xfc\x80\x80\x80"
3340 
3341         "\xF1\x90\x80\x85"                      // unassigned supplementary
3342 
3343         /* trail byte in lead position */
3344         "\x80"
3345 
3346         "\xfc\x80\x80\x80\x80"
3347 
3348         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3349 
3350         /* trail byte in lead position */
3351         "\x80"
3352 
3353         /* complete sequences but non-shortest forms or out of range etc. */
3354         "\xc0\x80"
3355         "\xe0\x80\x80"
3356         "\xed\xa0\x80"
3357         "\xf0\x80\x80\x80"
3358         "\xf4\x90\x80\x80"
3359         "\xf8\x80\x80\x80\x80"
3360         "\xfc\x80\x80\x80\x80\x80"
3361         "\xfe"
3362         "\xff"
3363 
3364         /* trail byte in lead position */
3365         "\x80"
3366 
3367         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3368     };
3369 
3370     if((whichSpans&SPAN_UTF8)==0) {
3371         return;
3372     }
3373     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3374     testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3375 }
3376 
// Take a set of span options and multiply them so that
// each portion only has one of the options a, b and c.
// If b==0, then the set of options is just modified with mask and a.
// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
//
// whichSpans      - array of option words. The first whichSpansCount entries
//                   are rewritten in place as (entry&mask)|a; when b!=0 a
//                   second group (entry&mask)|b is written right after them,
//                   and when additionally c!=0 a third group (entry&mask)|c
//                   follows. No bounds are checked here: the caller must
//                   provide room for the returned number of entries.
// whichSpansCount - number of valid entries on input.
// mask            - bits kept from each existing entry.
// a, b, c         - the alternatives; c is ignored when b==0.
// Returns the new number of valid entries: whichSpansCount,
// 2*whichSpansCount or 3*whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    uint32_t s;
    int32_t i;

    for(i=0; i<whichSpansCount; ++i) {
        s=whichSpans[i]&mask;
        whichSpans[i]=s|a;
        if(b!=0) {
            whichSpans[whichSpansCount+i]=s|b;
            if(c!=0) {
                whichSpans[2*whichSpansCount+i]=s|c;
            }
        }
    }
    return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount;
}
3399 
// String-literal building blocks for TestSpan(): runs of 63 or 64 copies of
// 'a' or 'b'. Adjacent uses concatenate, e.g. _64_a _64_a _64_a _63_a is 255
// copies of 'a', which lets the test data straddle the 254-code-point span
// limit mentioned there.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3404 
TestSpan()3405 void UnicodeSetTest::TestSpan() {
3406     // "[...]" is a UnicodeSet pattern.
3407     // "*" performs tests on all Unicode code points and on a selection of
3408     //   malformed UTF-8/16 strings.
3409     // "-options" limits the scope of testing for the current set.
3410     //   By default, the test verifies that equivalent boundaries are found
3411     //   for UTF-16 and UTF-8, going forward and backward,
3412     //   alternating USET_SPAN_NOT_CONTAINED with
3413     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3414     //   Single-character options:
3415     //     8 -- UTF-16 and UTF-8 boundaries may differ.
3416     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3417     //          or the set contains strings with unpaired surrogates
3418     //          which do not translate to valid UTF-8.
3419     //     c -- set.span() and set.complement().span() boundaries may differ.
3420     //          Cause: Set strings are not complemented.
3421     //     b -- span() and spanBack() boundaries may differ.
3422     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3423     //          and spanBack(USET_SPAN_SIMPLE) are defined to
3424     //          match with non-overlapping substrings.
3425     //          For example, with a set containing "ab" and "ba",
3426     //          span() of "aba" yields boundaries { 0, 2, 3 }
3427     //          because the initial "ab" matches from 0 to 2,
3428     //          while spanBack() yields boundaries { 0, 1, 3 }
3429     //          because the final "ba" matches from 1 to 3.
3430     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3431     //          Cause: Strings in the set overlap, and a longer match may
3432     //          require a sequence including non-longest substrings.
3433     //          For example, with a set containing "ab", "abc" and "cd",
3434     //          span(contained) of "abcd" spans the entire string
3435     //          but span(longest match) only spans the first 3 characters.
3436     //   Each "-options" first resets all options and then applies the specified options.
3437     //   A "-" without options resets the options.
3438     //   The options are also reset for each new set.
3439     // Other strings will be spanned.
3440     static const char *const testdata[]={
3441         "[:ID_Continue:]",
3442         "*",
3443         "[:White_Space:]",
3444         "*",
3445         "[]",
3446         "*",
3447         "[\\u0000-\\U0010FFFF]",
3448         "*",
3449         "[\\u0000\\u0080\\u0800\\U00010000]",
3450         "*",
3451         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3452         "*",
3453         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3454         "-c",
3455         "*",
3456         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3457         "-c",
3458         "*",
3459 
3460         // Overlapping strings cause overlapping attempts to match.
3461         "[x{xy}{xya}{axy}{ax}]",
3462         "-cl",
3463 
3464         // More repetitions of "xya" would take too long with the recursive
3465         // reference implementation.
3466         // containsAll()=FALSE
3467         // test_string 0x14
3468         "xx"
3469         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3470         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3471         "xyaxyaxyaxya"
3472         "xx"
3473         "xyaxyaxyaxya"  // span() ends here.
3474         "aaa",
3475 
3476         // containsAll()=TRUE
3477         // test_string 0x15
3478         "xx"
3479         "xyaxyaxyaxya"
3480         "xx"
3481         "xyaxyaxyaxya"
3482         "xx"
3483         "xyaxyaxyaxy",
3484 
3485         "-bc",
3486         // test_string 0x17
3487         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3488         "-c",
3489         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3490         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3491         "-",
3492         "byaya",     // span() -> { 5 }
3493         "byay",      // span() -> { 4 }
3494         "bya",       // span() -> { 3 }
3495 
3496         // span(longest match) will not span the whole string.
3497         "[a{ab}{bc}]",
3498         "-cl",
3499         // test_string 0x21
3500         "abc",
3501 
3502         "[a{ab}{abc}{cd}]",
3503         "-cl",
3504         "acdabcdabccd",
3505 
3506         // spanBack(longest match) will not span the whole string.
3507         "[c{ab}{bc}]",
3508         "-cl",
3509         "abc",
3510 
3511         "[d{cd}{bcd}{ab}]",
3512         "-cl",
3513         "abbcdabcdabd",
3514 
3515         // Test with non-ASCII set strings - test proper handling of surrogate pairs
3516         // and UTF-8 trail bytes.
3517         // Copies of above test sets and strings, but transliterated to have
3518         // different code points with similar trail units.
3519         // Previous: a      b         c            d
3520         // Unicode:  042B   30AB      200AB        204AB
3521         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3522         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3523         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3524         "-cl",
3525         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3526 
3527         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3528         "-cl",
3529         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3530 
3531         // Stress bookkeeping and recursion.
3532         // The following strings are barely doable with the recursive
3533         // reference implementation.
3534         // The not-contained character at the end prevents an early exit from the span().
3535         "[b{bb}]",
3536         "-c",
3537         // test_string 0x33
3538         "bbbbbbbbbbbbbbbbbbbbbbbb-",
3539         // On complement sets, span() and spanBack() get different results
3540         // because b is not in the complement set and there is an odd number of b's
3541         // in the test string.
3542         "-bc",
3543         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3544 
3545         // Test with set strings with an initial or final code point span
3546         // longer than 254.
3547         "[a{" _64_a _64_a _64_a _64_a "b}"
3548           "{a" _64_b _64_b _64_b _64_b "}]",
3549         "-c",
3550         _64_a _64_a _64_a _63_a "b",
3551         _64_a _64_a _64_a _64_a "b",
3552         _64_a _64_a _64_a _64_a "aaaabbbb",
3553         "a" _64_b _64_b _64_b _63_b,
3554         "a" _64_b _64_b _64_b _64_b,
3555         "aaaabbbb" _64_b _64_b _64_b _64_b,
3556 
3557         // Test with strings containing unpaired surrogates.
3558         // They are not representable in UTF-8, and a leading trail surrogate
3559         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3560         // U+20001 == \\uD840\\uDC01
3561         // U+20400 == \\uD841\\uDC00
3562         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3563         "-8cl",
3564         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3565     };
3566     uint32_t whichSpans[96]={ SPAN_ALL };
3567     int32_t whichSpansCount=1;
3568 
3569     UnicodeSet *sets[SET_COUNT]={ NULL };
3570     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3571 
3572     char testName[1024];
3573     char *testNameLimit=testName;
3574 
3575     int32_t i, j;
3576     for(i=0; i<LENGTHOF(testdata); ++i) {
3577         const char *s=testdata[i];
3578         if(s[0]=='[') {
3579             // Create new test sets from this pattern.
3580             for(j=0; j<SET_COUNT; ++j) {
3581                 delete sets_with_str[j];
3582                 delete sets[j];
3583             }
3584             UErrorCode errorCode=U_ZERO_ERROR;
3585             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3586             if(U_FAILURE(errorCode)) {
3587                 errln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3588                 break;
3589             }
3590             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3591             sets[SLOW_NOT]->complement();
3592             // Intermediate set: Test cloning of a frozen set.
3593             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3594             fast->freeze();
3595             sets[FAST]=(UnicodeSet *)fast->clone();
3596             delete fast;
3597             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3598             fastNot->freeze();
3599             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3600             delete fastNot;
3601 
3602             for(j=0; j<SET_COUNT; ++j) {
3603                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3604             }
3605 
3606             strcpy(testName, s);
3607             testNameLimit=strchr(testName, 0);
3608             *testNameLimit++=':';
3609             *testNameLimit=0;
3610 
3611             whichSpans[0]=SPAN_ALL;
3612             whichSpansCount=1;
3613         } else if(s[0]=='-') {
3614             whichSpans[0]=SPAN_ALL;
3615             whichSpansCount=1;
3616 
3617             while(*++s!=0) {
3618                 switch(*s) {
3619                 case 'c':
3620                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3621                                                    ~SPAN_POLARITY,
3622                                                    SPAN_SET,
3623                                                    SPAN_COMPLEMENT,
3624                                                    0);
3625                     break;
3626                 case 'b':
3627                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3628                                                    ~SPAN_DIRS,
3629                                                    SPAN_FWD,
3630                                                    SPAN_BACK,
3631                                                    0);
3632                     break;
3633                 case 'l':
3634                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
3635                     // USET_SPAN_SIMPLE only FWD, and separately
3636                     // USET_SPAN_SIMPLE only BACK
3637                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3638                                                    ~(SPAN_DIRS|SPAN_CONDITION),
3639                                                    SPAN_DIRS|SPAN_CONTAINED,
3640                                                    SPAN_FWD|SPAN_SIMPLE,
3641                                                    SPAN_BACK|SPAN_SIMPLE);
3642                     break;
3643                 case '8':
3644                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3645                                                    ~SPAN_UTFS,
3646                                                    SPAN_UTF16,
3647                                                    SPAN_UTF8,
3648                                                    0);
3649                     break;
3650                 default:
3651                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3652                     break;
3653                 }
3654             }
3655         } else if(0==strcmp(s, "*")) {
3656             strcpy(testNameLimit, "bad_string");
3657             for(j=0; j<whichSpansCount; ++j) {
3658                 if(whichSpansCount>1) {
3659                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
3660                             "%%0x%3x",
3661                             whichSpans[j]);
3662                 }
3663                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3664                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3665             }
3666 
3667             strcpy(testNameLimit, "contents");
3668             for(j=0; j<whichSpansCount; ++j) {
3669                 if(whichSpansCount>1) {
3670                     sprintf(testNameLimit+8 /* strlen("contents") */,
3671                             "%%0x%3x",
3672                             whichSpans[j]);
3673                 }
3674                 testSpanContents(sets_with_str, whichSpans[j], testName);
3675             }
3676         } else {
3677             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3678             strcpy(testNameLimit, "test_string");
3679             for(j=0; j<whichSpansCount; ++j) {
3680                 if(whichSpansCount>1) {
3681                     sprintf(testNameLimit+11 /* strlen("test_string") */,
3682                             "%%0x%3x",
3683                             whichSpans[j]);
3684                 }
3685                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3686             }
3687         }
3688     }
3689     for(j=0; j<SET_COUNT; ++j) {
3690         delete sets_with_str[j];
3691         delete sets[j];
3692     }
3693 }
3694 
3695 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3696 void UnicodeSetTest::TestStringSpan() {
3697     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3698     static const char *const string=
3699         "xx"
3700         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3701         "xx"
3702         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3703         "xx"
3704         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3705         "aaaa";
3706 
3707     UErrorCode errorCode=U_ZERO_ERROR;
3708     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3709     UnicodeSet set(pattern16, errorCode);
3710     if(U_FAILURE(errorCode)) {
3711         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3712         return;
3713     }
3714 
3715     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3716 
3717     if(set.containsAll(string16)) {
3718         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3719     }
3720 
3721     // Remove trailing "aaaa".
3722     string16.truncate(string16.length()-4);
3723     if(!set.containsAll(string16)) {
3724         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3725     }
3726 
3727     string16=UNICODE_STRING_SIMPLE("byayaxya");
3728     const UChar *s16=string16.getBuffer();
3729     int32_t length16=string16.length();
3730     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3731         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3732         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3733         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3734         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3735         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3736     ) {
3737         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3738     }
3739 
3740     pattern="[a{ab}{abc}{cd}]";
3741     pattern16=UnicodeString(pattern, -1, US_INV);
3742     set.applyPattern(pattern16, errorCode);
3743     if(U_FAILURE(errorCode)) {
3744         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3745         return;
3746     }
3747     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3748     s16=string16.getBuffer();
3749     length16=string16.length();
3750     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3751         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3752         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3753     ) {
3754         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3755     }
3756 
3757     pattern="[d{cd}{bcd}{ab}]";
3758     pattern16=UnicodeString(pattern, -1, US_INV);
3759     set.applyPattern(pattern16, errorCode).freeze();
3760     if(U_FAILURE(errorCode)) {
3761         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3762         return;
3763     }
3764     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3765     s16=string16.getBuffer();
3766     length16=string16.length();
3767     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3768         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3769         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3770     ) {
3771         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3772     }
3773 }
3774