1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 * 03/22/2000 Madhu Added additional tests
11 ********************************************************************************
12 */
13
14 #include <stdio.h>
15
16 #include <string.h>
17 #include "unicode/utypes.h"
18 #include "usettest.h"
19 #include "unicode/ucnv.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/usetiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/parsepos.h"
25 #include "unicode/symtable.h"
26 #include "unicode/uversion.h"
27 #include "cmemory.h"
28 #include "hash.h"
29
// Reports a data error, tagged with the source location and the ICU error
// name, when `status` holds a failure code. Does nothing otherwise.
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
}
33
// Reports a data error, tagged with the source location, when `expr`
// evaluates to false.
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
36
// Appends the escaped pattern form of `set` to `left`, so that a set can be
// concatenated directly into a log or error message.
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    // toPattern() fills `pattern` and returns a reference to it.
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
42
// Expands to one `case` label of runIndexedTest(): stores the test's name
// and, when `exec` is set, logs a header line and runs the test method.
// Expects `name` and `exec` to be in scope at the expansion site. The
// trailing `break` deliberately has no semicolon; the caller supplies it.
#define CASE(id,test) \
    case id: \
        name = #test; \
        if (exec) { \
            logln(#test "---"); \
            logln(); \
            test(); \
        } \
        break
51
UnicodeSetTest()52 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
53 }
54
openUTF8Converter()55 UConverter *UnicodeSetTest::openUTF8Converter() {
56 if(utf8Cnv==NULL) {
57 UErrorCode errorCode=U_ZERO_ERROR;
58 utf8Cnv=ucnv_open("UTF-8", &errorCode);
59 }
60 return utf8Cnv;
61 }
62
~UnicodeSetTest()63 UnicodeSetTest::~UnicodeSetTest() {
64 ucnv_close(utf8Cnv);
65 }
66
67 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)68 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
69 const char* &name, char* /*par*/) {
70 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
71 switch (index) {
72 CASE(0,TestPatterns);
73 CASE(1,TestAddRemove);
74 CASE(2,TestCategories);
75 CASE(3,TestCloneEqualHash);
76 CASE(4,TestMinimalRep);
77 CASE(5,TestAPI);
78 CASE(6,TestScriptSet);
79 CASE(7,TestPropertySet);
80 CASE(8,TestClone);
81 CASE(9,TestExhaustive);
82 CASE(10,TestToPattern);
83 CASE(11,TestIndexOf);
84 CASE(12,TestStrings);
85 CASE(13,Testj2268);
86 CASE(14,TestCloseOver);
87 CASE(15,TestEscapePattern);
88 CASE(16,TestInvalidCodePoint);
89 CASE(17,TestSymbolTable);
90 CASE(18,TestSurrogate);
91 CASE(19,TestPosixClasses);
92 CASE(20,TestIteration);
93 CASE(21,TestFreezable);
94 CASE(22,TestSpan);
95 CASE(23,TestStringSpan);
96 CASE(24,TestUCAUnsafeBackwards);
97 default: name = ""; break;
98 }
99 }
100
// Sentinel placed inside the expected-string arrays handed to
// expectToPattern(). In the callers in this file it separates strings
// that should be in the set from strings that should not -- presumably
// interpreted that way by expectToPattern(); confirm there.
static const char NOT[] = "%%%%";
102
103 /**
104 * UVector was improperly copying contents
105 * This code will crash this is still true
106 */
Testj2268()107 void UnicodeSetTest::Testj2268() {
108 UnicodeSet t;
109 t.add(UnicodeString("abc"));
110 UnicodeSet test(t);
111 UnicodeString ustrPat;
112 test.toPattern(ustrPat, TRUE);
113 }
114
115 /**
116 * Test toPattern().
117 */
TestToPattern()118 void UnicodeSetTest::TestToPattern() {
119 UErrorCode ec = U_ZERO_ERROR;
120
121 // Test that toPattern() round trips with syntax characters and
122 // whitespace.
123 {
124 static const char* OTHER_TOPATTERN_TESTS[] = {
125 "[[:latin:]&[:greek:]]",
126 "[[:latin:]-[:greek:]]",
127 "[:nonspacing mark:]",
128 NULL
129 };
130
131 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
132 ec = U_ZERO_ERROR;
133 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
134 if (U_FAILURE(ec)) {
135 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
136 continue;
137 }
138 checkPat(OTHER_TOPATTERN_TESTS[j], s);
139 }
140
141 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
142 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
143
144 // check various combinations to make sure they all work.
145 if (i != 0 && !toPatternAux(i, i)){
146 continue;
147 }
148 if (!toPatternAux(0, i)){
149 continue;
150 }
151 if (!toPatternAux(i, 0xFFFF)){
152 continue;
153 }
154 }
155 }
156 }
157
158 // Test pattern behavior of multicharacter strings.
159 {
160 ec = U_ZERO_ERROR;
161 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
162
163 // This loop isn't a loop. It's here to make the compiler happy.
164 // If you're curious, try removing it and changing the 'break'
165 // statements (except for the last) to goto's.
166 for (;;) {
167 if (U_FAILURE(ec)) break;
168 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
169 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
170
171 s->add("ac");
172 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
173 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
174
175 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
176 if (U_FAILURE(ec)) break;
177 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
178 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
179
180 s->add("[]");
181 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
182 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
183
184 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
185 if (U_FAILURE(ec)) break;
186 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
187 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
188
189 // j2189
190 s->clear();
191 s->add(UnicodeString("abc", ""));
192 s->add(UnicodeString("abc", ""));
193 const char* exp6[] = {"abc", NOT, "ab", NULL};
194 expectToPattern(*s, "[{abc}]", exp6);
195
196 break;
197 }
198
199 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
200 delete s;
201 }
202
203 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
204 UnicodeSet s;
205 s.add((UChar)97, (UChar)98); // 'a', 'b'
206 expectToPattern(s, "[ab]", NULL);
207 }
208
toPatternAux(UChar32 start,UChar32 end)209 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
210
211 // use Integer.toString because Utility.hex doesn't handle ints
212 UnicodeString pat = "";
213 // TODO do these in hex
214 //String source = "0x" + Integer.toString(start,16).toUpperCase();
215 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
216 UnicodeString source;
217 source = source + (uint32_t)start;
218 if (start != end)
219 source = source + ".." + (uint32_t)end;
220 UnicodeSet testSet;
221 testSet.add(start, end);
222 return checkPat(source, testSet);
223 }
224
checkPat(const UnicodeString & source,const UnicodeSet & testSet)225 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
226 const UnicodeSet& testSet) {
227 // What we want to make sure of is that a pattern generated
228 // by toPattern(), with or without escaped unprintables, can
229 // be passed back into the UnicodeSet constructor.
230 UnicodeString pat0;
231
232 testSet.toPattern(pat0, TRUE);
233
234 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
235
236 //String pat1 = unescapeLeniently(pat0);
237 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
238
239 UnicodeString pat2;
240 testSet.toPattern(pat2, FALSE);
241 if (!checkPat(source, testSet, pat2)) return FALSE;
242
243 //String pat3 = unescapeLeniently(pat2);
244 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
245
246 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
247 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
248 return TRUE;
249 }
250
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)251 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
252 const UnicodeSet& testSet,
253 const UnicodeString& pat) {
254 UErrorCode ec = U_ZERO_ERROR;
255 UnicodeSet testSet2(pat, ec);
256 if (testSet2 != testSet) {
257 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
258 return FALSE;
259 }
260 return TRUE;
261 }
262
263 void
TestPatterns(void)264 UnicodeSetTest::TestPatterns(void) {
265 UnicodeSet set;
266 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
267 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
268 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
269 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
270 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
271 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
272
273 // Throw in a test of complement
274 set.complement();
275 UnicodeString exp;
276 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
277 expectPairs(set, exp);
278 }
279
280 void
TestCategories(void)281 UnicodeSetTest::TestCategories(void) {
282 UErrorCode status = U_ZERO_ERROR;
283 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
284 UnicodeSet set(pat, status);
285 if (U_FAILURE(status)) {
286 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
287 return;
288 } else {
289 expectContainment(set, pat, "ABC", "abc");
290 }
291
292 UChar32 i;
293 int32_t failures = 0;
294 // Make sure generation of L doesn't pollute cached Lu set
295 // First generate L, then Lu
296 set.applyPattern("[:L:]", status);
297 if (U_FAILURE(status)) { errln("FAIL"); return; }
298 for (i=0; i<0x200; ++i) {
299 UBool l = u_isalpha((UChar)i);
300 if (l != set.contains(i)) {
301 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
302 set.contains(i));
303 if (++failures == 10) break;
304 }
305 }
306
307 set.applyPattern("[:Lu:]", status);
308 if (U_FAILURE(status)) { errln("FAIL"); return; }
309 for (i=0; i<0x200; ++i) {
310 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
311 if (lu != set.contains(i)) {
312 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
313 set.contains(i));
314 if (++failures == 20) break;
315 }
316 }
317 }
318 void
TestCloneEqualHash(void)319 UnicodeSetTest::TestCloneEqualHash(void) {
320 UErrorCode status = U_ZERO_ERROR;
321 // set1 and set2 used to be built with the obsolete constructor taking
322 // UCharCategory values; replaced with pattern constructors
323 // markus 20030502
324 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); // :Ll: Letter, lowercase
325 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); // Letter, lowercase
326 if (U_FAILURE(status)){
327 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
328 return;
329 }
330 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status); //Number, Decimal digit
331 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit
332 if (U_FAILURE(status)){
333 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
334 return;
335 }
336
337 if (*set1 != *set1a) {
338 errln("FAIL: category constructor for Ll broken");
339 }
340 if (*set2 != *set2a) {
341 errln("FAIL: category constructor for Nd broken");
342 }
343 delete set1a;
344 delete set2a;
345
346 logln("Testing copy construction");
347 UnicodeSet *set1copy=new UnicodeSet(*set1);
348 if(*set1 != *set1copy || *set1 == *set2 ||
349 getPairs(*set1) != getPairs(*set1copy) ||
350 set1->hashCode() != set1copy->hashCode()){
351 errln("FAIL : Error in copy construction");
352 return;
353 }
354
355 logln("Testing =operator");
356 UnicodeSet set1equal=*set1;
357 UnicodeSet set2equal=*set2;
358 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
359 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
360 errln("FAIL: Error in =operator");
361 }
362
363 logln("Testing clone()");
364 UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
365 UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
366 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
367 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
368 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
369 errln("FAIL: Error in clone");
370 }
371
372 logln("Testing hashcode");
373 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
374 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
375 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
376 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
377 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
378 errln("FAIL: Error in hashCode()");
379 }
380
381 delete set1;
382 delete set1copy;
383 delete set2;
384 delete set1clone;
385 delete set2clone;
386
387
388 }
389 void
TestAddRemove(void)390 UnicodeSetTest::TestAddRemove(void) {
391 UnicodeSet set; // Construct empty set
392 doAssert(set.isEmpty() == TRUE, "set should be empty");
393 doAssert(set.size() == 0, "size should be 0");
394 set.complement();
395 doAssert(set.size() == 0x110000, "size should be 0x110000");
396 set.clear();
397 set.add(0x0061, 0x007a);
398 expectPairs(set, "az");
399 doAssert(set.isEmpty() == FALSE, "set should not be empty");
400 doAssert(set.size() != 0, "size should not be equal to 0");
401 doAssert(set.size() == 26, "size should be equal to 26");
402 set.remove(0x006d, 0x0070);
403 expectPairs(set, "alqz");
404 doAssert(set.size() == 22, "size should be equal to 22");
405 set.remove(0x0065, 0x0067);
406 expectPairs(set, "adhlqz");
407 doAssert(set.size() == 19, "size should be equal to 19");
408 set.remove(0x0064, 0x0069);
409 expectPairs(set, "acjlqz");
410 doAssert(set.size() == 16, "size should be equal to 16");
411 set.remove(0x0063, 0x0072);
412 expectPairs(set, "absz");
413 doAssert(set.size() == 10, "size should be equal to 10");
414 set.add(0x0066, 0x0071);
415 expectPairs(set, "abfqsz");
416 doAssert(set.size() == 22, "size should be equal to 22");
417 set.remove(0x0061, 0x0067);
418 expectPairs(set, "hqsz");
419 set.remove(0x0061, 0x007a);
420 expectPairs(set, "");
421 doAssert(set.isEmpty() == TRUE, "set should be empty");
422 doAssert(set.size() == 0, "size should be 0");
423 set.add(0x0061);
424 doAssert(set.isEmpty() == FALSE, "set should not be empty");
425 doAssert(set.size() == 1, "size should not be equal to 1");
426 set.add(0x0062);
427 set.add(0x0063);
428 expectPairs(set, "ac");
429 doAssert(set.size() == 3, "size should not be equal to 3");
430 set.add(0x0070);
431 set.add(0x0071);
432 expectPairs(set, "acpq");
433 doAssert(set.size() == 5, "size should not be equal to 5");
434 set.clear();
435 expectPairs(set, "");
436 doAssert(set.isEmpty() == TRUE, "set should be empty");
437 doAssert(set.size() == 0, "size should be 0");
438
439 // Try removing an entire set from another set
440 expectPattern(set, "[c-x]", "cx");
441 UnicodeSet set2;
442 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
443 set.removeAll(set2);
444 expectPairs(set, "deluxx");
445
446 // Try adding an entire set to another set
447 expectPattern(set, "[jackiemclean]", "aacceein");
448 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
449 set.addAll(set2);
450 expectPairs(set, "aacehort");
451 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
452
453 // Try retaining an set of elements contained in another set (intersection)
454 UnicodeSet set3;
455 expectPattern(set3, "[a-c]", "ac");
456 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
457 set3.remove(0x0062);
458 expectPairs(set3, "aacc");
459 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
460 set.retainAll(set3);
461 expectPairs(set, "aacc");
462 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
463 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
464 set.clear();
465 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
466
467 // Test commutativity
468 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
469 expectPattern(set2, "[jackiemclean]", "aacceein");
470 set.addAll(set2);
471 expectPairs(set, "aacehort");
472 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
473
474
475
476
477 }
478
479 /**
480 * Make sure minimal representation is maintained.
481 */
TestMinimalRep()482 void UnicodeSetTest::TestMinimalRep() {
483 UErrorCode status = U_ZERO_ERROR;
484 // This is pretty thoroughly tested by checkCanonicalRep()
485 // run against the exhaustive operation results. Use the code
486 // here for debugging specific spot problems.
487
488 // 1 overlap against 2
489 UnicodeSet set("[h-km-q]", status);
490 if (U_FAILURE(status)) { errln("FAIL"); return; }
491 UnicodeSet set2("[i-o]", status);
492 if (U_FAILURE(status)) { errln("FAIL"); return; }
493 set.addAll(set2);
494 expectPairs(set, "hq");
495 // right
496 set.applyPattern("[a-m]", status);
497 if (U_FAILURE(status)) { errln("FAIL"); return; }
498 set2.applyPattern("[e-o]", status);
499 if (U_FAILURE(status)) { errln("FAIL"); return; }
500 set.addAll(set2);
501 expectPairs(set, "ao");
502 // left
503 set.applyPattern("[e-o]", status);
504 if (U_FAILURE(status)) { errln("FAIL"); return; }
505 set2.applyPattern("[a-m]", status);
506 if (U_FAILURE(status)) { errln("FAIL"); return; }
507 set.addAll(set2);
508 expectPairs(set, "ao");
509 // 1 overlap against 3
510 set.applyPattern("[a-eg-mo-w]", status);
511 if (U_FAILURE(status)) { errln("FAIL"); return; }
512 set2.applyPattern("[d-q]", status);
513 if (U_FAILURE(status)) { errln("FAIL"); return; }
514 set.addAll(set2);
515 expectPairs(set, "aw");
516 }
517
TestAPI()518 void UnicodeSetTest::TestAPI() {
519 UErrorCode status = U_ZERO_ERROR;
520 // default ct
521 UnicodeSet set;
522 if (!set.isEmpty() || set.getRangeCount() != 0) {
523 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
524 set);
525 }
526
527 // clear(), isEmpty()
528 set.add(0x0061);
529 if (set.isEmpty()) {
530 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
531 set);
532 }
533 set.clear();
534 if (!set.isEmpty()) {
535 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
536 set);
537 }
538
539 // size()
540 set.clear();
541 if (set.size() != 0) {
542 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
543 ": " + set);
544 }
545 set.add(0x0061);
546 if (set.size() != 1) {
547 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
548 ": " + set);
549 }
550 set.add(0x0031, 0x0039);
551 if (set.size() != 10) {
552 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
553 ": " + set);
554 }
555
556 // contains(first, last)
557 set.clear();
558 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
559 if (U_FAILURE(status)) { errln("FAIL"); return; }
560 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
561 UChar32 a = set.getRangeStart(i);
562 UChar32 b = set.getRangeEnd(i);
563 if (!set.contains(a, b)) {
564 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
565 " but doesn't: " + set);
566 }
567 if (set.contains((UChar32)(a-1), b)) {
568 errln((UnicodeString)"FAIL, shouldn't contain " +
569 (unsigned short)(a-1) + '-' + (unsigned short)b +
570 " but does: " + set);
571 }
572 if (set.contains(a, (UChar32)(b+1))) {
573 errln((UnicodeString)"FAIL, shouldn't contain " +
574 (unsigned short)a + '-' + (unsigned short)(b+1) +
575 " but does: " + set);
576 }
577 }
578
579 // Ported InversionList test.
580 UnicodeSet a((UChar32)3,(UChar32)10);
581 UnicodeSet b((UChar32)7,(UChar32)15);
582 UnicodeSet c;
583
584 logln((UnicodeString)"a [3-10]: " + a);
585 logln((UnicodeString)"b [7-15]: " + b);
586 c = a;
587 c.addAll(b);
588 UnicodeSet exp((UChar32)3,(UChar32)15);
589 if (c == exp) {
590 logln((UnicodeString)"c.set(a).add(b): " + c);
591 } else {
592 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
593 }
594 c.complement();
595 exp.set((UChar32)0, (UChar32)2);
596 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
597 if (c == exp) {
598 logln((UnicodeString)"c.complement(): " + c);
599 } else {
600 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
601 }
602 c.complement();
603 exp.set((UChar32)3, (UChar32)15);
604 if (c == exp) {
605 logln((UnicodeString)"c.complement(): " + c);
606 } else {
607 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
608 }
609 c = a;
610 c.complementAll(b);
611 exp.set((UChar32)3,(UChar32)6);
612 exp.add((UChar32)11,(UChar32) 15);
613 if (c == exp) {
614 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
615 } else {
616 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
617 }
618
619 exp = c;
620 bitsToSet(setToBits(c), c);
621 if (c == exp) {
622 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
623 } else {
624 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
625 }
626
627 // Additional tests for coverage JB#2118
628 //UnicodeSet::complement(class UnicodeString const &)
629 //UnicodeSet::complementAll(class UnicodeString const &)
630 //UnicodeSet::containsNone(class UnicodeSet const &)
631 //UnicodeSet::containsNone(long,long)
632 //UnicodeSet::containsSome(class UnicodeSet const &)
633 //UnicodeSet::containsSome(long,long)
634 //UnicodeSet::removeAll(class UnicodeString const &)
635 //UnicodeSet::retain(long)
636 //UnicodeSet::retainAll(class UnicodeString const &)
637 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
638 //UnicodeSetIterator::getString(void)
639 set.clear();
640 set.complement("ab");
641 exp.applyPattern("[{ab}]", status);
642 if (U_FAILURE(status)) { errln("FAIL"); return; }
643 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
644
645 UnicodeSetIterator iset(set);
646 if (!iset.next() || !iset.isString()) {
647 errln("FAIL: UnicodeSetIterator::next/isString");
648 } else if (iset.getString() != "ab") {
649 errln("FAIL: UnicodeSetIterator::getString");
650 }
651
652 set.add((UChar32)0x61, (UChar32)0x7A);
653 set.complementAll("alan");
654 exp.applyPattern("[{ab}b-kmo-z]", status);
655 if (U_FAILURE(status)) { errln("FAIL"); return; }
656 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
657
658 exp.applyPattern("[a-z]", status);
659 if (U_FAILURE(status)) { errln("FAIL"); return; }
660 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
661 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
662 exp.applyPattern("[aln]", status);
663 if (U_FAILURE(status)) { errln("FAIL"); return; }
664 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
665 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
666
667 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
668 errln("FAIL: containsNone(UChar32, UChar32)");
669 }
670 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
671 errln("FAIL: containsSome(UChar32, UChar32)");
672 }
673 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
674 errln("FAIL: containsNone(UChar32, UChar32)");
675 }
676 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
677 errln("FAIL: containsSome(UChar32, UChar32)");
678 }
679
680 set.removeAll("liu");
681 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
682 if (U_FAILURE(status)) { errln("FAIL"); return; }
683 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
684
685 set.retainAll("star");
686 exp.applyPattern("[rst]", status);
687 if (U_FAILURE(status)) { errln("FAIL"); return; }
688 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
689
690 set.retain((UChar32)0x73);
691 exp.applyPattern("[s]", status);
692 if (U_FAILURE(status)) { errln("FAIL"); return; }
693 if (set != exp) { errln("FAIL: retain('s')"); return; }
694
695 uint16_t buf[32];
696 int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
697 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
698 if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
699 errln("FAIL: serialize");
700 return;
701 }
702
703 // Conversions to and from USet
704 UnicodeSet *uniset = &set;
705 USet *uset = uniset->toUSet();
706 TEST_ASSERT((void *)uset == (void *)uniset);
707 UnicodeSet *setx = UnicodeSet::fromUSet(uset);
708 TEST_ASSERT((void *)setx == (void *)uset);
709 const UnicodeSet *constSet = uniset;
710 const USet *constUSet = constSet->toUSet();
711 TEST_ASSERT((void *)constUSet == (void *)constSet);
712 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
713 TEST_ASSERT((void *)constSetx == (void *)constUSet);
714
715 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
716 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
717 UnicodeSet ac(0x61, 0x63);
718 ac.remove(0x62).freeze();
719 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
720 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
721 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
722 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
723 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
724 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
725 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
726 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
727 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
728 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
729 ) {
730 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
731 }
732 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
733 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
734 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
735 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
736 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
737 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
738 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
739 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
740 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
741 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
742 ) {
743 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
744 }
745 }
746
TestIteration()747 void UnicodeSetTest::TestIteration() {
748 UErrorCode ec = U_ZERO_ERROR;
749 int i = 0;
750 int outerLoop;
751
752 // 6 code points, 3 ranges, 2 strings, 8 total elements
753 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
754 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
755 TEST_ASSERT_SUCCESS(ec);
756 UnicodeSetIterator it(set);
757
758 for (outerLoop=0; outerLoop<3; outerLoop++) {
759 // Run the test multiple times, to check that iterator.reset() is working.
760 for (i=0; i<10; i++) {
761 UBool nextv = it.next();
762 UBool isString = it.isString();
763 int32_t codePoint = it.getCodepoint();
764 //int32_t codePointEnd = it.getCodepointEnd();
765 UnicodeString s = it.getString();
766 switch (i) {
767 case 0:
768 TEST_ASSERT(nextv == TRUE);
769 TEST_ASSERT(isString == FALSE);
770 TEST_ASSERT(codePoint==0x61);
771 TEST_ASSERT(s == "a");
772 break;
773 case 1:
774 TEST_ASSERT(nextv == TRUE);
775 TEST_ASSERT(isString == FALSE);
776 TEST_ASSERT(codePoint==0x62);
777 TEST_ASSERT(s == "b");
778 break;
779 case 2:
780 TEST_ASSERT(nextv == TRUE);
781 TEST_ASSERT(isString == FALSE);
782 TEST_ASSERT(codePoint==0x63);
783 TEST_ASSERT(s == "c");
784 break;
785 case 3:
786 TEST_ASSERT(nextv == TRUE);
787 TEST_ASSERT(isString == FALSE);
788 TEST_ASSERT(codePoint==0x79);
789 TEST_ASSERT(s == "y");
790 break;
791 case 4:
792 TEST_ASSERT(nextv == TRUE);
793 TEST_ASSERT(isString == FALSE);
794 TEST_ASSERT(codePoint==0x7a);
795 TEST_ASSERT(s == "z");
796 break;
797 case 5:
798 TEST_ASSERT(nextv == TRUE);
799 TEST_ASSERT(isString == FALSE);
800 TEST_ASSERT(codePoint==0x1abcd);
801 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
802 break;
803 case 6:
804 TEST_ASSERT(nextv == TRUE);
805 TEST_ASSERT(isString == TRUE);
806 TEST_ASSERT(s == "str1");
807 break;
808 case 7:
809 TEST_ASSERT(nextv == TRUE);
810 TEST_ASSERT(isString == TRUE);
811 TEST_ASSERT(s == "str2");
812 break;
813 case 8:
814 TEST_ASSERT(nextv == FALSE);
815 break;
816 case 9:
817 TEST_ASSERT(nextv == FALSE);
818 break;
819 }
820 }
821 it.reset(); // prepare to run the iteration again.
822 }
823 }
824
825
826
827
TestStrings()828 void UnicodeSetTest::TestStrings() {
829 UErrorCode ec = U_ZERO_ERROR;
830
831 UnicodeSet* testList[] = {
832 UnicodeSet::createFromAll("abc"),
833 new UnicodeSet("[a-c]", ec),
834
835 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
836 new UnicodeSet("[{ll}{ch}a-z]", ec),
837
838 UnicodeSet::createFrom("ab}c"),
839 new UnicodeSet("[{ab\\}c}]", ec),
840
841 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
842 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
843
844 NULL
845 };
846
847 if (U_FAILURE(ec)) {
848 errln("FAIL: couldn't construct test sets");
849 }
850
851 for (int32_t i = 0; testList[i] != NULL; i+=2) {
852 if (U_SUCCESS(ec)) {
853 UnicodeString pat0, pat1;
854 testList[i]->toPattern(pat0, TRUE);
855 testList[i+1]->toPattern(pat1, TRUE);
856 if (*testList[i] == *testList[i+1]) {
857 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
858 } else {
859 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
860 }
861 }
862 delete testList[i];
863 delete testList[i+1];
864 }
865 }
866
867 /**
868 * Test the [:Latin:] syntax.
869 */
TestScriptSet()870 void UnicodeSetTest::TestScriptSet() {
871 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
872
873 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
874
875 /* Jitterbug 1423 */
876 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
877
878 }
879
/**
 * Test property-based set patterns such as [:...:], \p{...} and \N{...}.
 */
TestPropertySet()883 void UnicodeSetTest::TestPropertySet() {
884 static const char* const DATA[] = {
885 // Pattern, Chars IN, Chars NOT in
886
887 "[:Latin:]",
888 "aA",
889 "\\u0391\\u03B1",
890
891 "[\\p{Greek}]",
892 "\\u0391\\u03B1",
893 "aA",
894
895 "\\P{ GENERAL Category = upper case letter }",
896 "abc",
897 "ABC",
898
899 #if !UCONFIG_NO_NORMALIZATION
900 // Combining class: @since ICU 2.2
901 // Check both symbolic and numeric
902 "\\p{ccc=Nukta}",
903 "\\u0ABC",
904 "abc",
905
906 "\\p{Canonical Combining Class = 11}",
907 "\\u05B1",
908 "\\u05B2",
909
910 "[:c c c = iota subscript :]",
911 "\\u0345",
912 "xyz",
913 #endif
914
915 // Bidi class: @since ICU 2.2
916 "\\p{bidiclass=lefttoright}",
917 "abc",
918 "\\u0671\\u0672",
919
920 // Binary properties: @since ICU 2.2
921 "\\p{ideographic}",
922 "\\u4E0A",
923 "x",
924
925 "[:math=false:]",
926 "q)*(",
927 // weiv: )(and * were removed from math in Unicode 4.0.1
928 //"(*+)",
929 "+<>^",
930
931 // JB#1767 \N{}, \p{ASCII}
932 "[:Ascii:]",
933 "abc\\u0000\\u007F",
934 "\\u0080\\u4E00",
935
936 "[\\N{ latin small letter a }[:name= latin small letter z:]]",
937 "az",
938 "qrs",
939
940 // JB#2015
941 "[:any:]",
942 "a\\U0010FFFF",
943 "",
944
945 "[:nv=0.5:]",
946 "\\u00BD\\u0F2A",
947 "\\u00BC",
948
949 // JB#2653: Age
950 "[:Age=1.1:]",
951 "\\u03D6", // 1.1
952 "\\u03D8\\u03D9", // 3.2
953
954 "[:Age=3.1:]",
955 "\\u1800\\u3400\\U0002f800",
956 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
957
958 // JB#2350: Case_Sensitive
959 "[:Case Sensitive:]",
960 "A\\u1FFC\\U00010410",
961 ";\\u00B4\\U00010500",
962
963 // JB#2832: C99-compatibility props
964 "[:blank:]",
965 " \\u0009",
966 "1-9A-Z",
967
968 "[:graph:]",
969 "19AZ",
970 " \\u0003\\u0007\\u0009\\u000A\\u000D",
971
972 "[:punct:]",
973 "!@#%&*()[]{}-_\\/;:,.?'\"",
974 "09azAZ",
975
976 "[:xdigit:]",
977 "09afAF",
978 "gG!",
979
980 // Regex compatibility test
981 "[-b]", // leading '-' is literal
982 "-b",
983 "ac",
984
985 "[^-b]", // leading '-' is literal
986 "ac",
987 "-b",
988
989 "[b-]", // trailing '-' is literal
990 "-b",
991 "ac",
992
993 "[^b-]", // trailing '-' is literal
994 "ac",
995 "-b",
996
997 "[a-b-]", // trailing '-' is literal
998 "ab-",
999 "c=",
1000
1001 "[[a-q]&[p-z]-]", // trailing '-' is literal
1002 "pq-",
1003 "or=",
1004
1005 "[\\s|\\)|:|$|\\>]", // from regex tests
1006 "s|):$>",
1007 "abc",
1008
1009 "[\\uDC00cd]", // JB#2906: isolated trail at start
1010 "cd\\uDC00",
1011 "ab\\uD800\\U00010000",
1012
1013 "[ab\\uD800]", // JB#2906: isolated trail at start
1014 "ab\\uD800",
1015 "cd\\uDC00\\U00010000",
1016
1017 "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1018 "abcd\\uD800",
1019 "ef\\uDC00\\U00010000",
1020
1021 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1022 "abcd\\uDC00",
1023 "ef\\uD800\\U00010000",
1024
1025 #if !UCONFIG_NO_NORMALIZATION
1026 "[:^lccc=0:]", // Lead canonical class
1027 "\\u0300\\u0301",
1028 "abcd\\u00c0\\u00c5",
1029
1030 "[:^tccc=0:]", // Trail canonical class
1031 "\\u0300\\u0301\\u00c0\\u00c5",
1032 "abcd",
1033
1034 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1035 "\\u0300\\u0301\\u00c0\\u00c5",
1036 "abcd",
1037
1038 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1039 "",
1040 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1041
1042 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1043 "\\u0F73\\u0F75\\u0F81",
1044 "abcd\\u0300\\u0301\\u00c0\\u00c5",
1045 #endif /* !UCONFIG_NO_NORMALIZATION */
1046
1047 "[:Assigned:]",
1048 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1049 "\\u0888\\uFDD3\\uFFFE\\U00050005",
1050
1051 // Script_Extensions, new in Unicode 6.0
1052 "[:scx=Arab:]",
1053 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1054 "\\u061D\\uFDEF\\uFDFE",
1055
1056 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1057 // so scx-sc is missing U+FDF2.
1058 "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1059 "\\u0640\\u064B\\u0650\\u0655",
1060 "\\uFDF2"
1061 };
1062
1063 static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);
1064
1065 for (int32_t i=0; i<DATA_LEN; i+=3) {
1066 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1067 CharsToUnicodeString(DATA[i+2]));
1068 }
1069 }
1070
1071 /**
1072 * Test that Posix style character classes [:digit:], etc.
1073 * have the Unicode definitions from TR 18.
1074 */
TestPosixClasses()1075 void UnicodeSetTest::TestPosixClasses() {
1076 {
1077 UErrorCode status = U_ZERO_ERROR;
1078 UnicodeSet s1("[:alpha:]", status);
1079 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1080 TEST_ASSERT_SUCCESS(status);
1081 TEST_ASSERT(s1==s2);
1082 }
1083 {
1084 UErrorCode status = U_ZERO_ERROR;
1085 UnicodeSet s1("[:lower:]", status);
1086 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1087 TEST_ASSERT_SUCCESS(status);
1088 TEST_ASSERT(s1==s2);
1089 }
1090 {
1091 UErrorCode status = U_ZERO_ERROR;
1092 UnicodeSet s1("[:upper:]", status);
1093 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1094 TEST_ASSERT_SUCCESS(status);
1095 TEST_ASSERT(s1==s2);
1096 }
1097 {
1098 UErrorCode status = U_ZERO_ERROR;
1099 UnicodeSet s1("[:punct:]", status);
1100 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1101 TEST_ASSERT_SUCCESS(status);
1102 TEST_ASSERT(s1==s2);
1103 }
1104 {
1105 UErrorCode status = U_ZERO_ERROR;
1106 UnicodeSet s1("[:digit:]", status);
1107 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1108 TEST_ASSERT_SUCCESS(status);
1109 TEST_ASSERT(s1==s2);
1110 }
1111 {
1112 UErrorCode status = U_ZERO_ERROR;
1113 UnicodeSet s1("[:xdigit:]", status);
1114 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1115 TEST_ASSERT_SUCCESS(status);
1116 TEST_ASSERT(s1==s2);
1117 }
1118 {
1119 UErrorCode status = U_ZERO_ERROR;
1120 UnicodeSet s1("[:alnum:]", status);
1121 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1122 TEST_ASSERT_SUCCESS(status);
1123 TEST_ASSERT(s1==s2);
1124 }
1125 {
1126 UErrorCode status = U_ZERO_ERROR;
1127 UnicodeSet s1("[:space:]", status);
1128 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1129 TEST_ASSERT_SUCCESS(status);
1130 TEST_ASSERT(s1==s2);
1131 }
1132 {
1133 UErrorCode status = U_ZERO_ERROR;
1134 UnicodeSet s1("[:blank:]", status);
1135 TEST_ASSERT_SUCCESS(status);
1136 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1137 status);
1138 TEST_ASSERT_SUCCESS(status);
1139 TEST_ASSERT(s1==s2);
1140 }
1141 {
1142 UErrorCode status = U_ZERO_ERROR;
1143 UnicodeSet s1("[:cntrl:]", status);
1144 TEST_ASSERT_SUCCESS(status);
1145 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1146 TEST_ASSERT_SUCCESS(status);
1147 TEST_ASSERT(s1==s2);
1148 }
1149 {
1150 UErrorCode status = U_ZERO_ERROR;
1151 UnicodeSet s1("[:graph:]", status);
1152 TEST_ASSERT_SUCCESS(status);
1153 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1154 TEST_ASSERT_SUCCESS(status);
1155 TEST_ASSERT(s1==s2);
1156 }
1157 {
1158 UErrorCode status = U_ZERO_ERROR;
1159 UnicodeSet s1("[:print:]", status);
1160 TEST_ASSERT_SUCCESS(status);
1161 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1162 TEST_ASSERT_SUCCESS(status);
1163 TEST_ASSERT(s1==s2);
1164 }
1165 }
1166 /**
1167 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1168 */
TestClone()1169 void UnicodeSetTest::TestClone() {
1170 UErrorCode ec = U_ZERO_ERROR;
1171 UnicodeSet s("[abcxyz]", ec);
1172 UnicodeSet t(s);
1173 expectContainment(t, "abc", "def");
1174 }
1175
1176 /**
1177 * Test the indexOf() and charAt() methods.
1178 */
TestIndexOf()1179 void UnicodeSetTest::TestIndexOf() {
1180 UErrorCode ec = U_ZERO_ERROR;
1181 UnicodeSet set("[a-cx-y3578]", ec);
1182 if (U_FAILURE(ec)) {
1183 errln("FAIL: UnicodeSet constructor");
1184 return;
1185 }
1186 for (int32_t i=0; i<set.size(); ++i) {
1187 UChar32 c = set.charAt(i);
1188 if (set.indexOf(c) != i) {
1189 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1190 i, c, set.indexOf(c));
1191 }
1192 }
1193 UChar32 c = set.charAt(set.size());
1194 if (c != -1) {
1195 errln("FAIL: charAt(<out of range>) = %X", c);
1196 }
1197 int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1198 if (j != -1) {
1199 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1200 }
1201 }
1202
1203 /**
1204 * Test closure API.
1205 */
TestCloseOver()1206 void UnicodeSetTest::TestCloseOver() {
1207 UErrorCode ec = U_ZERO_ERROR;
1208
1209 char CASE[] = {(char)USET_CASE_INSENSITIVE};
1210 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1211 const char* DATA[] = {
1212 // selector, input, output
1213 CASE,
1214 "[aq\\u00DF{Bc}{bC}{Fi}]",
1215 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1216
1217 CASE,
1218 "[\\u01F1]", // 'DZ'
1219 "[\\u01F1\\u01F2\\u01F3]",
1220
1221 CASE,
1222 "[\\u1FB4]",
1223 "[\\u1FB4{\\u03AC\\u03B9}]",
1224
1225 CASE,
1226 "[{F\\uFB01}]",
1227 "[\\uFB03{ffi}]",
1228
1229 CASE, // make sure binary search finds limits
1230 "[a\\uFF3A]",
1231 "[aA\\uFF3A\\uFF5A]",
1232
1233 CASE,
1234 "[a-z]","[A-Za-z\\u017F\\u212A]",
1235 CASE,
1236 "[abc]","[A-Ca-c]",
1237 CASE,
1238 "[ABC]","[A-Ca-c]",
1239
1240 CASE, "[i]", "[iI]",
1241
1242 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
1243 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
1244
1245 CASE, "[\\u0131]", "[\\u0131]", // dotless i
1246
1247 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1248
1249 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
1250
1251 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
1252
1253 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
1254
1255 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1256
1257 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
1258 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
1259
1260 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
1261
1262 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
1263
1264 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1265
1266 #if !UCONFIG_NO_FILE_IO
1267 CASE_MAPPINGS,
1268 "[aq\\u00DF{Bc}{bC}{Fi}]",
1269 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1270 #endif
1271
1272 CASE_MAPPINGS,
1273 "[\\u01F1]", // 'DZ'
1274 "[\\u01F1\\u01F2\\u01F3]",
1275
1276 CASE_MAPPINGS,
1277 "[a-z]",
1278 "[A-Za-z]",
1279
1280 NULL
1281 };
1282
1283 UnicodeSet s;
1284 UnicodeSet t;
1285 UnicodeString buf;
1286 for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1287 int32_t selector = DATA[i][0];
1288 UnicodeString pat(DATA[i+1], -1, US_INV);
1289 UnicodeString exp(DATA[i+2], -1, US_INV);
1290 s.applyPattern(pat, ec);
1291 s.closeOver(selector);
1292 t.applyPattern(exp, ec);
1293 if (U_FAILURE(ec)) {
1294 errln("FAIL: applyPattern failed");
1295 continue;
1296 }
1297 if (s == t) {
1298 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1299 } else {
1300 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1301 s.toPattern(buf, TRUE) + ", expected " + exp);
1302 }
1303 }
1304
1305 #if 0
1306 /*
1307 * Unused test code.
1308 * This was used to compare the old implementation (using USET_CASE)
1309 * with the new one (using 0x100 temporarily)
1310 * while transitioning from hardcoded case closure tables in uniset.cpp
1311 * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1312 * and using ucase.c functions for closure.
1313 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1314 *
1315 * Note: The old and new implementation never fully matched because
1316 * the old implementation turned out to not map U+0130 and U+0131 correctly
1317 * (dotted I and dotless i) and because the old implementation's data tables
1318 * were outdated compared to Unicode 4.0.1 at the time of the change to the
1319 * new implementation. (So sigmas and some other characters were not handled
1320 * according to the newer Unicode version.)
1321 */
1322 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1323 UnicodeSetIterator si(sens);
1324 UnicodeString str, buf2;
1325 const UnicodeString *pStr;
1326 UChar32 c;
1327 while(si.next()) {
1328 if(!si.isString()) {
1329 c=si.getCodepoint();
1330 s.clear();
1331 s.add(c);
1332
1333 str.setTo(c);
1334 str.foldCase();
1335 sens2.add(str);
1336
1337 t=s;
1338 s.closeOver(USET_CASE);
1339 t.closeOver(0x100);
1340 if(s!=t) {
1341 errln("FAIL: closeOver(U+%04x) differs: ", c);
1342 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1343 }
1344 }
1345 }
1346 // remove all code points
1347 // should contain all full case folding mapping strings
1348 sens2.remove(0, 0x10ffff);
1349 si.reset(sens2);
1350 while(si.next()) {
1351 if(si.isString()) {
1352 pStr=&si.getString();
1353 s.clear();
1354 s.add(*pStr);
1355 t=s2=s;
1356 s.closeOver(USET_CASE);
1357 t.closeOver(0x100);
1358 if(s!=t) {
1359 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1360 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1361 }
1362 }
1363 }
1364 #endif
1365
1366 // Test the pattern API
1367 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1368 if (U_FAILURE(ec)) {
1369 errln("FAIL: applyPattern failed");
1370 } else {
1371 expectContainment(s, "abcABC", "defDEF");
1372 }
1373 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1374 if (U_FAILURE(ec)) {
1375 errln("FAIL: constructor failed");
1376 } else {
1377 expectContainment(v, "defDEF", "abcABC");
1378 }
1379 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1380 if (U_FAILURE(ec)) {
1381 errln("FAIL: construct w/case mappings failed");
1382 } else {
1383 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1384 }
1385 }
1386
TestEscapePattern()1387 void UnicodeSetTest::TestEscapePattern() {
1388 const char pattern[] =
1389 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1390 const char exp[] =
1391 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1392 // We test this with two passes; in the second pass we
1393 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1394 // this fails -- which is what we expect.
1395 for (int32_t pass=1; pass<=2; ++pass) {
1396 UErrorCode ec = U_ZERO_ERROR;
1397 UnicodeString pat(pattern, -1, US_INV);
1398 if (pass==2) {
1399 pat = pat.unescape();
1400 }
1401 // Pattern is only good for pass 1
1402 UBool isPatternValid = (pass==1);
1403
1404 UnicodeSet set(pat, ec);
1405 if (U_SUCCESS(ec) != isPatternValid){
1406 errln((UnicodeString)"FAIL: applyPattern(" +
1407 escape(pat) + ") => " +
1408 u_errorName(ec));
1409 continue;
1410 }
1411 if (U_FAILURE(ec)) {
1412 continue;
1413 }
1414 if (set.contains((UChar)0x0644)){
1415 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1416 }
1417
1418 UnicodeString newpat;
1419 set.toPattern(newpat, TRUE);
1420 if (newpat == UnicodeString(exp, -1, US_INV)) {
1421 logln(escape(pat) + " => " + newpat);
1422 } else {
1423 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1424 }
1425
1426 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1427 UnicodeString str("Range ");
1428 str.append((UChar)(0x30 + i))
1429 .append(": ")
1430 .append((UChar32)set.getRangeStart(i))
1431 .append(" - ")
1432 .append((UChar32)set.getRangeEnd(i));
1433 str = str + " (" + set.getRangeStart(i) + " - " +
1434 set.getRangeEnd(i) + ")";
1435 if (set.getRangeStart(i) < 0) {
1436 errln((UnicodeString)"FAIL: " + escape(str));
1437 } else {
1438 logln(escape(str));
1439 }
1440 }
1441 }
1442 }
1443
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1444 void UnicodeSetTest::expectRange(const UnicodeString& label,
1445 const UnicodeSet& set,
1446 UChar32 start, UChar32 end) {
1447 UnicodeSet exp(start, end);
1448 UnicodeString pat;
1449 if (set == exp) {
1450 logln(label + " => " + set.toPattern(pat, TRUE));
1451 } else {
1452 UnicodeString xpat;
1453 errln((UnicodeString)"FAIL: " + label + " => " +
1454 set.toPattern(pat, TRUE) +
1455 ", expected " + exp.toPattern(xpat, TRUE));
1456 }
1457 }
1458
TestInvalidCodePoint()1459 void UnicodeSetTest::TestInvalidCodePoint() {
1460
1461 const UChar32 DATA[] = {
1462 // Test range Expected range
1463 0, 0x10FFFF, 0, 0x10FFFF,
1464 (UChar32)-1, 8, 0, 8,
1465 8, 0x110000, 8, 0x10FFFF
1466 };
1467 const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1468
1469 UnicodeString pat;
1470 int32_t i;
1471
1472 for (i=0; i<DATA_LENGTH; i+=4) {
1473 UChar32 start = DATA[i];
1474 UChar32 end = DATA[i+1];
1475 UChar32 xstart = DATA[i+2];
1476 UChar32 xend = DATA[i+3];
1477
1478 // Try various API using the test code points
1479
1480 UnicodeSet set(start, end);
1481 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1482 set, xstart, xend);
1483
1484 set.clear();
1485 set.set(start, end);
1486 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1487 set, xstart, xend);
1488
1489 UBool b = set.contains(start);
1490 b = set.contains(start, end);
1491 b = set.containsNone(start, end);
1492 b = set.containsSome(start, end);
1493 (void)b; // Suppress set but not used warning.
1494
1495 /*int32_t index = set.indexOf(start);*/
1496
1497 set.clear();
1498 set.add(start);
1499 set.add(start, end);
1500 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1501 set, xstart, xend);
1502
1503 set.set(0, 0x10FFFF);
1504 set.retain(start, end);
1505 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1506 set, xstart, xend);
1507 set.retain(start);
1508
1509 set.set(0, 0x10FFFF);
1510 set.remove(start);
1511 set.remove(start, end);
1512 set.complement();
1513 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1514 set, xstart, xend);
1515
1516 set.set(0, 0x10FFFF);
1517 set.complement(start, end);
1518 set.complement();
1519 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1520 set, xstart, xend);
1521 set.complement(start);
1522 }
1523
1524 const UChar32 DATA2[] = {
1525 0,
1526 0x10FFFF,
1527 (UChar32)-1,
1528 0x110000
1529 };
1530 const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1531
1532 for (i=0; i<DATA2_LENGTH; ++i) {
1533 UChar32 c = DATA2[i], end = 0x10FFFF;
1534 UBool valid = (c >= 0 && c <= 0x10FFFF);
1535
1536 UnicodeSet set(0, 0x10FFFF);
1537
1538 // For single-codepoint contains, invalid codepoints are NOT contained
1539 UBool b = set.contains(c);
1540 if (b == valid) {
1541 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1542 ") = " + b);
1543 } else {
1544 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1545 ") = " + b);
1546 }
1547
1548 // For codepoint range contains, containsNone, and containsSome,
1549 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1550 b = set.contains(c, end);
1551 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1552 "," + end + ") = " + b);
1553
1554 b = set.containsNone(c, end);
1555 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1556 "," + end + ") = " + b);
1557
1558 b = set.containsSome(c, end);
1559 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1560 "," + end + ") = " + b);
1561
1562 int32_t index = set.indexOf(c);
1563 if ((index >= 0) == valid) {
1564 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1565 ") = " + index);
1566 } else {
1567 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1568 ") = " + index);
1569 }
1570 }
1571 }
1572
1573 // Used by TestSymbolTable
1574 class TokenSymbolTable : public SymbolTable {
1575 public:
1576 Hashtable contents;
1577
TokenSymbolTable(UErrorCode & ec)1578 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1579 contents.setValueDeleter(uprv_deleteUObject);
1580 }
1581
~TokenSymbolTable()1582 ~TokenSymbolTable() {}
1583
1584 /**
1585 * (Non-SymbolTable API) Add the given variable and value to
1586 * the table. Variable should NOT contain leading '$'.
1587 */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1588 void add(const UnicodeString& var, const UnicodeString& value,
1589 UErrorCode& ec) {
1590 if (U_SUCCESS(ec)) {
1591 contents.put(var, new UnicodeString(value), ec);
1592 }
1593 }
1594
1595 /**
1596 * SymbolTable API
1597 */
lookup(const UnicodeString & s) const1598 virtual const UnicodeString* lookup(const UnicodeString& s) const {
1599 return (const UnicodeString*) contents.get(s);
1600 }
1601
1602 /**
1603 * SymbolTable API
1604 */
lookupMatcher(UChar32) const1605 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1606 return NULL;
1607 }
1608
1609 /**
1610 * SymbolTable API
1611 */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1612 virtual UnicodeString parseReference(const UnicodeString& text,
1613 ParsePosition& pos, int32_t limit) const {
1614 int32_t start = pos.getIndex();
1615 int32_t i = start;
1616 UnicodeString result;
1617 while (i < limit) {
1618 UChar c = text.charAt(i);
1619 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1620 break;
1621 }
1622 ++i;
1623 }
1624 if (i == start) { // No valid name chars
1625 return result; // Indicate failure with empty string
1626 }
1627 pos.setIndex(i);
1628 text.extractBetween(start, i, result);
1629 return result;
1630 }
1631 };
1632
TestSymbolTable()1633 void UnicodeSetTest::TestSymbolTable() {
1634 // Multiple test cases can be set up here. Each test case
1635 // is terminated by null:
1636 // var, value, var, value,..., input pat., exp. output pat., null
1637 const char* DATA[] = {
1638 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1639 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1640 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1641 NULL
1642 };
1643
1644 for (int32_t i=0; DATA[i]!=NULL; ++i) {
1645 UErrorCode ec = U_ZERO_ERROR;
1646 TokenSymbolTable sym(ec);
1647 if (U_FAILURE(ec)) {
1648 errln("FAIL: couldn't construct TokenSymbolTable");
1649 continue;
1650 }
1651
1652 // Set up variables
1653 while (DATA[i+2] != NULL) {
1654 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1655 if (U_FAILURE(ec)) {
1656 errln("FAIL: couldn't add to TokenSymbolTable");
1657 continue;
1658 }
1659 i += 2;
1660 }
1661
1662 // Input pattern and expected output pattern
1663 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1664 i += 2;
1665
1666 ParsePosition pos(0);
1667 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1668 if (U_FAILURE(ec)) {
1669 errln("FAIL: couldn't construct UnicodeSet");
1670 continue;
1671 }
1672
1673 // results
1674 if (pos.getIndex() != inpat.length()) {
1675 errln((UnicodeString)"Failed to read to end of string \""
1676 + inpat + "\": read to "
1677 + pos.getIndex() + ", length is "
1678 + inpat.length());
1679 }
1680
1681 UnicodeSet us2(exppat, ec);
1682 if (U_FAILURE(ec)) {
1683 errln("FAIL: couldn't construct expected UnicodeSet");
1684 continue;
1685 }
1686
1687 UnicodeString a, b;
1688 if (us != us2) {
1689 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1690 ", expected " + us2.toPattern(b, TRUE));
1691 } else {
1692 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1693 }
1694 }
1695 }
1696
TestSurrogate()1697 void UnicodeSetTest::TestSurrogate() {
1698 const char* DATA[] = {
1699 // These should all behave identically
1700 "[abc\\uD800\\uDC00]",
1701 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1702 "[abc\\U00010000]",
1703 0
1704 };
1705 for (int i=0; DATA[i] != 0; ++i) {
1706 UErrorCode ec = U_ZERO_ERROR;
1707 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1708 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1709 UnicodeSet set(str, ec);
1710 if (U_FAILURE(ec)) {
1711 errln("FAIL: UnicodeSet constructor");
1712 continue;
1713 }
1714 expectContainment(set,
1715 CharsToUnicodeString("abc\\U00010000"),
1716 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1717 if (set.size() != 4) {
1718 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1719 set.size() + ", expected 4");
1720 }
1721
1722 {
1723 UErrorCode subErr = U_ZERO_ERROR;
1724 checkRoundTrip(set);
1725 checkSerializeRoundTrip(set, subErr);
1726 }
1727 }
1728 }
1729
TestExhaustive()1730 void UnicodeSetTest::TestExhaustive() {
1731 // exhaustive tests. Simulate UnicodeSets with integers.
1732 // That gives us very solid tests (except for large memory tests).
1733
1734 int32_t limit = 128;
1735
1736 UnicodeSet x, y, z, aa;
1737
1738 for (int32_t i = 0; i < limit; ++i) {
1739 bitsToSet(i, x);
1740 logln((UnicodeString)"Testing " + i + ", " + x);
1741 _testComplement(i, x, y);
1742
1743 UnicodeSet &toTest = bitsToSet(i, aa);
1744
1745 // AS LONG AS WE ARE HERE, check roundtrip
1746 checkRoundTrip(toTest);
1747 UErrorCode ec = U_ZERO_ERROR;
1748 checkSerializeRoundTrip(toTest, ec);
1749
1750 for (int32_t j = 0; j < limit; ++j) {
1751 _testAdd(i,j, x,y,z);
1752 _testXor(i,j, x,y,z);
1753 _testRetain(i,j, x,y,z);
1754 _testRemove(i,j, x,y,z);
1755 }
1756 }
1757 }
1758
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1759 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1760 bitsToSet(a, x);
1761 z = x;
1762 z.complement();
1763 int32_t c = setToBits(z);
1764 if (c != (~a)) {
1765 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1766 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1767 }
1768 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1769 }
1770
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1771 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1772 bitsToSet(a, x);
1773 bitsToSet(b, y);
1774 z = x;
1775 z.addAll(y);
1776 int32_t c = setToBits(z);
1777 if (c != (a | b)) {
1778 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1779 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1780 }
1781 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1782 }
1783
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1784 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1785 bitsToSet(a, x);
1786 bitsToSet(b, y);
1787 z = x;
1788 z.retainAll(y);
1789 int32_t c = setToBits(z);
1790 if (c != (a & b)) {
1791 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1792 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1793 }
1794 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1795 }
1796
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1797 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1798 bitsToSet(a, x);
1799 bitsToSet(b, y);
1800 z = x;
1801 z.removeAll(y);
1802 int32_t c = setToBits(z);
1803 if (c != (a &~ b)) {
1804 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1805 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1806 }
1807 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1808 }
1809
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1810 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1811 bitsToSet(a, x);
1812 bitsToSet(b, y);
1813 z = x;
1814 z.complementAll(y);
1815 int32_t c = setToBits(z);
1816 if (c != (a ^ b)) {
1817 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1818 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1819 }
1820 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1821 }
1822
1823 /**
1824 * Check that ranges are monotonically increasing and non-
1825 * overlapping.
1826 */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1827 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1828 int32_t n = set.getRangeCount();
1829 if (n < 0) {
1830 errln((UnicodeString)"FAIL result of " + msg +
1831 ": range count should be >= 0 but is " +
1832 n /*+ " for " + set.toPattern())*/);
1833 return;
1834 }
1835 UChar32 last = 0;
1836 for (int32_t i=0; i<n; ++i) {
1837 UChar32 start = set.getRangeStart(i);
1838 UChar32 end = set.getRangeEnd(i);
1839 if (start > end) {
1840 errln((UnicodeString)"FAIL result of " + msg +
1841 ": range " + (i+1) +
1842 " start > end: " + (int)start + ", " + (int)end +
1843 " for " + set);
1844 }
1845 if (i > 0 && start <= last) {
1846 errln((UnicodeString)"FAIL result of " + msg +
1847 ": range " + (i+1) +
1848 " overlaps previous range: " + (int)start + ", " + (int)end +
1849 " for " + set);
1850 }
1851 last = end;
1852 }
1853 }
1854
1855 /**
1856 * Convert a bitmask to a UnicodeSet.
1857 */
bitsToSet(int32_t a,UnicodeSet & result)1858 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1859 result.clear();
1860 for (UChar32 i = 0; i < 32; ++i) {
1861 if ((a & (1<<i)) != 0) {
1862 result.add(i);
1863 }
1864 }
1865 return result;
1866 }
1867
1868 /**
1869 * Convert a UnicodeSet to a bitmask. Only the characters
1870 * U+0000 to U+0020 are represented in the bitmask.
1871 */
setToBits(const UnicodeSet & x)1872 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1873 int32_t result = 0;
1874 for (int32_t i = 0; i < 32; ++i) {
1875 if (x.contains((UChar32)i)) {
1876 result |= (1<<i);
1877 }
1878 }
1879 return result;
1880 }
1881
1882 /**
1883 * Return the representation of an inversion list based UnicodeSet
1884 * as a pairs list. Ranges are listed in ascending Unicode order.
1885 * For example, the set [a-zA-M3] is represented as "33AMaz".
1886 */
getPairs(const UnicodeSet & set)1887 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1888 UnicodeString pairs;
1889 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1890 UChar32 start = set.getRangeStart(i);
1891 UChar32 end = set.getRangeEnd(i);
1892 if (end > 0xFFFF) {
1893 end = 0xFFFF;
1894 i = set.getRangeCount(); // Should be unnecessary
1895 }
1896 pairs.append((UChar)start).append((UChar)end);
1897 }
1898 return pairs;
1899 }
1900
1901 /**
1902 * Basic consistency check for a few items.
1903 * That the iterator works, and that we can create a pattern and
1904 * get the same thing back
1905 */
checkRoundTrip(const UnicodeSet & s)1906 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1907 {
1908 UnicodeSet t(s);
1909 checkEqual(s, t, "copy ct");
1910 }
1911
1912 {
1913 UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten
1914 t = s;
1915 checkEqual(s, t, "operator=");
1916 }
1917
1918 {
1919 UnicodeSet t;
1920 copyWithIterator(t, s, FALSE);
1921 checkEqual(s, t, "iterator roundtrip");
1922 }
1923
1924 {
1925 UnicodeSet t;
1926 copyWithIterator(t, s, TRUE); // try range
1927 checkEqual(s, t, "iterator roundtrip");
1928 }
1929
1930 {
1931 UnicodeSet t;
1932 UnicodeString pat;
1933 UErrorCode ec = U_ZERO_ERROR;
1934 s.toPattern(pat, FALSE);
1935 t.applyPattern(pat, ec);
1936 if (U_FAILURE(ec)) {
1937 errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
1938 return;
1939 } else {
1940 checkEqual(s, t, "toPattern(false)");
1941 }
1942 }
1943
1944 {
1945 UnicodeSet t;
1946 UnicodeString pat;
1947 UErrorCode ec = U_ZERO_ERROR;
1948 s.toPattern(pat, TRUE);
1949 t.applyPattern(pat, ec);
1950 if (U_FAILURE(ec)) {
1951 errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
1952 return;
1953 } else {
1954 checkEqual(s, t, "toPattern(true)");
1955 }
1956 }
1957 }
1958
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)1959 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
1960 if(U_FAILURE(status)) return;
1961 int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1962 if(status == U_BUFFER_OVERFLOW_ERROR) {
1963 status = U_ZERO_ERROR;
1964 serializeBuffer.resize(len);
1965 len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
1966 // let 2nd error stand
1967 }
1968 if(U_FAILURE(status)) {
1969 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
1970 return;
1971 }
1972 UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
1973 if(U_FAILURE(status)) {
1974 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
1975 return;
1976 }
1977
1978 checkEqual(t, deserialized, "Set was unequal when deserialized");
1979 }
1980
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)1981 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1982 t.clear();
1983 UnicodeSetIterator it(s);
1984 if (withRange) {
1985 while (it.nextRange()) {
1986 if (it.isString()) {
1987 t.add(it.getString());
1988 } else {
1989 t.add(it.getCodepoint(), it.getCodepointEnd());
1990 }
1991 }
1992 } else {
1993 while (it.next()) {
1994 if (it.isString()) {
1995 t.add(it.getString());
1996 } else {
1997 t.add(it.getCodepoint());
1998 }
1999 }
2000 }
2001 }
2002
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2003 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2004 assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2005 assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2006 UnicodeString source; s.toPattern(source, TRUE);
2007 UnicodeString result; t.toPattern(result, TRUE);
2008 if (s != t) {
2009 errln((UnicodeString)"FAIL: " + message
2010 + "; source = " + source
2011 + "; result = " + result
2012 );
2013 return FALSE;
2014 } else {
2015 logln((UnicodeString)"Ok: " + message
2016 + "; source = " + source
2017 + "; result = " + result
2018 );
2019 }
2020 return TRUE;
2021 }
2022
2023 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2024 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2025 const UnicodeString& charsIn,
2026 const UnicodeString& charsOut) {
2027 UErrorCode ec = U_ZERO_ERROR;
2028 UnicodeSet set(pat, ec);
2029 if (U_FAILURE(ec)) {
2030 dataerrln((UnicodeString)"FAIL: pattern \"" +
2031 pat + "\" => " + u_errorName(ec));
2032 return;
2033 }
2034 expectContainment(set, pat, charsIn, charsOut);
2035 }
2036
2037 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2038 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2039 const UnicodeString& charsIn,
2040 const UnicodeString& charsOut) {
2041 UnicodeString pat;
2042 set.toPattern(pat);
2043 expectContainment(set, pat, charsIn, charsOut);
2044 }
2045
2046 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2047 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2048 const UnicodeString& setName,
2049 const UnicodeString& charsIn,
2050 const UnicodeString& charsOut) {
2051 UnicodeString bad;
2052 UChar32 c;
2053 int32_t i;
2054
2055 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2056 c = charsIn.char32At(i);
2057 if (!set.contains(c)) {
2058 bad.append(c);
2059 }
2060 }
2061 if (bad.length() > 0) {
2062 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2063 ", expected containment of " + prettify(charsIn));
2064 } else {
2065 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2066 }
2067
2068 bad.truncate(0);
2069 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2070 c = charsOut.char32At(i);
2071 if (set.contains(c)) {
2072 bad.append(c);
2073 }
2074 }
2075 if (bad.length() > 0) {
2076 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2077 ", expected non-containment of " + prettify(charsOut));
2078 } else {
2079 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2080 }
2081 }
2082
2083 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2084 UnicodeSetTest::expectPattern(UnicodeSet& set,
2085 const UnicodeString& pattern,
2086 const UnicodeString& expectedPairs){
2087 UErrorCode status = U_ZERO_ERROR;
2088 set.applyPattern(pattern, status);
2089 if (U_FAILURE(status)) {
2090 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2091 "\") failed");
2092 return;
2093 } else {
2094 if (getPairs(set) != expectedPairs ) {
2095 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2096 "\") => pairs \"" +
2097 escape(getPairs(set)) + "\", expected \"" +
2098 escape(expectedPairs) + "\"");
2099 } else {
2100 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2101 "\") => pairs \"" +
2102 escape(getPairs(set)) + "\"");
2103 }
2104 }
2105 // the result of calling set.toPattern(), which is the string representation of
2106 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2107 // will produce another set that is equal to this one.
2108 UnicodeString temppattern;
2109 set.toPattern(temppattern);
2110 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2111 if (U_FAILURE(status)) {
2112 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2113 return;
2114 }
2115 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2116 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2117 escape(getPairs(set)) + "\""));
2118 } else{
2119 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2120 }
2121
2122 delete tempset;
2123
2124 }
2125
2126 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2127 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2128 if (getPairs(set) != expectedPairs) {
2129 errln(UnicodeString("FAIL: Expected pair list \"") +
2130 escape(expectedPairs) + "\", got \"" +
2131 escape(getPairs(set)) + "\"");
2132 }
2133 }
2134
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2135 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2136 const UnicodeString& expPat,
2137 const char** expStrings) {
2138 UnicodeString pat;
2139 set.toPattern(pat, TRUE);
2140 if (pat == expPat) {
2141 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2142 } else {
2143 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2144 return;
2145 }
2146 if (expStrings == NULL) {
2147 return;
2148 }
2149 UBool in = TRUE;
2150 for (int32_t i=0; expStrings[i] != NULL; ++i) {
2151 if (expStrings[i] == NOT) { // sic; pointer comparison
2152 in = FALSE;
2153 continue;
2154 }
2155 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2156 UBool contained = set.contains(s);
2157 if (contained == in) {
2158 logln((UnicodeString)"Ok: " + expPat +
2159 (contained ? " contains {" : " does not contain {") +
2160 escape(expStrings[i]) + "}");
2161 } else {
2162 errln((UnicodeString)"FAIL: " + expPat +
2163 (contained ? " contains {" : " does not contain {") +
2164 escape(expStrings[i]) + "}");
2165 }
2166 }
2167 }
2168
toHexString(int32_t i)2169 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2170
2171 void
doAssert(UBool condition,const char * message)2172 UnicodeSetTest::doAssert(UBool condition, const char *message)
2173 {
2174 if (!condition) {
2175 errln(UnicodeString("ERROR : ") + message);
2176 }
2177 }
2178
2179 UnicodeString
escape(const UnicodeString & s)2180 UnicodeSetTest::escape(const UnicodeString& s) {
2181 UnicodeString buf;
2182 for (int32_t i=0; i<s.length(); )
2183 {
2184 UChar32 c = s.char32At(i);
2185 if (0x0020 <= c && c <= 0x007F) {
2186 buf += c;
2187 } else {
2188 if (c <= 0xFFFF) {
2189 buf += (UChar)0x5c; buf += (UChar)0x75;
2190 } else {
2191 buf += (UChar)0x5c; buf += (UChar)0x55;
2192 buf += toHexString((c & 0xF0000000) >> 28);
2193 buf += toHexString((c & 0x0F000000) >> 24);
2194 buf += toHexString((c & 0x00F00000) >> 20);
2195 buf += toHexString((c & 0x000F0000) >> 16);
2196 }
2197 buf += toHexString((c & 0xF000) >> 12);
2198 buf += toHexString((c & 0x0F00) >> 8);
2199 buf += toHexString((c & 0x00F0) >> 4);
2200 buf += toHexString(c & 0x000F);
2201 }
2202 i += U16_LENGTH(c);
2203 }
2204 return buf;
2205 }
2206
TestFreezable()2207 void UnicodeSetTest::TestFreezable() {
2208 UErrorCode errorCode=U_ZERO_ERROR;
2209 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2210 UnicodeSet idSet(idPattern, errorCode);
2211 if(U_FAILURE(errorCode)) {
2212 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2213 return;
2214 }
2215
2216 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2217 UnicodeSet wsSet(wsPattern, errorCode);
2218 if(U_FAILURE(errorCode)) {
2219 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2220 return;
2221 }
2222
2223 idSet.add(idPattern);
2224 UnicodeSet frozen(idSet);
2225 frozen.freeze();
2226
2227 if(idSet.isFrozen() || !frozen.isFrozen()) {
2228 errln("FAIL: isFrozen() is wrong");
2229 }
2230 if(frozen!=idSet || !(frozen==idSet)) {
2231 errln("FAIL: a copy-constructed frozen set differs from its original");
2232 }
2233
2234 frozen=wsSet;
2235 if(frozen!=idSet || !(frozen==idSet)) {
2236 errln("FAIL: a frozen set was modified by operator=");
2237 }
2238
2239 UnicodeSet frozen2(frozen);
2240 if(frozen2!=frozen || frozen2!=idSet) {
2241 errln("FAIL: a copied frozen set differs from its frozen original");
2242 }
2243 if(!frozen2.isFrozen()) {
2244 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2245 }
2246 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2247 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2248 errln("FAIL: UnicodeSet(5, 55) failed");
2249 }
2250 frozen3=frozen;
2251 if(!frozen3.isFrozen()) {
2252 errln("FAIL: copying a frozen set results in a thawed one");
2253 }
2254
2255 UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2256 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2257 errln("FAIL: clone() failed");
2258 }
2259 cloned->add(0xd802, 0xd805);
2260 if(cloned->containsSome(0xd802, 0xd805)) {
2261 errln("FAIL: unable to modify clone");
2262 }
2263 delete cloned;
2264
2265 UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2266 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2267 errln("FAIL: cloneAsThawed() failed");
2268 }
2269 thawed->add(0xd802, 0xd805);
2270 if(!thawed->contains(0xd802, 0xd805)) {
2271 errln("FAIL: unable to modify thawed clone");
2272 }
2273 delete thawed;
2274
2275 frozen.set(5, 55);
2276 if(frozen!=idSet || !(frozen==idSet)) {
2277 errln("FAIL: UnicodeSet::set() modified a frozen set");
2278 }
2279
2280 frozen.clear();
2281 if(frozen!=idSet || !(frozen==idSet)) {
2282 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2283 }
2284
2285 frozen.closeOver(USET_CASE_INSENSITIVE);
2286 if(frozen!=idSet || !(frozen==idSet)) {
2287 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2288 }
2289
2290 frozen.compact();
2291 if(frozen!=idSet || !(frozen==idSet)) {
2292 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2293 }
2294
2295 ParsePosition pos;
2296 frozen.
2297 applyPattern(wsPattern, errorCode).
2298 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2299 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2300 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2301 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2302 if(frozen!=idSet || !(frozen==idSet)) {
2303 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2304 }
2305
2306 frozen.
2307 add(0xd800).
2308 add(0xd802, 0xd805).
2309 add(wsPattern).
2310 addAll(idPattern).
2311 addAll(wsSet);
2312 if(frozen!=idSet || !(frozen==idSet)) {
2313 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2314 }
2315
2316 frozen.
2317 retain(0x62).
2318 retain(0x64, 0x69).
2319 retainAll(wsPattern).
2320 retainAll(wsSet);
2321 if(frozen!=idSet || !(frozen==idSet)) {
2322 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2323 }
2324
2325 frozen.
2326 remove(0x62).
2327 remove(0x64, 0x69).
2328 remove(idPattern).
2329 removeAll(idPattern).
2330 removeAll(idSet);
2331 if(frozen!=idSet || !(frozen==idSet)) {
2332 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2333 }
2334
2335 frozen.
2336 complement().
2337 complement(0x62).
2338 complement(0x64, 0x69).
2339 complement(idPattern).
2340 complementAll(idPattern).
2341 complementAll(idSet);
2342 if(frozen!=idSet || !(frozen==idSet)) {
2343 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2344 }
2345 }
2346
2347 // Test span() etc. -------------------------------------------------------- ***
2348
2349 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2350 static int32_t
appendUTF8(const UChar * s,int32_t length,char * t,int32_t capacity)2351 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2352 UErrorCode errorCode=U_ZERO_ERROR;
2353 int32_t length8=0;
2354 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2355 if(U_SUCCESS(errorCode)) {
2356 return length8;
2357 } else {
2358 // The string contains an unpaired surrogate.
2359 // Ignore this string.
2360 return 0;
2361 }
2362 }
2363
2364 class UnicodeSetWithStringsIterator;
2365
2366 // Make the strings in a UnicodeSet easily accessible.
2367 class UnicodeSetWithStrings {
2368 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2369 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2370 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2371 int32_t size=set.size();
2372 if(size>0 && set.charAt(size-1)<0) {
2373 // If a set's last element is not a code point, then it must contain strings.
2374 // Iterate over the set, skip all code point ranges, and cache the strings.
2375 // Convert them to UTF-8 for spanUTF8().
2376 UnicodeSetIterator iter(set);
2377 const UnicodeString *s;
2378 char *s8=utf8;
2379 int32_t length8, utf8Count=0;
2380 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2381 if(iter.isString()) {
2382 // Store the pointer to the set's string element
2383 // which we happen to know is a stable pointer.
2384 strings[stringsLength]=s=&iter.getString();
2385 utf8Count+=
2386 utf8Lengths[stringsLength]=length8=
2387 appendUTF8(s->getBuffer(), s->length(),
2388 s8, (int32_t)(sizeof(utf8)-utf8Count));
2389 if(length8==0) {
2390 hasSurrogates=TRUE; // Contains unpaired surrogates.
2391 }
2392 s8+=length8;
2393 ++stringsLength;
2394 }
2395 }
2396 }
2397 }
2398
getSet() const2399 const UnicodeSet &getSet() const {
2400 return set;
2401 }
2402
hasStrings() const2403 UBool hasStrings() const {
2404 return (UBool)(stringsLength>0);
2405 }
2406
hasStringsWithSurrogates() const2407 UBool hasStringsWithSurrogates() const {
2408 return hasSurrogates;
2409 }
2410
2411 private:
2412 friend class UnicodeSetWithStringsIterator;
2413
2414 const UnicodeSet &set;
2415
2416 const UnicodeString *strings[20];
2417 int32_t stringsLength;
2418 UBool hasSurrogates;
2419
2420 char utf8[1024];
2421 int32_t utf8Lengths[20];
2422 };
2423
2424 class UnicodeSetWithStringsIterator {
2425 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2426 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2427 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2428 }
2429
reset()2430 void reset() {
2431 nextStringIndex=nextUTF8Start=0;
2432 }
2433
nextString()2434 const UnicodeString *nextString() {
2435 if(nextStringIndex<fSet.stringsLength) {
2436 return fSet.strings[nextStringIndex++];
2437 } else {
2438 return NULL;
2439 }
2440 }
2441
2442 // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2443 const char *nextUTF8(int32_t &length) {
2444 if(nextStringIndex<fSet.stringsLength) {
2445 const char *s8=fSet.utf8+nextUTF8Start;
2446 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2447 return s8;
2448 } else {
2449 length=0;
2450 return NULL;
2451 }
2452 }
2453
2454 private:
2455 const UnicodeSetWithStrings &fSet;
2456 int32_t nextStringIndex;
2457 int32_t nextUTF8Start;
2458 };
2459
2460 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2461 // at code point boundaries.
2462 // That is, each edge of a match must not be in the middle of a surrogate pair.
2463 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UnicodeString & t)2464 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2465 s+=start;
2466 limit-=start;
2467 int32_t length=t.length();
2468 return 0==t.compare(s, length) &&
2469 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2470 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2471 }
2472
// Implement span() with contains() for comparison.
//
// Forward span over the UTF-16 text s[0..length), built only from contains()
// and string comparisons. Returns the limit (exclusive end index) of the span
// that starts at index 0; returns 0 for an empty text since no loop runs then.
//   set           - the set together with its cached string elements
//   s, length     - the text and its length in UChars
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: a plain code point loop suffices, and CONTAINED and SIMPLE
        // are treated alike.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point so that the first code point
        // which ends the span is not included in the result.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            // The pinned condition is compared directly with the contains() result.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position where either the code point is in the set
        // or one of the set's strings matches.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                // Check the length first so that the comparison stays inside the text.
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: the furthest limit reached by any of the recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        // This string reaches the end of the text: nothing can span further.
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2571
// Backward counterpart of containsSpanUTF16(): spans backward from the end of
// the UTF-16 text s[0..length), using only contains() and string comparisons.
// Returns the start index of the span that ends at length; 0 for an empty text.
//   set           - the set together with its cached string elements
//   s, length     - the text and its length in UChars
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        // The do-while loops below always read one code point, so handle empty text here.
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: a plain code point loop suffices, and CONTAINED and SIMPLE
        // are treated alike.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            // U16_PREV moves length back over the code point that it reads into c.
            U16_PREV(s, 0, length, c);
            // The pinned condition is compared directly with the contains() result.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position (going backward) where either the code point
        // is in the set or one of the set's strings ends.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original text length for the boundary check in matches16CPB().
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                // Check the length first so that the comparison does not start before s.
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: the smallest start index reached by any of the recursive calls.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        // This string reaches the start of the text: nothing can span further.
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2670
// UTF-8 counterpart of containsSpanUTF16(): forward span over the UTF-8 text
// s[0..length), using only contains() and byte comparisons with the cached
// UTF-8 forms of the set's strings.
// Returns the limit (exclusive end byte index) of the span that starts at index 0;
// returns 0 for an empty text since no loop runs then.
//   set           - the set together with its cached string elements
//   s, length     - the text and its length in bytes
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: a plain code point loop suffices, and CONTAINED and SIMPLE
        // are treated alike.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point so that the first code point
        // which ends the span is not included in the result.
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            // The pinned condition is compared directly with the contains() result.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position where either the code point is in the set
        // or one of the set's strings matches.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // length8==0 marks a string whose UTF-8 conversion failed; skip it.
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: the furthest limit reached by any of the recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // length8==0 marks a string whose UTF-8 conversion failed; skip it.
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        // This string reaches the end of the text: nothing can span further.
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2770
// UTF-8 counterpart of containsSpanBackUTF16(): spans backward from the end of
// the UTF-8 text s[0..length), using only contains() and byte comparisons with
// the cached UTF-8 forms of the set's strings.
// Returns the start byte index of the span that ends at length; 0 for an empty text.
//   set           - the set together with its cached string elements
//   s, length     - the text and its length in bytes
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        // The do-while loops below always read one code point, so handle empty text here.
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: a plain code point loop suffices, and CONTAINED and SIMPLE
        // are treated alike.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            // U8_PREV_OR_FFFD moves length back over the code point that it reads into c.
            U8_PREV_OR_FFFD(s, 0, length, c);
            // The pinned condition is compared directly with the contains() result.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position (going backward) where either the code point
        // is in the set or one of the set's strings ends.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // length8==0 marks a string whose UTF-8 conversion failed; skip it.
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: the smallest start index reached by any of the recursive calls.
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // length8==0 marks a string whose UTF-8 conversion failed; skip it.
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        // This string reaches the start of the text: nothing can span further.
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2871
// spans to be performed and compared
// Bit flags in four groups; each group has a mask constant that is the
// bitwise OR of the group's members.
enum {
    // Text encoding.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =3,     // mask: SPAN_UTF16|SPAN_UTF8

    // Polarity. NOTE(review): presumably the set itself vs. its complement;
    // confirm against the callers.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =0xc,   // mask: SPAN_SET|SPAN_COMPLEMENT

    // Direction: getSpans() filters forward/backward span types on these bits.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =0x30,  // mask: SPAN_FWD|SPAN_BACK

    // Span condition: getSpans() filters the USET_SPAN_CONTAINED and
    // USET_SPAN_SIMPLE span types on these bits.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =0x300, // mask: SPAN_CONTAINED|SPAN_SIMPLE

    SPAN_ALL            =0x33f  // all of the group masks combined
};
2892
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)2893 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2894 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2895 }
2896
slen(const void * s,UBool isUTF16)2897 static inline int32_t slen(const void *s, UBool isUTF16) {
2898 return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2899 }
2900
2901 /*
2902 * Count spans on a string with the method according to type and set the span limits.
2903 * The set may be the complement of the original.
2904 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2905 * according to the expected number of spans.
2906 * Sets typeName to an empty string if there is no such type.
2907 * Returns -1 if the span option is filtered out.
2908 */
/*
 * Collects the span boundaries ("limits") of the text s using one particular
 * API, selected by type (0..7):
 *   0, 1: forward,  via containsSpanUTF16() / containsSpanUTF8()
 *   2, 3: forward,  via UnicodeSet::span() / spanUTF8()
 *   4, 5: backward, via containsSpanBackUTF16() / containsSpanBackUTF8()
 *   6, 7: backward, via UnicodeSet::spanBack() / spanBackUTF8()
 * Even types use USET_SPAN_CONTAINED as the "contained" condition,
 * odd types (named with an "(LM)" suffix) use USET_SPAN_SIMPLE.
 *
 * @param set            The set under test; its plain UnicodeSet is used for types 2, 3, 6, 7.
 * @param isComplement   If true, the first span condition is inverted.
 * @param s              The text: UChar units if isUTF16, otherwise char bytes.
 * @param length         Text length in units, or negative for NUL-terminated text.
 * @param isUTF16        Selects the UTF-16 or the UTF-8 flavor of the API.
 * @param whichSpans     SPAN_* option bits; a type whose direction or condition
 *                       is not enabled here is filtered out.
 * @param type           API selector, see above.
 * @param typeName       Output: name of the selected API, or "" if type is out of range.
 * @param limits         Output: the span limits in ascending text order.
 * @param limitsCapacity Number of entries available in limits[].
 * @param expectCount    Number of spans found going forward; only its parity is used,
 *                       to choose the first condition for a backward iteration.
 * @return 0 (with typeName=="") if type is outside 0..7,
 *         -1 if the type is filtered out by whichSpans,
 *         otherwise the number of spans found, which may exceed limitsCapacity
 *         (in that case limits[] does not hold the complete list).
 */
static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
                        const void *s, int32_t length, UBool isUTF16,
                        uint32_t whichSpans,
                        int type, const char *&typeName,
                        int32_t limits[], int32_t limitsCapacity,
                        int32_t expectCount) {
    const UnicodeSet &realSet(set.getSet());
    int32_t start, count;
    USetSpanCondition spanCondition, firstSpanCondition, contained;
    UBool isForward;

    // An out-of-range type tells the caller that all types have been tried.
    if(type<0 || 7<type) {
        typeName="";
        return 0;
    }

    static const char *const typeNames16[]={
        "contains", "contains(LM)",
        "span", "span(LM)",
        "containsBack", "containsBack(LM)",
        "spanBack", "spanBack(LM)"
    };

    static const char *const typeNames8[]={
        "containsUTF8", "containsUTF8(LM)",
        "spanUTF8", "spanUTF8(LM)",
        "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
        "spanBackUTF8", "spanBackUTF8(LM)"
    };

    typeName= isUTF16 ? typeNames16[type] : typeNames8[type];

    // filter span options
    if(type<=3) {
        // span forward
        if((whichSpans&SPAN_FWD)==0) {
            return -1;
        }
        isForward=TRUE;
    } else {
        // span backward
        if((whichSpans&SPAN_BACK)==0) {
            return -1;
        }
        isForward=FALSE;
    }
    if((type&1)==0) {
        // use USET_SPAN_CONTAINED
        if((whichSpans&SPAN_CONTAINED)==0) {
            return -1;
        }
        contained=USET_SPAN_CONTAINED;
    } else {
        // use USET_SPAN_SIMPLE
        if((whichSpans&SPAN_SIMPLE)==0) {
            return -1;
        }
        contained=USET_SPAN_SIMPLE;
    }

    // Default first span condition for going forward with an uncomplemented set.
    spanCondition=USET_SPAN_NOT_CONTAINED;
    if(isComplement) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    // First span condition for span(), used to terminate the spanBack() iteration.
    firstSpanCondition=spanCondition;

    // spanBack(): Its initial span condition is span()'s last span condition,
    // which is the opposite of span()'s first span condition
    // if we expect an even number of spans.
    // (The loop inverts spanCondition (expectCount-1) times
    // before the expectCount'th span() call.)
    // If we do not compare forward and backward directions, then we do not have an
    // expectCount and just start with firstSpanCondition.
    if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    count=0;
    switch(type) {
    case 0:
    case 1:
        // Forward with the containsSpan helpers, which get an explicit length.
        start=0;
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
                              containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
            // Keep counting past the capacity so that the caller can detect the overflow.
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            if(start>=length) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 2:
    case 3:
        // Forward with the real API; a negative length is passed through
        // so that NUL-terminated input is exercised as well.
        start=0;
        for(;;) {
            start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
                              realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            // Stop at the explicit end of the text, or at the terminating NUL.
            if(length>=0 ? start>=length :
                           isUTF16 ? ((const UChar *)s)[start]==0 :
                                     ((const char *)s)[start]==0
            ) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 4:
    case 5:
        // Backward with the containsSpanBack helpers.
        // Limits are stored from the end of limits[] toward its start.
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            ++count;
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]=length;
            }
            length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
                              containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
            // Done when the start of the text is reached with the condition
            // that a forward iteration would have started with.
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        // Move the limits to the front of the array; count*4 is the byte size of count int32_t values.
        if(count<limitsCapacity) {
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    case 6:
    case 7:
        // Backward with the real API, same bookkeeping as for types 4 and 5.
        for(;;) {
            ++count;
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
            }
            // Note: Length<0 is tested only for the first spanBack().
            // If we wanted to keep length<0 for all spanBack()s, we would have to
            // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
            length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
                              realSet.spanBackUTF8((const char *)s, length, spanCondition);
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        if(count<limitsCapacity) {
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    default:
        // Not reachable after the range check at the top; kept as a safety net.
        typeName="";
        return -1;
    }

    return count;
}
3078
// sets to be tested; odd index=isComplement
// The values are used as indexes into the sets[] arrays and into setNames[].
enum {
    SLOW,       // original set
    SLOW_NOT,   // complement of the original set
    FAST,       // clone of a frozen copy of the original set
    FAST_NOT,   // clone of a frozen copy of the complement
    SET_COUNT   // number of sets, not a set index
};

// Names of the sets for failure messages, indexed by the enum constants above.
static const char *const setNames[SET_COUNT]={
    "slow",
    "slow.not",
    "fast",
    "fast.not"
};
3094
3095 /*
3096 * Verify that we get the same results whether we look at text with contains(),
3097 * span() or spanBack(), using unfrozen or frozen versions of the set,
3098 * and using the set or its complement (switching the spanConditions accordingly).
3099 * The latter verifies that
3100 * set.span(spanCondition) == set.complement().span(!spanCondition).
3101 *
3102 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3103 * or returned to the caller (with an input expectCount<0).
3104 */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3105 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3106 const void *s, int32_t length, UBool isUTF16,
3107 uint32_t whichSpans,
3108 int32_t expectLimits[], int32_t &expectCount,
3109 const char *testName, int32_t index) {
3110 int32_t limits[500];
3111 int32_t limitsCount;
3112 int i, j;
3113
3114 const char *typeName;
3115 int type;
3116
3117 for(i=0; i<SET_COUNT; ++i) {
3118 if((i&1)==0) {
3119 // Even-numbered sets are original, uncomplemented sets.
3120 if((whichSpans&SPAN_SET)==0) {
3121 continue;
3122 }
3123 } else {
3124 // Odd-numbered sets are complemented.
3125 if((whichSpans&SPAN_COMPLEMENT)==0) {
3126 continue;
3127 }
3128 }
3129 for(type=0;; ++type) {
3130 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3131 s, length, isUTF16,
3132 whichSpans,
3133 type, typeName,
3134 limits, UPRV_LENGTHOF(limits), expectCount);
3135 if(typeName[0]==0) {
3136 break; // All types tried.
3137 }
3138 if(limitsCount<0) {
3139 continue; // Span option filtered out.
3140 }
3141 if(expectCount<0) {
3142 expectCount=limitsCount;
3143 if(limitsCount>UPRV_LENGTHOF(limits)) {
3144 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3145 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3146 return;
3147 }
3148 memcpy(expectLimits, limits, limitsCount*4);
3149 } else if(limitsCount!=expectCount) {
3150 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3151 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3152 } else {
3153 for(j=0; j<limitsCount; ++j) {
3154 if(limits[j]!=expectLimits[j]) {
3155 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3156 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3157 j, (long)limits[j], (long)expectLimits[j]);
3158 break;
3159 }
3160 }
3161 }
3162 }
3163 }
3164
3165 // Compare span() with containsAll()/containsNone(),
3166 // but only if we have expectLimits[] from the uncomplemented set.
3167 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3168 const UChar *s16=(const UChar *)s;
3169 UnicodeString string;
3170 int32_t prev=0, limit, length;
3171 for(i=0; i<expectCount; ++i) {
3172 limit=expectLimits[i];
3173 length=limit-prev;
3174 if(length>0) {
3175 string.setTo(FALSE, s16+prev, length); // read-only alias
3176 if(i&1) {
3177 if(!sets[SLOW]->getSet().containsAll(string)) {
3178 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3179 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3180 return;
3181 }
3182 if(!sets[FAST]->getSet().containsAll(string)) {
3183 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3184 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3185 return;
3186 }
3187 } else {
3188 if(!sets[SLOW]->getSet().containsNone(string)) {
3189 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3190 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3191 return;
3192 }
3193 if(!sets[FAST]->getSet().containsNone(string)) {
3194 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3195 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3196 return;
3197 }
3198 }
3199 }
3200 prev=limit;
3201 }
3202 }
3203 }
3204
3205 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3206 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3207 const void *s, int32_t length, UBool isUTF16,
3208 uint32_t whichSpans,
3209 const char *testName, int32_t index) {
3210 int32_t expectLimits[500];
3211 int32_t expectCount=-1;
3212 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3213 }
3214
stringContainsUnpairedSurrogate(const UChar * s,int32_t length)3215 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3216 UChar c, c2;
3217
3218 if(length>=0) {
3219 while(length>0) {
3220 c=*s++;
3221 --length;
3222 if(0xd800<=c && c<0xe000) {
3223 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3224 return TRUE;
3225 }
3226 --length;
3227 }
3228 }
3229 } else {
3230 while((c=*s++)!=0) {
3231 if(0xd800<=c && c<0xe000) {
3232 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3233 return TRUE;
3234 }
3235 }
3236 }
3237 }
3238 return FALSE;
3239 }
3240
3241 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3242 // unless either UTF is turned off in whichSpans.
3243 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3244 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const UChar * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3245 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3246 const UChar *s16, int32_t length16,
3247 uint32_t whichSpans,
3248 const char *testName, int32_t index) {
3249 int32_t expectLimits[500];
3250 int32_t expectCount;
3251
3252 expectCount=-1; // Get expectLimits[] from testSpan().
3253
3254 if((whichSpans&SPAN_UTF16)!=0) {
3255 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3256 }
3257 if((whichSpans&SPAN_UTF8)==0) {
3258 return;
3259 }
3260
3261 // Convert s16[] and expectLimits[] to UTF-8.
3262 uint8_t s8[3000];
3263 int32_t offsets[3000];
3264
3265 const UChar *s16Limit=s16+length16;
3266 char *t=(char *)s8;
3267 char *tLimit=t+sizeof(s8);
3268 int32_t *o=offsets;
3269 UErrorCode errorCode=U_ZERO_ERROR;
3270
3271 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3272 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3273 if(U_FAILURE(errorCode)) {
3274 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3275 testName, (long)index, u_errorName(errorCode));
3276 ucnv_resetFromUnicode(utf8Cnv);
3277 return;
3278 }
3279 int32_t length8=(int32_t)(t-(char *)s8);
3280
3281 // Convert expectLimits[].
3282 int32_t i, j, expect;
3283 for(i=j=0; i<expectCount; ++i) {
3284 expect=expectLimits[i];
3285 if(expect==length16) {
3286 expectLimits[i]=length8;
3287 } else {
3288 while(offsets[j]<expect) {
3289 ++j;
3290 }
3291 expectLimits[i]=j;
3292 }
3293 }
3294
3295 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3296 }
3297
nextCodePoint(UChar32 c)3298 static UChar32 nextCodePoint(UChar32 c) {
3299 // Skip some large and boring ranges.
3300 switch(c) {
3301 case 0x3441:
3302 return 0x4d7f;
3303 case 0x5100:
3304 return 0x9f00;
3305 case 0xb040:
3306 return 0xd780;
3307 case 0xe041:
3308 return 0xf8fe;
3309 case 0x10100:
3310 return 0x20000;
3311 case 0x20041:
3312 return 0xe0000;
3313 case 0xe0101:
3314 return 0x10fffd;
3315 default:
3316 return c+1;
3317 }
3318 }
3319
3320 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3321 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3322 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3323 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3324 // Skip the UTF-8 part of the test - if the string contains surrogates -
3325 // because it is likely to produce a different result.
3326 UBool inconsistentSurrogates=
3327 (!(sets[0]->getSet().contains(0xfffd) ?
3328 sets[0]->getSet().contains(0xd800, 0xdfff) :
3329 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3330 sets[0]->hasStringsWithSurrogates());
3331
3332 UChar s[1000];
3333 int32_t length=0;
3334 uint32_t localWhichSpans;
3335
3336 UChar32 c, first;
3337 for(first=c=0;; c=nextCodePoint(c)) {
3338 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3339 localWhichSpans=whichSpans;
3340 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3341 localWhichSpans&=~SPAN_UTF8;
3342 }
3343 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3344 if(c>0x10ffff) {
3345 break;
3346 }
3347 length=0;
3348 first=c;
3349 }
3350 U16_APPEND_UNSAFE(s, length, c);
3351 }
3352 }
3353
3354 // Test with a particular, interesting string.
3355 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3356 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3357 static const UChar s[]={
3358 0x61, 0x62, 0x20, // Latin, space
3359 0x3b1, 0x3b2, 0x3b3, // Greek
3360 0xd900, // lead surrogate
3361 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3362 0xdc05, // trail surrogate
3363 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3364 0xd900, 0xdc05, // unassigned supplementary
3365 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3366 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3367 0 // NUL
3368 };
3369
3370 if((whichSpans&SPAN_UTF16)==0) {
3371 return;
3372 }
3373 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3374 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3375 }
3376
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3377 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3378 static const char s[]={
3379 "abc" // Latin
3380
3381 /* trail byte in lead position */
3382 "\x80"
3383
3384 " " // space
3385
3386 /* truncated multi-byte sequences */
3387 "\xd0"
3388 "\xe0"
3389 "\xe1"
3390 "\xed"
3391 "\xee"
3392 "\xf0"
3393 "\xf1"
3394 "\xf4"
3395 "\xf8"
3396 "\xfc"
3397
3398 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3399
3400 /* trail byte in lead position */
3401 "\x80"
3402
3403 "\xe0\x80"
3404 "\xe0\xa0"
3405 "\xe1\x80"
3406 "\xed\x80"
3407 "\xed\xa0"
3408 "\xee\x80"
3409 "\xf0\x80"
3410 "\xf0\x90"
3411 "\xf1\x80"
3412 "\xf4\x80"
3413 "\xf4\x90"
3414 "\xf8\x80"
3415 "\xfc\x80"
3416
3417 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3418
3419 /* trail byte in lead position */
3420 "\x80"
3421
3422 "\xf0\x80\x80"
3423 "\xf0\x90\x80"
3424 "\xf1\x80\x80"
3425 "\xf4\x80\x80"
3426 "\xf4\x90\x80"
3427 "\xf8\x80\x80"
3428 "\xfc\x80\x80"
3429
3430 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3431
3432 /* trail byte in lead position */
3433 "\x80"
3434
3435 "\xf8\x80\x80\x80"
3436 "\xfc\x80\x80\x80"
3437
3438 "\xF1\x90\x80\x85" // unassigned supplementary
3439
3440 /* trail byte in lead position */
3441 "\x80"
3442
3443 "\xfc\x80\x80\x80\x80"
3444
3445 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3446
3447 /* trail byte in lead position */
3448 "\x80"
3449
3450 /* complete sequences but non-shortest forms or out of range etc. */
3451 "\xc0\x80"
3452 "\xe0\x80\x80"
3453 "\xed\xa0\x80"
3454 "\xf0\x80\x80\x80"
3455 "\xf4\x90\x80\x80"
3456 "\xf8\x80\x80\x80\x80"
3457 "\xfc\x80\x80\x80\x80\x80"
3458 "\xfe"
3459 "\xff"
3460
3461 /* trail byte in lead position */
3462 "\x80"
3463
3464 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3465 };
3466
3467 if((whichSpans&SPAN_UTF8)==0) {
3468 return;
3469 }
3470 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3471 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3472 }
3473
// Splits every entry of a list of span option sets into alternatives.
// Each existing entry is first reduced with mask. The entry itself then gets
// option a; if b!=0, a copy with option b is stored whichSpansCount entries
// further on; and if additionally c!=0, a copy with option c is stored
// 2*whichSpansCount entries further on.
// (c is ignored when b==0.)
// The caller must provide room for the additional entries.
// Returns the new number of entries: 1, 2 or 3 times whichSpansCount.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many alternatives each original entry turns into.
    int32_t alternatives=1;
    if(b!=0) {
        alternatives= c!=0 ? 3 : 2;
    }

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(alternatives>=2) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(alternatives==3) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }
    return alternatives*whichSpansCount;
}
3496
// String literals consisting of 63 or 64 repetitions of 'a' or 'b'.
// They are concatenated to build long set strings and test strings.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3501
TestSpan()3502 void UnicodeSetTest::TestSpan() {
3503 // "[...]" is a UnicodeSet pattern.
3504 // "*" performs tests on all Unicode code points and on a selection of
3505 // malformed UTF-8/16 strings.
3506 // "-options" limits the scope of testing for the current set.
3507 // By default, the test verifies that equivalent boundaries are found
3508 // for UTF-16 and UTF-8, going forward and backward,
3509 // alternating USET_SPAN_NOT_CONTAINED with
3510 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3511 // Single-character options:
3512 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3513 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3514 // or the set contains strings with unpaired surrogates
3515 // which do not translate to valid UTF-8.
3516 // c -- set.span() and set.complement().span() boundaries may differ.
3517 // Cause: Set strings are not complemented.
3518 // b -- span() and spanBack() boundaries may differ.
3519 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3520 // and spanBack(USET_SPAN_SIMPLE) are defined to
3521 // match with non-overlapping substrings.
3522 // For example, with a set containing "ab" and "ba",
3523 // span() of "aba" yields boundaries { 0, 2, 3 }
3524 // because the initial "ab" matches from 0 to 2,
3525 // while spanBack() yields boundaries { 0, 1, 3 }
3526 // because the final "ba" matches from 1 to 3.
3527 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3528 // Cause: Strings in the set overlap, and a longer match may
3529 // require a sequence including non-longest substrings.
3530 // For example, with a set containing "ab", "abc" and "cd",
3531 // span(contained) of "abcd" spans the entire string
3532 // but span(longest match) only spans the first 3 characters.
3533 // Each "-options" first resets all options and then applies the specified options.
3534 // A "-" without options resets the options.
3535 // The options are also reset for each new set.
3536 // Other strings will be spanned.
3537 static const char *const testdata[]={
3538 "[:ID_Continue:]",
3539 "*",
3540 "[:White_Space:]",
3541 "*",
3542 "[]",
3543 "*",
3544 "[\\u0000-\\U0010FFFF]",
3545 "*",
3546 "[\\u0000\\u0080\\u0800\\U00010000]",
3547 "*",
3548 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3549 "*",
3550 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3551 "-c",
3552 "*",
3553 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3554 "-c",
3555 "*",
3556
3557 // Overlapping strings cause overlapping attempts to match.
3558 "[x{xy}{xya}{axy}{ax}]",
3559 "-cl",
3560
3561 // More repetitions of "xya" would take too long with the recursive
3562 // reference implementation.
3563 // containsAll()=FALSE
3564 // test_string 0x14
3565 "xx"
3566 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3567 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3568 "xyaxyaxyaxya"
3569 "xx"
3570 "xyaxyaxyaxya" // span() ends here.
3571 "aaa",
3572
3573 // containsAll()=TRUE
3574 // test_string 0x15
3575 "xx"
3576 "xyaxyaxyaxya"
3577 "xx"
3578 "xyaxyaxyaxya"
3579 "xx"
3580 "xyaxyaxyaxy",
3581
3582 "-bc",
3583 // test_string 0x17
3584 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3585 "-c",
3586 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3587 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3588 "-",
3589 "byaya", // span() -> { 5 }
3590 "byay", // span() -> { 4 }
3591 "bya", // span() -> { 3 }
3592
3593 // span(longest match) will not span the whole string.
3594 "[a{ab}{bc}]",
3595 "-cl",
3596 // test_string 0x21
3597 "abc",
3598
3599 "[a{ab}{abc}{cd}]",
3600 "-cl",
3601 "acdabcdabccd",
3602
3603 // spanBack(longest match) will not span the whole string.
3604 "[c{ab}{bc}]",
3605 "-cl",
3606 "abc",
3607
3608 "[d{cd}{bcd}{ab}]",
3609 "-cl",
3610 "abbcdabcdabd",
3611
3612 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3613 // and UTF-8 trail bytes.
3614 // Copies of above test sets and strings, but transliterated to have
3615 // different code points with similar trail units.
3616 // Previous: a b c d
3617 // Unicode: 042B 30AB 200AB 204AB
3618 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3619 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3620 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3621 "-cl",
3622 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3623
3624 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3625 "-cl",
3626 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3627
3628 // Stress bookkeeping and recursion.
3629 // The following strings are barely doable with the recursive
3630 // reference implementation.
3631 // The not-contained character at the end prevents an early exit from the span().
3632 "[b{bb}]",
3633 "-c",
3634 // test_string 0x33
3635 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3636 // On complement sets, span() and spanBack() get different results
3637 // because b is not in the complement set and there is an odd number of b's
3638 // in the test string.
3639 "-bc",
3640 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3641
3642 // Test with set strings with an initial or final code point span
3643 // longer than 254.
3644 "[a{" _64_a _64_a _64_a _64_a "b}"
3645 "{a" _64_b _64_b _64_b _64_b "}]",
3646 "-c",
3647 _64_a _64_a _64_a _63_a "b",
3648 _64_a _64_a _64_a _64_a "b",
3649 _64_a _64_a _64_a _64_a "aaaabbbb",
3650 "a" _64_b _64_b _64_b _63_b,
3651 "a" _64_b _64_b _64_b _64_b,
3652 "aaaabbbb" _64_b _64_b _64_b _64_b,
3653
3654 // Test with strings containing unpaired surrogates.
3655 // They are not representable in UTF-8, and a leading trail surrogate
3656 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3657 // U+20001 == \\uD840\\uDC01
3658 // U+20400 == \\uD841\\uDC00
3659 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3660 "-8cl",
3661 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3662 };
3663 uint32_t whichSpans[96]={ SPAN_ALL };
3664 int32_t whichSpansCount=1;
3665
3666 UnicodeSet *sets[SET_COUNT]={ NULL };
3667 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3668
3669 char testName[1024];
3670 char *testNameLimit=testName;
3671
3672 int32_t i, j;
3673 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3674 const char *s=testdata[i];
3675 if(s[0]=='[') {
3676 // Create new test sets from this pattern.
3677 for(j=0; j<SET_COUNT; ++j) {
3678 delete sets_with_str[j];
3679 delete sets[j];
3680 }
3681 UErrorCode errorCode=U_ZERO_ERROR;
3682 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3683 if(U_FAILURE(errorCode)) {
3684 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3685 break;
3686 }
3687 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3688 sets[SLOW_NOT]->complement();
3689 // Intermediate set: Test cloning of a frozen set.
3690 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3691 fast->freeze();
3692 sets[FAST]=(UnicodeSet *)fast->clone();
3693 delete fast;
3694 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3695 fastNot->freeze();
3696 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3697 delete fastNot;
3698
3699 for(j=0; j<SET_COUNT; ++j) {
3700 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3701 }
3702
3703 strcpy(testName, s);
3704 testNameLimit=strchr(testName, 0);
3705 *testNameLimit++=':';
3706 *testNameLimit=0;
3707
3708 whichSpans[0]=SPAN_ALL;
3709 whichSpansCount=1;
3710 } else if(s[0]=='-') {
3711 whichSpans[0]=SPAN_ALL;
3712 whichSpansCount=1;
3713
3714 while(*++s!=0) {
3715 switch(*s) {
3716 case 'c':
3717 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3718 ~SPAN_POLARITY,
3719 SPAN_SET,
3720 SPAN_COMPLEMENT,
3721 0);
3722 break;
3723 case 'b':
3724 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3725 ~SPAN_DIRS,
3726 SPAN_FWD,
3727 SPAN_BACK,
3728 0);
3729 break;
3730 case 'l':
3731 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3732 // USET_SPAN_SIMPLE only FWD, and separately
3733 // USET_SPAN_SIMPLE only BACK
3734 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3735 ~(SPAN_DIRS|SPAN_CONDITION),
3736 SPAN_DIRS|SPAN_CONTAINED,
3737 SPAN_FWD|SPAN_SIMPLE,
3738 SPAN_BACK|SPAN_SIMPLE);
3739 break;
3740 case '8':
3741 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3742 ~SPAN_UTFS,
3743 SPAN_UTF16,
3744 SPAN_UTF8,
3745 0);
3746 break;
3747 default:
3748 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3749 break;
3750 }
3751 }
3752 } else if(0==strcmp(s, "*")) {
3753 strcpy(testNameLimit, "bad_string");
3754 for(j=0; j<whichSpansCount; ++j) {
3755 if(whichSpansCount>1) {
3756 sprintf(testNameLimit+10 /* strlen("bad_string") */,
3757 "%%0x%3x",
3758 whichSpans[j]);
3759 }
3760 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3761 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3762 }
3763
3764 strcpy(testNameLimit, "contents");
3765 for(j=0; j<whichSpansCount; ++j) {
3766 if(whichSpansCount>1) {
3767 sprintf(testNameLimit+8 /* strlen("contents") */,
3768 "%%0x%3x",
3769 whichSpans[j]);
3770 }
3771 testSpanContents(sets_with_str, whichSpans[j], testName);
3772 }
3773 } else {
3774 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3775 strcpy(testNameLimit, "test_string");
3776 for(j=0; j<whichSpansCount; ++j) {
3777 if(whichSpansCount>1) {
3778 sprintf(testNameLimit+11 /* strlen("test_string") */,
3779 "%%0x%3x",
3780 whichSpans[j]);
3781 }
3782 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3783 }
3784 }
3785 }
3786 for(j=0; j<SET_COUNT; ++j) {
3787 delete sets_with_str[j];
3788 delete sets[j];
3789 }
3790 }
3791
3792 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3793 void UnicodeSetTest::TestStringSpan() {
3794 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3795 static const char *const string=
3796 "xx"
3797 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3798 "xx"
3799 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3800 "xx"
3801 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3802 "aaaa";
3803
3804 UErrorCode errorCode=U_ZERO_ERROR;
3805 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3806 UnicodeSet set(pattern16, errorCode);
3807 if(U_FAILURE(errorCode)) {
3808 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3809 return;
3810 }
3811
3812 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3813
3814 if(set.containsAll(string16)) {
3815 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3816 }
3817
3818 // Remove trailing "aaaa".
3819 string16.truncate(string16.length()-4);
3820 if(!set.containsAll(string16)) {
3821 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3822 }
3823
3824 string16=UNICODE_STRING_SIMPLE("byayaxya");
3825 const UChar *s16=string16.getBuffer();
3826 int32_t length16=string16.length();
3827 (void)length16; // Suppress set but not used warning.
3828 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3829 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3830 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3831 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3832 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3833 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3834 ) {
3835 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3836 }
3837
3838 pattern="[a{ab}{abc}{cd}]";
3839 pattern16=UnicodeString(pattern, -1, US_INV);
3840 set.applyPattern(pattern16, errorCode);
3841 if(U_FAILURE(errorCode)) {
3842 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3843 return;
3844 }
3845 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3846 s16=string16.getBuffer();
3847 length16=string16.length();
3848 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3849 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3850 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3851 ) {
3852 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3853 }
3854
3855 pattern="[d{cd}{bcd}{ab}]";
3856 pattern16=UnicodeString(pattern, -1, US_INV);
3857 set.applyPattern(pattern16, errorCode).freeze();
3858 if(U_FAILURE(errorCode)) {
3859 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3860 return;
3861 }
3862 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3863 s16=string16.getBuffer();
3864 length16=string16.length();
3865 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3866 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3867 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3868 ) {
3869 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3870 }
3871 }
3872
3873 /**
3874 * Including collationroot.h fails here with
3875 1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
3876 * .. so, we skip this test on Windows.
3877 *
3878 * the cause is that intltest builds with /Za which disables language extensions - which means
3879 * windows header files can't be used.
3880 */
3881 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
3882 #include "collationroot.h"
3883 #include "collationtailoring.h"
3884 #endif
3885
TestUCAUnsafeBackwards()3886 void UnicodeSetTest::TestUCAUnsafeBackwards() {
3887 #if U_PLATFORM_HAS_WIN32_API
3888 infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
3889 #elif !UCONFIG_NO_COLLATION
3890 UErrorCode errorCode = U_ZERO_ERROR;
3891
3892 // Get the unsafeBackwardsSet
3893 const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
3894 if(U_FAILURE(errorCode)) {
3895 dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
3896 return;
3897 }
3898 //const UVersionInfo &version = rootEntry->tailoring->version;
3899 const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
3900
3901 checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
3902
3903 if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
3904 // simple test case
3905 // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
3906 // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
3907 UnicodeSet surrogates;
3908 surrogates.add(0xd83a); // a lead surrogate
3909 surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
3910 UnicodeString pat;
3911 surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
3912 // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
3913 // so that at least one type of surrogate code points are escaped,
3914 // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
3915 errorCode = U_ZERO_ERROR;
3916 UnicodeSet s2;
3917 s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
3918 if(U_FAILURE(errorCode)) {
3919 errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
3920 } else {
3921 checkEqual(surrogates, s2, "surrogates to/from pattern");
3922 }
3923 // This occurs in the UCA unsafe-backwards set.
3924 checkRoundTrip(*unsafeBackwardSet);
3925 }
3926 #endif
3927 }
3928