1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 * 03/22/2000 Madhu Added additional tests
11 ********************************************************************************
12 */
13
14 #include <stdio.h>
15
16 #include <string.h>
17 #include <unordered_map>
18 #include "unicode/utypes.h"
19 #include "usettest.h"
20 #include "unicode/ucnv.h"
21 #include "unicode/uniset.h"
22 #include "unicode/uchar.h"
23 #include "unicode/usetiter.h"
24 #include "unicode/ustring.h"
25 #include "unicode/parsepos.h"
26 #include "unicode/symtable.h"
27 #include "unicode/utf8.h"
28 #include "unicode/utf16.h"
29 #include "unicode/uversion.h"
30 #include "cmemory.h"
31 #include "hash.h"
32
// Reports a data error naming the source file, the line and the ICU error name
// when `status` holds a failure code. The macro only reports; it does not
// return from the calling function.
#define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
                  u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END
39
// Reports a data error naming the source file and line when `expr` evaluates
// to false. The macro only reports; it does not return from the calling
// function.
#define TEST_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
45
/**
 * Appends the escaped pattern form of a UnicodeSet to a string, so that sets
 * can be concatenated directly into log and error messages.
 */
UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pattern;
    return left + UnicodeSetTest::escape(set.toPattern(pattern));
}
51
UnicodeSetTest()52 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(nullptr) {
53 }
54
openUTF8Converter()55 UConverter *UnicodeSetTest::openUTF8Converter() {
56 if(utf8Cnv==nullptr) {
57 UErrorCode errorCode=U_ZERO_ERROR;
58 utf8Cnv=ucnv_open("UTF-8", &errorCode);
59 }
60 return utf8Cnv;
61 }
62
~UnicodeSetTest()63 UnicodeSetTest::~UnicodeSetTest() {
64 ucnv_close(utf8Cnv);
65 }
66
/**
 * Test-suite entry point. Logs the suite name when `exec` is set, then hands
 * over to the TESTCASE_AUTO macro list, which enumerates the test methods of
 * this class. The macros are defined outside this file; presumably they use
 * `index`, `exec` and `name` to select and run a test, so the order of the
 * entries should be treated as significant. The last parameter is unused.
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    if (exec) {
        logln(u"TestSuite UnicodeSetTest");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestPatterns);
    TESTCASE_AUTO(TestAddRemove);
    TESTCASE_AUTO(TestCategories);
    TESTCASE_AUTO(TestCloneEqualHash);
    TESTCASE_AUTO(TestMinimalRep);
    TESTCASE_AUTO(TestAPI);
    TESTCASE_AUTO(TestScriptSet);
    TESTCASE_AUTO(TestPropertySet);
    TESTCASE_AUTO(TestClone);
    TESTCASE_AUTO(TestExhaustive);
    TESTCASE_AUTO(TestToPattern);
    TESTCASE_AUTO(TestIndexOf);
    TESTCASE_AUTO(TestStrings);
    TESTCASE_AUTO(Testj2268);
    TESTCASE_AUTO(TestCloseOver);
    TESTCASE_AUTO(TestCloseOverSimpleCaseFolding);
    TESTCASE_AUTO(TestCloseOverLargeSets);
    TESTCASE_AUTO(TestEscapePattern);
    TESTCASE_AUTO(TestInvalidCodePoint);
    TESTCASE_AUTO(TestSymbolTable);
    TESTCASE_AUTO(TestSurrogate);
    TESTCASE_AUTO(TestPosixClasses);
    TESTCASE_AUTO(TestIteration);
    TESTCASE_AUTO(TestFreezable);
    TESTCASE_AUTO(TestSpan);
    TESTCASE_AUTO(TestStringSpan);
    TESTCASE_AUTO(TestPatternWithSurrogates);
    TESTCASE_AUTO(TestIntOverflow);
    TESTCASE_AUTO(TestUnusedCcc);
    TESTCASE_AUTO(TestDeepPattern);
    TESTCASE_AUTO(TestEmptyString);
    TESTCASE_AUTO(TestSkipToStrings);
    TESTCASE_AUTO(TestPatternCodePointComplement);
    TESTCASE_AUTO_END;
}
109
// Marker placed inside the expected-string arrays passed to expectToPattern().
// In TestToPattern() the entries after it are strings that are not in the set,
// so it presumably separates "must contain" from "must not contain" entries
// (expectToPattern() is defined elsewhere; confirm there).
static const char NOT[] = "%%%%";
111
112 /**
113 * UVector was improperly copying contents
114 * This code will crash this is still true
115 */
Testj2268()116 void UnicodeSetTest::Testj2268() {
117 UnicodeSet t;
118 t.add(UnicodeString("abc"));
119 UnicodeSet test(t);
120 UnicodeString ustrPat;
121 test.toPattern(ustrPat, true);
122 }
123
124 /**
125 * Test toPattern().
126 */
TestToPattern()127 void UnicodeSetTest::TestToPattern() {
128 UErrorCode ec = U_ZERO_ERROR;
129
130 // Test that toPattern() round trips with syntax characters and
131 // whitespace.
132 {
133 static const char* OTHER_TOPATTERN_TESTS[] = {
134 "[[:latin:]&[:greek:]]",
135 "[[:latin:]-[:greek:]]",
136 "[:nonspacing mark:]",
137 nullptr
138 };
139
140 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=nullptr; ++j) {
141 ec = U_ZERO_ERROR;
142 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
143 if (U_FAILURE(ec)) {
144 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
145 continue;
146 }
147 checkPat(OTHER_TOPATTERN_TESTS[j], s);
148 }
149
150 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
151 if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
152
153 // check various combinations to make sure they all work.
154 if (i != 0 && !toPatternAux(i, i)){
155 continue;
156 }
157 if (!toPatternAux(0, i)){
158 continue;
159 }
160 if (!toPatternAux(i, 0xFFFF)){
161 continue;
162 }
163 }
164 }
165 }
166
167 // Test pattern behavior of multicharacter strings.
168 {
169 ec = U_ZERO_ERROR;
170 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
171
172 // This loop isn't a loop. It's here to make the compiler happy.
173 // If you're curious, try removing it and changing the 'break'
174 // statements (except for the last) to goto's.
175 for (;;) {
176 if (U_FAILURE(ec)) break;
177 const char* exp1[] = {"aa", "ab", NOT, "ac", nullptr};
178 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
179
180 s->add("ac");
181 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", nullptr};
182 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
183
184 s->applyPattern(u"[a-z {\\{l} {r\\}}]", ec);
185 if (U_FAILURE(ec)) break;
186 const char* exp3[] = {"{l", "r}", NOT, "xy", nullptr};
187 expectToPattern(*s, u"[a-z{r\\}}{\\{l}]", exp3);
188
189 s->add("[]");
190 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", nullptr};
191 expectToPattern(*s, u"[a-z{\\[\\]}{r\\}}{\\{l}]", exp4);
192
193 s->applyPattern(u"[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
194 if (U_FAILURE(ec)) break;
195 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", nullptr};
196 expectToPattern(*s, u"[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);
197
198 // j2189
199 s->clear();
200 s->add(UnicodeString("abc", ""));
201 s->add(UnicodeString("abc", ""));
202 const char* exp6[] = {"abc", NOT, "ab", nullptr};
203 expectToPattern(*s, "[{abc}]", exp6);
204
205 break;
206 }
207
208 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
209 delete s;
210 }
211
212 // JB#3400: For 2 character ranges prefer [ab] to [a-b]
213 UnicodeSet s;
214 s.add(u'a', u'b');
215 expectToPattern(s, "[ab]", nullptr);
216 }
217
toPatternAux(UChar32 start,UChar32 end)218 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
219
220 // use Integer.toString because Utility.hex doesn't handle ints
221 UnicodeString pat = "";
222 // TODO do these in hex
223 //String source = "0x" + Integer.toString(start,16).toUpperCase();
224 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
225 UnicodeString source;
226 source = source + (uint32_t)start;
227 if (start != end)
228 source = source + ".." + (uint32_t)end;
229 UnicodeSet testSet;
230 testSet.add(start, end);
231 return checkPat(source, testSet);
232 }
233
checkPat(const UnicodeString & source,const UnicodeSet & testSet)234 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
235 const UnicodeSet& testSet) {
236 // What we want to make sure of is that a pattern generated
237 // by toPattern(), with or without escaped unprintables, can
238 // be passed back into the UnicodeSet constructor.
239 UnicodeString pat0;
240
241 testSet.toPattern(pat0, true);
242
243 if (!checkPat(source + " (escaped)", testSet, pat0)) return false;
244
245 //String pat1 = unescapeLeniently(pat0);
246 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
247
248 UnicodeString pat2;
249 testSet.toPattern(pat2, false);
250 if (!checkPat(source, testSet, pat2)) return false;
251
252 //String pat3 = unescapeLeniently(pat2);
253 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
254
255 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
256 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
257 return true;
258 }
259
checkPat(const UnicodeString & source,const UnicodeSet & testSet,const UnicodeString & pat)260 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
261 const UnicodeSet& testSet,
262 const UnicodeString& pat) {
263 UErrorCode ec = U_ZERO_ERROR;
264 UnicodeSet testSet2(pat, ec);
265 if (testSet2 != testSet) {
266 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
267 return false;
268 }
269 return true;
270 }
271
272 void
TestPatterns()273 UnicodeSetTest::TestPatterns() {
274 UnicodeSet set;
275 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");
276 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");
277 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");
278 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");
279 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");
280 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
281
282 // Throw in a test of complement
283 set.complement();
284 UnicodeString exp;
285 exp.append((char16_t)0x0000).append("aeeoouu").append((char16_t)(u'z'+1)).append((char16_t)0xFFFF);
286 expectPairs(set, exp);
287 }
288
289 void
TestCategories()290 UnicodeSetTest::TestCategories() {
291 UErrorCode status = U_ZERO_ERROR;
292 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
293 UnicodeSet set(pat, status);
294 if (U_FAILURE(status)) {
295 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
296 return;
297 } else {
298 expectContainment(set, pat, "ABC", "abc");
299 }
300
301 UChar32 i;
302 int32_t failures = 0;
303 // Make sure generation of L doesn't pollute cached Lu set
304 // First generate L, then Lu
305 set.applyPattern("[:L:]", status);
306 if (U_FAILURE(status)) { errln("FAIL"); return; }
307 for (i=0; i<0x200; ++i) {
308 UBool l = u_isalpha((char16_t)i);
309 if (l != set.contains(i)) {
310 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
311 set.contains(i));
312 if (++failures == 10) break;
313 }
314 }
315
316 set.applyPattern("[:Lu:]", status);
317 if (U_FAILURE(status)) { errln("FAIL"); return; }
318 for (i=0; i<0x200; ++i) {
319 UBool lu = (u_charType((char16_t)i) == U_UPPERCASE_LETTER);
320 if (lu != set.contains(i)) {
321 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
322 set.contains(i));
323 if (++failures == 20) break;
324 }
325 }
326 }
327 void
TestCloneEqualHash()328 UnicodeSetTest::TestCloneEqualHash() {
329 UErrorCode status = U_ZERO_ERROR;
330 // set1 and set2 used to be built with the obsolete constructor taking
331 // UCharCategory values; replaced with pattern constructors
332 // markus 20030502
333 UnicodeSet *set1=new UnicodeSet(u"\\p{Lowercase Letter}", status); // :Ll: Letter, lowercase
334 UnicodeSet *set1a=new UnicodeSet(u"[:Ll:]", status); // Letter, lowercase
335 if (U_FAILURE(status)){
336 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
337 return;
338 }
339 UnicodeSet *set2=new UnicodeSet(u"\\p{Decimal Number}", status); //Number, Decimal digit
340 UnicodeSet *set2a=new UnicodeSet(u"[:Nd:]", status); //Number, Decimal digit
341 if (U_FAILURE(status)){
342 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
343 return;
344 }
345
346 if (*set1 != *set1a) {
347 errln("FAIL: category constructor for Ll broken");
348 }
349 if (*set2 != *set2a) {
350 errln("FAIL: category constructor for Nd broken");
351 }
352 delete set1a;
353 delete set2a;
354
355 logln("Testing copy construction");
356 UnicodeSet *set1copy=new UnicodeSet(*set1);
357 if(*set1 != *set1copy || *set1 == *set2 ||
358 getPairs(*set1) != getPairs(*set1copy) ||
359 set1->hashCode() != set1copy->hashCode()){
360 errln("FAIL : Error in copy construction");
361 return;
362 }
363
364 logln("Testing =operator");
365 UnicodeSet set1equal=*set1;
366 UnicodeSet set2equal=*set2;
367 if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
368 set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
369 errln("FAIL: Error in =operator");
370 }
371
372 logln("Testing clone()");
373 UnicodeSet *set1clone=set1->clone();
374 UnicodeSet *set2clone=set2->clone();
375 if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
376 *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
377 *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
378 errln("FAIL: Error in clone");
379 }
380
381 logln("Testing hashcode");
382 if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
383 set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
384 set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
385 set1->hashCode() == set2->hashCode() || set1copy->hashCode() == set2->hashCode() ||
386 set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
387 errln("FAIL: Error in hashCode()");
388 }
389
390 delete set1;
391 delete set1copy;
392 delete set2;
393 delete set1clone;
394 delete set2clone;
395
396
397 }
398 void
TestAddRemove()399 UnicodeSetTest::TestAddRemove() {
400 UnicodeSet set; // Construct empty set
401 doAssert(set.isEmpty() == true, "set should be empty");
402 doAssert(set.size() == 0, "size should be 0");
403 set.complement();
404 doAssert(set.size() == 0x110000, "size should be 0x110000");
405 set.clear();
406 set.add(0x0061, 0x007a);
407 expectPairs(set, "az");
408 doAssert(set.isEmpty() == false, "set should not be empty");
409 doAssert(set.size() != 0, "size should not be equal to 0");
410 doAssert(set.size() == 26, "size should be equal to 26");
411 set.remove(0x006d, 0x0070);
412 expectPairs(set, "alqz");
413 doAssert(set.size() == 22, "size should be equal to 22");
414 set.remove(0x0065, 0x0067);
415 expectPairs(set, "adhlqz");
416 doAssert(set.size() == 19, "size should be equal to 19");
417 set.remove(0x0064, 0x0069);
418 expectPairs(set, "acjlqz");
419 doAssert(set.size() == 16, "size should be equal to 16");
420 set.remove(0x0063, 0x0072);
421 expectPairs(set, "absz");
422 doAssert(set.size() == 10, "size should be equal to 10");
423 set.add(0x0066, 0x0071);
424 expectPairs(set, "abfqsz");
425 doAssert(set.size() == 22, "size should be equal to 22");
426 set.remove(0x0061, 0x0067);
427 expectPairs(set, "hqsz");
428 set.remove(0x0061, 0x007a);
429 expectPairs(set, "");
430 doAssert(set.isEmpty() == true, "set should be empty");
431 doAssert(set.size() == 0, "size should be 0");
432 set.add(0x0061);
433 doAssert(set.isEmpty() == false, "set should not be empty");
434 doAssert(set.size() == 1, "size should not be equal to 1");
435 set.add(0x0062);
436 set.add(0x0063);
437 expectPairs(set, "ac");
438 doAssert(set.size() == 3, "size should not be equal to 3");
439 set.add(0x0070);
440 set.add(0x0071);
441 expectPairs(set, "acpq");
442 doAssert(set.size() == 5, "size should not be equal to 5");
443 set.clear();
444 expectPairs(set, "");
445 doAssert(set.isEmpty() == true, "set should be empty");
446 doAssert(set.size() == 0, "size should be 0");
447
448 // Try removing an entire set from another set
449 expectPattern(set, "[c-x]", "cx");
450 UnicodeSet set2;
451 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
452 set.removeAll(set2);
453 expectPairs(set, "deluxx");
454
455 // Try adding an entire set to another set
456 expectPattern(set, "[jackiemclean]", "aacceein");
457 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
458 set.addAll(set2);
459 expectPairs(set, "aacehort");
460 doAssert(set.containsAll(set2) == true, "set should contain all the elements in set2");
461
462 // Try retaining an set of elements contained in another set (intersection)
463 UnicodeSet set3;
464 expectPattern(set3, "[a-c]", "ac");
465 doAssert(set.containsAll(set3) == false, "set doesn't contain all the elements in set3");
466 set3.remove(0x0062);
467 expectPairs(set3, "aacc");
468 doAssert(set.containsAll(set3) == true, "set should contain all the elements in set3");
469 set.retainAll(set3);
470 expectPairs(set, "aacc");
471 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
472 doAssert(set.containsAll(set3) == true, "set should contain all the elements in set3");
473 set.clear();
474 doAssert(set.size() != set3.size(), "set.size() != set3.size()");
475
476 // Test commutativity
477 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
478 expectPattern(set2, "[jackiemclean]", "aacceein");
479 set.addAll(set2);
480 expectPairs(set, "aacehort");
481 doAssert(set.containsAll(set2) == true, "set should contain all the elements in set2");
482
483
484
485
486 }
487
488 /**
489 * Make sure minimal representation is maintained.
490 */
TestMinimalRep()491 void UnicodeSetTest::TestMinimalRep() {
492 UErrorCode status = U_ZERO_ERROR;
493 // This is pretty thoroughly tested by checkCanonicalRep()
494 // run against the exhaustive operation results. Use the code
495 // here for debugging specific spot problems.
496
497 // 1 overlap against 2
498 UnicodeSet set("[h-km-q]", status);
499 if (U_FAILURE(status)) { errln("FAIL"); return; }
500 UnicodeSet set2("[i-o]", status);
501 if (U_FAILURE(status)) { errln("FAIL"); return; }
502 set.addAll(set2);
503 expectPairs(set, "hq");
504 // right
505 set.applyPattern("[a-m]", status);
506 if (U_FAILURE(status)) { errln("FAIL"); return; }
507 set2.applyPattern("[e-o]", status);
508 if (U_FAILURE(status)) { errln("FAIL"); return; }
509 set.addAll(set2);
510 expectPairs(set, "ao");
511 // left
512 set.applyPattern("[e-o]", status);
513 if (U_FAILURE(status)) { errln("FAIL"); return; }
514 set2.applyPattern("[a-m]", status);
515 if (U_FAILURE(status)) { errln("FAIL"); return; }
516 set.addAll(set2);
517 expectPairs(set, "ao");
518 // 1 overlap against 3
519 set.applyPattern("[a-eg-mo-w]", status);
520 if (U_FAILURE(status)) { errln("FAIL"); return; }
521 set2.applyPattern("[d-q]", status);
522 if (U_FAILURE(status)) { errln("FAIL"); return; }
523 set.addAll(set2);
524 expectPairs(set, "aw");
525 }
526
TestAPI()527 void UnicodeSetTest::TestAPI() {
528 UErrorCode status = U_ZERO_ERROR;
529 // default ct
530 UnicodeSet set;
531 if (!set.isEmpty() || set.getRangeCount() != 0) {
532 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
533 set);
534 }
535
536 // clear(), isEmpty()
537 set.add(0x0061);
538 if (set.isEmpty()) {
539 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
540 set);
541 }
542 set.clear();
543 if (!set.isEmpty()) {
544 errln((UnicodeString)"FAIL, set should be empty but isn't: " +
545 set);
546 }
547
548 // size()
549 set.clear();
550 if (set.size() != 0) {
551 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
552 ": " + set);
553 }
554 set.add(0x0061);
555 if (set.size() != 1) {
556 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
557 ": " + set);
558 }
559 set.add(0x0031, 0x0039);
560 if (set.size() != 10) {
561 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
562 ": " + set);
563 }
564
565 // contains(first, last)
566 set.clear();
567 set.applyPattern("[A-Y 1-8 b-d l-y]", status);
568 if (U_FAILURE(status)) { errln("FAIL"); return; }
569 for (int32_t i = 0; i<set.getRangeCount(); ++i) {
570 UChar32 a = set.getRangeStart(i);
571 UChar32 b = set.getRangeEnd(i);
572 if (!set.contains(a, b)) {
573 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
574 " but doesn't: " + set);
575 }
576 if (set.contains((UChar32)(a-1), b)) {
577 errln((UnicodeString)"FAIL, shouldn't contain " +
578 (unsigned short)(a-1) + '-' + (unsigned short)b +
579 " but does: " + set);
580 }
581 if (set.contains(a, (UChar32)(b+1))) {
582 errln((UnicodeString)"FAIL, shouldn't contain " +
583 (unsigned short)a + '-' + (unsigned short)(b+1) +
584 " but does: " + set);
585 }
586 }
587
588 // Ported InversionList test.
589 UnicodeSet a((UChar32)3,(UChar32)10);
590 UnicodeSet b((UChar32)7,(UChar32)15);
591 UnicodeSet c;
592
593 logln((UnicodeString)"a [3-10]: " + a);
594 logln((UnicodeString)"b [7-15]: " + b);
595 c = a;
596 c.addAll(b);
597 UnicodeSet exp((UChar32)3,(UChar32)15);
598 if (c == exp) {
599 logln((UnicodeString)"c.set(a).add(b): " + c);
600 } else {
601 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
602 }
603 c.complement();
604 exp.set((UChar32)0, (UChar32)2);
605 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
606 if (c == exp) {
607 logln((UnicodeString)"c.complement(): " + c);
608 } else {
609 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
610 }
611 c.complement();
612 exp.set((UChar32)3, (UChar32)15);
613 if (c == exp) {
614 logln((UnicodeString)"c.complement(): " + c);
615 } else {
616 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
617 }
618 c = a;
619 c.complementAll(b);
620 exp.set((UChar32)3,(UChar32)6);
621 exp.add((UChar32)11,(UChar32) 15);
622 if (c == exp) {
623 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
624 } else {
625 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
626 }
627
628 exp = c;
629 bitsToSet(setToBits(c), c);
630 if (c == exp) {
631 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
632 } else {
633 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
634 }
635
636 // Additional tests for coverage JB#2118
637 //UnicodeSet::complement(class UnicodeString const &)
638 //UnicodeSet::complementAll(class UnicodeString const &)
639 //UnicodeSet::containsNone(class UnicodeSet const &)
640 //UnicodeSet::containsNone(long,long)
641 //UnicodeSet::containsSome(class UnicodeSet const &)
642 //UnicodeSet::containsSome(long,long)
643 //UnicodeSet::removeAll(class UnicodeString const &)
644 //UnicodeSet::retain(long)
645 //UnicodeSet::retainAll(class UnicodeString const &)
646 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
647 //UnicodeSetIterator::getString()
648 set.clear();
649 set.complement("ab");
650 exp.applyPattern("[{ab}]", status);
651 if (U_FAILURE(status)) { errln("FAIL"); return; }
652 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
653
654 UnicodeSetIterator iset(set);
655 if (!iset.next() || !iset.isString()) {
656 errln("FAIL: UnicodeSetIterator::next/isString");
657 } else if (iset.getString() != "ab") {
658 errln("FAIL: UnicodeSetIterator::getString");
659 }
660
661 set.add(u'a', u'z');
662 set.complementAll("alan");
663 exp.applyPattern("[{ab}b-kmo-z]", status);
664 if (U_FAILURE(status)) { errln("FAIL"); return; }
665 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
666
667 exp.applyPattern("[a-z]", status);
668 if (U_FAILURE(status)) { errln("FAIL"); return; }
669 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
670 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
671 exp.applyPattern("[aln]", status);
672 if (U_FAILURE(status)) { errln("FAIL"); return; }
673 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
674 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
675
676 if (set.containsNone(u'a', u'z')) {
677 errln("FAIL: containsNone(UChar32, UChar32)");
678 }
679 if (!set.containsSome(u'a', u'z')) {
680 errln("FAIL: containsSome(UChar32, UChar32)");
681 }
682 if (!set.containsNone(u'A', u'Z')) {
683 errln("FAIL: containsNone(UChar32, UChar32)");
684 }
685 if (set.containsSome(u'A', u'Z')) {
686 errln("FAIL: containsSome(UChar32, UChar32)");
687 }
688
689 set.removeAll("liu");
690 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
691 if (U_FAILURE(status)) { errln("FAIL"); return; }
692 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
693
694 set.retainAll("star");
695 exp.applyPattern("[rst]", status);
696 if (U_FAILURE(status)) { errln("FAIL"); return; }
697 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
698
699 set.retain(u's');
700 exp.applyPattern("[s]", status);
701 if (U_FAILURE(status)) { errln("FAIL"); return; }
702 if (set != exp) { errln("FAIL: retain('s')"); return; }
703
704 // ICU 2.6 coverage tests
705 // public final UnicodeSet retain(String s);
706 // public final UnicodeSet remove(int c);
707 // public final UnicodeSet remove(String s);
708 // public int hashCode();
709 set.applyPattern(u"[a-z{ab}{cd}]", status);
710 if (U_FAILURE(status)) { errln("FAIL"); return; }
711 set.retain(u"cd");
712 exp.applyPattern(u"[{cd}]", status);
713 if (U_FAILURE(status)) { errln("FAIL"); return; }
714 if (set != exp) { errln("FAIL: (with cd).retain(\"cd\")"); return; }
715
716 set.applyPattern(u"[a-z{ab}{yz}]", status);
717 if (U_FAILURE(status)) { errln("FAIL"); return; }
718 set.retain(u"cd");
719 exp.clear();
720 if (set != exp) { errln("FAIL: (without cd).retain(\"cd\")"); return; }
721
722 set.applyPattern(u"[a-z{ab}{cd}]", status);
723 if (U_FAILURE(status)) { errln("FAIL"); return; }
724 set.remove(u'c');
725 exp.applyPattern(u"[abd-z{ab}{cd}]", status);
726 if (set != exp) { errln("FAIL: remove('c')"); return; }
727
728 set.remove(u"cd");
729 exp.applyPattern(u"[abd-z{ab}]", status);
730 if (U_FAILURE(status)) { errln("FAIL"); return; }
731 if (set != exp) { errln("FAIL: remove(\"cd\")"); return; }
732
733 set.applyPattern("[s]", status);
734 if (U_FAILURE(status)) { errln("FAIL"); return; }
735 uint16_t buf[32];
736 int32_t slen = set.serialize(buf, UPRV_LENGTHOF(buf), status);
737 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
738 if (slen != 3 || buf[0] != 2 || buf[1] != u's' || buf[2] != u't') {
739 errln("FAIL: serialize");
740 return;
741 }
742
743 // Conversions to and from USet
744 UnicodeSet *uniset = &set;
745 USet *uset = uniset->toUSet();
746 TEST_ASSERT((void *)uset == (void *)uniset);
747 UnicodeSet *setx = UnicodeSet::fromUSet(uset);
748 TEST_ASSERT((void *)setx == (void *)uset);
749 const UnicodeSet *constSet = uniset;
750 const USet *constUSet = constSet->toUSet();
751 TEST_ASSERT((void *)constUSet == (void *)constSet);
752 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
753 TEST_ASSERT((void *)constSetx == (void *)constUSet);
754
755 // span(UnicodeString) and spanBack(UnicodeString) convenience methods
756 UnicodeString longString=u"aaaaaaaaaabbbbbbbbbbcccccccccc";
757 UnicodeSet ac(0x61, 0x63);
758 ac.remove(0x62).freeze();
759 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
760 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
761 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
762 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
763 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
764 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
765 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
766 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
767 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
768 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
769 ) {
770 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
771 }
772 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
773 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
774 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
775 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
776 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
777 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
778 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
779 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
780 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
781 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
782 ) {
783 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
784 }
785 }
786
TestIteration()787 void UnicodeSetTest::TestIteration() {
788 UErrorCode ec = U_ZERO_ERROR;
789 int i = 0;
790 int outerLoop;
791
792 // 6 code points, 3 ranges, 2 strings, 8 total elements
793 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"
794 UnicodeSet set(u"[zabyc\\U0001abcd{str1}{str2}]", ec);
795 TEST_ASSERT_SUCCESS(ec);
796 UnicodeSetIterator it(set);
797
798 for (outerLoop=0; outerLoop<3; outerLoop++) {
799 // Run the test multiple times, to check that iterator.reset() is working.
800 for (i=0; i<10; i++) {
801 UBool nextv = it.next();
802 UBool isString = it.isString();
803 int32_t codePoint = it.getCodepoint();
804 //int32_t codePointEnd = it.getCodepointEnd();
805 UnicodeString s = it.getString();
806 switch (i) {
807 case 0:
808 TEST_ASSERT(nextv == true);
809 TEST_ASSERT(isString == false);
810 TEST_ASSERT(codePoint==0x61);
811 TEST_ASSERT(s == "a");
812 break;
813 case 1:
814 TEST_ASSERT(nextv == true);
815 TEST_ASSERT(isString == false);
816 TEST_ASSERT(codePoint==0x62);
817 TEST_ASSERT(s == "b");
818 break;
819 case 2:
820 TEST_ASSERT(nextv == true);
821 TEST_ASSERT(isString == false);
822 TEST_ASSERT(codePoint==0x63);
823 TEST_ASSERT(s == "c");
824 break;
825 case 3:
826 TEST_ASSERT(nextv == true);
827 TEST_ASSERT(isString == false);
828 TEST_ASSERT(codePoint==0x79);
829 TEST_ASSERT(s == "y");
830 break;
831 case 4:
832 TEST_ASSERT(nextv == true);
833 TEST_ASSERT(isString == false);
834 TEST_ASSERT(codePoint==0x7a);
835 TEST_ASSERT(s == "z");
836 break;
837 case 5:
838 TEST_ASSERT(nextv == true);
839 TEST_ASSERT(isString == false);
840 TEST_ASSERT(codePoint==0x1abcd);
841 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
842 break;
843 case 6:
844 TEST_ASSERT(nextv == true);
845 TEST_ASSERT(isString == true);
846 TEST_ASSERT(s == "str1");
847 break;
848 case 7:
849 TEST_ASSERT(nextv == true);
850 TEST_ASSERT(isString == true);
851 TEST_ASSERT(s == "str2");
852 break;
853 case 8:
854 TEST_ASSERT(nextv == false);
855 break;
856 case 9:
857 TEST_ASSERT(nextv == false);
858 break;
859 }
860 }
861 it.reset(); // prepare to run the iteration again.
862 }
863 }
864
865
866
867
TestStrings()868 void UnicodeSetTest::TestStrings() {
869 UErrorCode ec = U_ZERO_ERROR;
870
871 UnicodeSet* testList[] = {
872 UnicodeSet::createFromAll("abc"),
873 new UnicodeSet("[a-c]", ec),
874
875 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
876 new UnicodeSet("[{ll}{ch}a-z]", ec),
877
878 UnicodeSet::createFrom("ab}c"),
879 new UnicodeSet("[{ab\\}c}]", ec),
880
881 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
882 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
883
884 nullptr
885 };
886
887 if (U_FAILURE(ec)) {
888 errln("FAIL: couldn't construct test sets");
889 }
890 assertFalse("[a-c].hasStrings()", testList[0]->hasStrings());
891 assertTrue("[{ll}{ch}a-z].hasStrings()", testList[2]->hasStrings());
892
893 for (int32_t i = 0; testList[i] != nullptr; i+=2) {
894 if (U_SUCCESS(ec)) {
895 UnicodeString pat0, pat1;
896 testList[i]->toPattern(pat0, true);
897 testList[i+1]->toPattern(pat1, true);
898 if (*testList[i] == *testList[i+1]) {
899 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
900 } else {
901 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
902 }
903 }
904 delete testList[i];
905 delete testList[i+1];
906 }
907 }
908
909 /**
910 * Test the [:Latin:] syntax.
911 */
TestScriptSet()912 void UnicodeSetTest::TestScriptSet() {
913 expectContainment(u"[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1"));
914
915 expectContainment(u"[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
916
917 /* Jitterbug 1423 */
918 expectContainment(u"[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
919
920 }
921
/**
 * Test property-based set patterns ([:prop:], \p{...}, \P{...}, \N{...}),
 * together with a few regex-compatibility and isolated-surrogate patterns.
 * DATA is a flat list of triples: the pattern, the characters that must be
 * in the resulting set, and the characters that must not be in it.
 */
void UnicodeSetTest::TestPropertySet() {
    static const char* const DATA[] = {
        // Pattern, Chars IN, Chars NOT in

        "[:Latin:]",
        "aA",
        "\\u0391\\u03B1",

        "[\\p{Greek}]",
        "\\u0391\\u03B1",
        "aA",

        "\\P{ GENERAL Category = upper case letter }",
        "abc",
        "ABC",

#if !UCONFIG_NO_NORMALIZATION
        // Combining class: @since ICU 2.2
        // Check both symbolic and numeric
        "\\p{ccc=Nukta}",
        "\\u0ABC",
        "abc",

        "\\p{Canonical Combining Class = 11}",
        "\\u05B1",
        "\\u05B2",

        "[:c c c = iota subscript :]",
        "\\u0345",
        "xyz",
#endif

        // Bidi class: @since ICU 2.2
        "\\p{bidiclass=lefttoright}",
        "abc",
        "\\u0671\\u0672",

        // Binary properties: @since ICU 2.2
        "\\p{ideographic}",
        "\\u4E0A",
        "x",

        "[:math=false:]",
        "q)*(",
        // weiv: )(and * were removed from math in Unicode 4.0.1
        //"(*+)",
        "+<>^",

        // JB#1767 \N{}, \p{ASCII}
        "[:Ascii:]",
        "abc\\u0000\\u007F",
        "\\u0080\\u4E00",

        "[\\N{ latin small letter a }[:name= latin small letter z:]]",
        "az",
        "qrs",

        // JB#2015
        "[:any:]",
        "a\\U0010FFFF",
        "",

        "[:nv=0.5:]",
        "\\u00BD\\u0F2A",
        "\\u00BC",

        // JB#2653: Age
        "[:Age=1.1:]",
        "\\u03D6", // 1.1
        "\\u03D8\\u03D9", // 3.2

        "[:Age=3.1:]",
        "\\u1800\\u3400\\U0002f800",
        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",

        // JB#2350: Case_Sensitive
        "[:Case Sensitive:]",
        "A\\u1FFC\\U00010410",
        ";\\u00B4\\U00010500",

        // JB#2832: C99-compatibility props
        "[:blank:]",
        " \\u0009",
        "1-9A-Z",

        "[:graph:]",
        "19AZ",
        " \\u0003\\u0007\\u0009\\u000A\\u000D",

        "[:punct:]",
        "!@#%&*()[]{}-_\\/;:,.?'\"",
        "09azAZ",

        "[:xdigit:]",
        "09afAF",
        "gG!",

        // Regex compatibility test
        "[-b]", // leading '-' is literal
        "-b",
        "ac",

        "[^-b]", // leading '-' is literal
        "ac",
        "-b",

        "[b-]", // trailing '-' is literal
        "-b",
        "ac",

        "[^b-]", // trailing '-' is literal
        "ac",
        "-b",

        "[a-b-]", // trailing '-' is literal
        "ab-",
        "c=",

        "[[a-q]&[p-z]-]", // trailing '-' is literal
        "pq-",
        "or=",

        "[\\s|\\)|:|$|\\>]", // from regex tests
        "s|):$>",
        "abc",

        "[\\uDC00cd]", // JB#2906: isolated trail at start
        "cd\\uDC00",
        "ab\\uD800\\U00010000",

        "[ab\\uD800]", // JB#2906: isolated lead at end
        "ab\\uD800",
        "cd\\uDC00\\U00010000",

        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
        "abcd\\uD800",
        "ef\\uDC00\\U00010000",

        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
        "abcd\\uDC00",
        "ef\\uD800\\U00010000",

#if !UCONFIG_NO_NORMALIZATION
        "[:^lccc=0:]", // Lead canonical class
        "\\u0300\\u0301",
        "abcd\\u00c0\\u00c5",

        "[:^tccc=0:]", // Trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
        "",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",

        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
        "\\u0F73\\u0F75\\u0F81",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",
#endif /* !UCONFIG_NO_NORMALIZATION */

        "[:Assigned:]",
        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
        "\\u0558\\uFDD3\\uFFFE\\U00050005",

        // Script_Extensions, new in Unicode 6.0
        "[:scx=Arab:]",
        "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
        "\\u088F\\uFDEF\\uFEFE",

        // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
        // so scx-sc is missing U+FDF2.
        "[[:Script_Extensions=Arabic:]-[:Arab:]]",
        "\\u0640\\u064B\\u0650\\u0655",
        "\\uFDF2"
    };

    static const int32_t DATA_LEN = UPRV_LENGTHOF(DATA);

    // Walk the table one triple at a time.  The pattern is converted with
    // US_INV, while the two character lists go through CharsToUnicodeString()
    // so that their backslash escapes are turned into the actual characters.
    for (int32_t i=0; i<DATA_LEN; i+=3) {
        expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
                          CharsToUnicodeString(DATA[i+2]));
    }
}
1112
1113 /**
1114 * Test that Posix style character classes [:digit:], etc.
1115 * have the Unicode definitions from TR 18.
1116 */
TestPosixClasses()1117 void UnicodeSetTest::TestPosixClasses() {
1118 {
1119 UErrorCode status = U_ZERO_ERROR;
1120 UnicodeSet s1("[:alpha:]", status);
1121 UnicodeSet s2(u"\\p{Alphabetic}", status);
1122 TEST_ASSERT_SUCCESS(status);
1123 TEST_ASSERT(s1==s2);
1124 }
1125 {
1126 UErrorCode status = U_ZERO_ERROR;
1127 UnicodeSet s1("[:lower:]", status);
1128 UnicodeSet s2(u"\\p{lowercase}", status);
1129 TEST_ASSERT_SUCCESS(status);
1130 TEST_ASSERT(s1==s2);
1131 }
1132 {
1133 UErrorCode status = U_ZERO_ERROR;
1134 UnicodeSet s1("[:upper:]", status);
1135 UnicodeSet s2(u"\\p{Uppercase}", status);
1136 TEST_ASSERT_SUCCESS(status);
1137 TEST_ASSERT(s1==s2);
1138 }
1139 {
1140 UErrorCode status = U_ZERO_ERROR;
1141 UnicodeSet s1("[:punct:]", status);
1142 UnicodeSet s2(u"\\p{gc=Punctuation}", status);
1143 TEST_ASSERT_SUCCESS(status);
1144 TEST_ASSERT(s1==s2);
1145 }
1146 {
1147 UErrorCode status = U_ZERO_ERROR;
1148 UnicodeSet s1("[:digit:]", status);
1149 UnicodeSet s2(u"\\p{gc=DecimalNumber}", status);
1150 TEST_ASSERT_SUCCESS(status);
1151 TEST_ASSERT(s1==s2);
1152 }
1153 {
1154 UErrorCode status = U_ZERO_ERROR;
1155 UnicodeSet s1("[:xdigit:]", status);
1156 UnicodeSet s2(u"[\\p{DecimalNumber}\\p{HexDigit}]", status);
1157 TEST_ASSERT_SUCCESS(status);
1158 TEST_ASSERT(s1==s2);
1159 }
1160 {
1161 UErrorCode status = U_ZERO_ERROR;
1162 UnicodeSet s1("[:alnum:]", status);
1163 UnicodeSet s2(u"[\\p{Alphabetic}\\p{DecimalNumber}]", status);
1164 TEST_ASSERT_SUCCESS(status);
1165 TEST_ASSERT(s1==s2);
1166 }
1167 {
1168 UErrorCode status = U_ZERO_ERROR;
1169 UnicodeSet s1("[:space:]", status);
1170 UnicodeSet s2(u"\\p{Whitespace}", status);
1171 TEST_ASSERT_SUCCESS(status);
1172 TEST_ASSERT(s1==s2);
1173 }
1174 {
1175 UErrorCode status = U_ZERO_ERROR;
1176 UnicodeSet s1("[:blank:]", status);
1177 TEST_ASSERT_SUCCESS(status);
1178 UnicodeSet s2(u"[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]",
1179 status);
1180 TEST_ASSERT_SUCCESS(status);
1181 TEST_ASSERT(s1==s2);
1182 }
1183 {
1184 UErrorCode status = U_ZERO_ERROR;
1185 UnicodeSet s1("[:cntrl:]", status);
1186 TEST_ASSERT_SUCCESS(status);
1187 UnicodeSet s2(u"\\p{Control}", status);
1188 TEST_ASSERT_SUCCESS(status);
1189 TEST_ASSERT(s1==s2);
1190 }
1191 {
1192 UErrorCode status = U_ZERO_ERROR;
1193 UnicodeSet s1("[:graph:]", status);
1194 TEST_ASSERT_SUCCESS(status);
1195 UnicodeSet s2(u"[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
1196 TEST_ASSERT_SUCCESS(status);
1197 TEST_ASSERT(s1==s2);
1198 }
1199 {
1200 UErrorCode status = U_ZERO_ERROR;
1201 UnicodeSet s1("[:print:]", status);
1202 TEST_ASSERT_SUCCESS(status);
1203 UnicodeSet s2(u"[[:graph:][:blank:]-[\\p{Control}]]", status);
1204 TEST_ASSERT_SUCCESS(status);
1205 TEST_ASSERT(s1==s2);
1206 }
1207 }
1208 /**
1209 * Test cloning of UnicodeSet. For C++, we test the copy constructor.
1210 */
TestClone()1211 void UnicodeSetTest::TestClone() {
1212 UErrorCode ec = U_ZERO_ERROR;
1213 UnicodeSet s("[abcxyz]", ec);
1214 UnicodeSet t(s);
1215 expectContainment(t, "abc", "def");
1216 }
1217
1218 /**
1219 * Test the indexOf() and charAt() methods.
1220 */
TestIndexOf()1221 void UnicodeSetTest::TestIndexOf() {
1222 UErrorCode ec = U_ZERO_ERROR;
1223 UnicodeSet set("[a-cx-y3578]", ec);
1224 if (U_FAILURE(ec)) {
1225 errln("FAIL: UnicodeSet constructor");
1226 return;
1227 }
1228 for (int32_t i=0; i<set.size(); ++i) {
1229 UChar32 c = set.charAt(i);
1230 if (set.indexOf(c) != i) {
1231 errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1232 i, c, set.indexOf(c));
1233 }
1234 }
1235 UChar32 c = set.charAt(set.size());
1236 if (c != -1) {
1237 errln("FAIL: charAt(<out of range>) = %X", c);
1238 }
1239 int32_t j = set.indexOf(u'q');
1240 if (j != -1) {
1241 errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1242 }
1243 }
1244
/**
 * Test closure API.
 *
 * DATA is a nullptr-terminated list of triples: a selector, an input
 * pattern, and the pattern of the set that is expected after calling
 * closeOver(selector) on the input set.  Afterwards the pattern-based API
 * (applyPattern() and the constructor taking options) is exercised with the
 * USET_CASE_INSENSITIVE and USET_ADD_CASE_MAPPINGS options.
 */
void UnicodeSetTest::TestCloseOver() {
    // Each selector is stored as a one-element char array so that it can
    // share the const char* table with the patterns; only element [0] is
    // read back in the loop below.
    static constexpr char CASE[] = {(char)USET_CASE_INSENSITIVE};
    static constexpr char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
    static constexpr char SIMPLE_CASE_INSENSITIVE[] = {(char)USET_SIMPLE_CASE_INSENSITIVE};
    static const char* DATA[] = {
        // selector, input, output
        CASE,
        "[aq\\u00DF{Bc}{bC}{Fi}]",
        "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1

        SIMPLE_CASE_INSENSITIVE,
        "[aq\\u00DF{Bc}{bC}{Fi}]",
        "[aAqQ\\u00DF\\u1E9E{bc}{fi}]",

        CASE,
        "[\\u01F1]", // 'DZ'
        "[\\u01F1\\u01F2\\u01F3]",

        SIMPLE_CASE_INSENSITIVE,
        "[\\u01F1]", // 'DZ'
        "[\\u01F1\\u01F2\\u01F3]",

        CASE,
        "[\\u1FB4]",
        "[\\u1FB4{\\u03AC\\u03B9}]",

        SIMPLE_CASE_INSENSITIVE,
        "[\\u1FB4]",
        "[\\u1FB4]",

        CASE,
        "[{F\\uFB01}]",
        "[\\uFB03{ffi}]",

        CASE, // make sure binary search finds limits
        "[a\\uFF3A]",
        "[aA\\uFF3A\\uFF5A]",

        CASE,
        "[a-z]","[A-Za-z\\u017F\\u212A]",

        SIMPLE_CASE_INSENSITIVE,
        "[a-z]","[A-Za-z\\u017F\\u212A]",

        CASE,
        "[abc]","[A-Ca-c]",
        CASE,
        "[ABC]","[A-Ca-c]",

        CASE, "[i]", "[iI]",

        CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
        CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot

        CASE, "[\\u0131]",          "[\\u0131]", // dotless i

        CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",

        CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas

        CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas

        CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",

        CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",

        CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
        CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",

        CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",

        CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table

        CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table

#if !UCONFIG_NO_FILE_IO
        CASE_MAPPINGS,
        "[aq\\u00DF{Bc}{bC}{Fi}]",
        "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
#endif

        CASE_MAPPINGS,
        "[\\u01F1]", // 'DZ'
        "[\\u01F1\\u01F2\\u01F3]",

        CASE_MAPPINGS,
        "[a-z]",
        "[A-Za-z]",

        nullptr
    };

    UnicodeSet s;
    UnicodeSet t;
    UnicodeString buf;
    // One triple per iteration: s becomes the closed-over input set and
    // t the expected set.
    for (int32_t i=0; DATA[i]!=nullptr; i+=3) {
        int32_t selector = DATA[i][0];
        UnicodeString pat(DATA[i+1], -1, US_INV);
        UnicodeString exp(DATA[i+2], -1, US_INV);

        // A single error code covers both pattern parses; it is checked
        // once, after the expected set has been parsed.
        UErrorCode ec = U_ZERO_ERROR;
        s.applyPattern(pat, ec);
        s.closeOver(selector);
        t.applyPattern(exp, ec);
        if (U_FAILURE(ec)) {
            errln("FAIL: applyPattern failed");
            continue;
        }
        if (s == t) {
            logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
        } else {
            dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
                      s.toPattern(buf, true) + ", expected " + exp);
        }
    }

    // Test the pattern API
    // NOTE(review): ec is not reset between the three checks below, so a
    // failure in an earlier check presumably makes the later ones report a
    // failure as well -- confirm that this is intended.
    UErrorCode ec = U_ZERO_ERROR;
    s.applyPattern("[abc]", USET_CASE_INSENSITIVE, nullptr, ec);
    if (U_FAILURE(ec)) {
        errln("FAIL: applyPattern failed");
    } else {
        expectContainment(s, "abcABC", "defDEF");
    }
    UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, nullptr, ec);
    if (U_FAILURE(ec)) {
        errln("FAIL: constructor failed");
    } else {
        expectContainment(v, "defDEF", "abcABC");
    }
    UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, nullptr, ec);
    if (U_FAILURE(ec)) {
        errln("FAIL: construct w/case mappings failed");
    } else {
        expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
    }
}
1385
1386 namespace {
1387
addIfAbsent(const std::unordered_multimap<UChar32,UChar32> & closure,UChar32 c,UChar32 t,std::unordered_multimap<UChar32,UChar32> & additions)1388 void addIfAbsent(const std::unordered_multimap<UChar32, UChar32> &closure, UChar32 c, UChar32 t,
1389 std::unordered_multimap<UChar32, UChar32> &additions) {
1390 for (auto it = closure.find(c);; ++it) {
1391 if (it == closure.end() || it->first != c) {
1392 // absent
1393 additions.insert({c, t});
1394 break;
1395 } else if (it->second == t) {
1396 // present
1397 break;
1398 }
1399 }
1400 }
1401
1402 } // namespace
1403
TestCloseOverSimpleCaseFolding()1404 void UnicodeSetTest::TestCloseOverSimpleCaseFolding() {
1405 IcuTestErrorCode errorCode(*this, "TestCloseOverSimpleCaseFolding");
1406 const UnicodeSet *sensitive =
1407 UnicodeSet::fromUSet(u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode));
1408 if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(UCHAR_CASE_SENSITIVE) failed")) {
1409 return;
1410 }
1411 // Compute the scf=Simple_Case_Folding closure:
1412 // For each scf(c)=t, start with mappings c->t and t->c.
1413 std::unordered_multimap<UChar32, UChar32> closure;
1414 UnicodeSetIterator iter(*sensitive);
1415 while (iter.next()) {
1416 UChar32 c = iter.getCodepoint();
1417 UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT);
1418 if (scfChar != c) {
1419 closure.insert({c, scfChar});
1420 closure.insert({scfChar, c});
1421 }
1422 }
1423 // Complete the closure: Add mappings of mappings.
1424 for (;;) {
1425 std::unordered_multimap<UChar32, UChar32> additions;
1426 // for each mapping c->t
1427 for (auto mapping : closure) {
1428 UChar32 c = mapping.first;
1429 UChar32 t = mapping.second;
1430 // enumerate each t->u
1431 for (auto it = closure.find(t); it != closure.end() && it->first == t; ++it) {
1432 UChar32 u = it->second;
1433 if (u != c) {
1434 addIfAbsent(closure, c, u, additions);
1435 addIfAbsent(closure, u, c, additions);
1436 }
1437 }
1438 }
1439 if (additions.empty()) {
1440 break; // The closure is complete.
1441 }
1442 closure.insert(additions.begin(), additions.end());
1443 }
1444 // Compare closeOver(USET_SIMPLE_CASE_INSENSITIVE) with an unoptimized implementation.
1445 // Here we focus on single code points as input.
1446 // Other examples, including strings, are tested in TestCloseOver().
1447 int32_t errors = 0;
1448 iter.reset();
1449 UnicodeSet set, expected;
1450 while (iter.next()) {
1451 UChar32 c = iter.getCodepoint();
1452 // closeOver()
1453 set.clear().add(c);
1454 set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
1455 // From-first-principles implementation.
1456 expected.clear().add(c);
1457 for (auto it = closure.find(c); it != closure.end() && it->first == c; ++it) {
1458 expected.add(it->second);
1459 }
1460 // compare
1461 if (!checkEqual(expected, set, "closeOver() vs. test impl")) {
1462 errln(" c=U+%04X", c);
1463 if (++errors == 10) {
1464 break;
1465 }
1466 }
1467 }
1468 }
1469
TestCloseOverLargeSets()1470 void UnicodeSetTest::TestCloseOverLargeSets() {
1471 IcuTestErrorCode errorCode(*this, "TestCloseOverLargeSets");
1472 // Check that an optimization for large sets does not change the result.
1473
1474 // Most code points except ones that are boring for case mappings.
1475 UnicodeSet manyCp(u"[^[:C:][:Ideographic:][:Hang:]]", errorCode);
1476 // Main Unihan block.
1477 constexpr UChar32 LARGE_START = 0x4E00;
1478 constexpr UChar32 LARGE_END = 0x9FFF;
1479
1480 static constexpr int32_t OPTIONS[] = {
1481 USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE
1482 };
1483 UnicodeSet input, small, large;
1484 for (int32_t option : OPTIONS) {
1485 UnicodeSetIterator iter(manyCp);
1486 while (iter.next()) {
1487 UChar32 c = iter.getCodepoint();
1488 input.clear().add(c);
1489 small = input;
1490 small.closeOver(option);
1491 large = input;
1492 large.add(LARGE_START, LARGE_END);
1493 large.closeOver(option);
1494 large.remove(LARGE_START, LARGE_END);
1495 if (!checkEqual(small, large, "small != large")) {
1496 errln(" option=%d c=U+%04X", option, c);
1497 break;
1498 }
1499 }
1500 }
1501 }
1502
TestEscapePattern()1503 void UnicodeSetTest::TestEscapePattern() {
1504 const char pattern[] =
1505 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1506 const char exp[] =
1507 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1508 // We test this with two passes; in the second pass we
1509 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,
1510 // this fails -- which is what we expect.
1511 for (int32_t pass=1; pass<=2; ++pass) {
1512 UErrorCode ec = U_ZERO_ERROR;
1513 UnicodeString pat(pattern, -1, US_INV);
1514 if (pass==2) {
1515 pat = pat.unescape();
1516 }
1517 // Pattern is only good for pass 1
1518 UBool isPatternValid = (pass==1);
1519
1520 UnicodeSet set(pat, ec);
1521 if (U_SUCCESS(ec) != isPatternValid){
1522 errln((UnicodeString)"FAIL: applyPattern(" +
1523 escape(pat) + ") => " +
1524 u_errorName(ec));
1525 continue;
1526 }
1527 if (U_FAILURE(ec)) {
1528 continue;
1529 }
1530 if (set.contains(u'\u0644')){
1531 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1532 }
1533
1534 UnicodeString newpat;
1535 set.toPattern(newpat, true);
1536 if (newpat == UnicodeString(exp, -1, US_INV)) {
1537 logln(escape(pat) + " => " + newpat);
1538 } else {
1539 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1540 }
1541
1542 for (int32_t i=0; i<set.getRangeCount(); ++i) {
1543 UnicodeString str("Range ");
1544 str.append((char16_t)(u'0' + i))
1545 .append(": ")
1546 .append((UChar32)set.getRangeStart(i))
1547 .append(" - ")
1548 .append((UChar32)set.getRangeEnd(i));
1549 str = str + " (" + set.getRangeStart(i) + " - " +
1550 set.getRangeEnd(i) + ")";
1551 if (set.getRangeStart(i) < 0) {
1552 errln((UnicodeString)"FAIL: " + escape(str));
1553 } else {
1554 logln(escape(str));
1555 }
1556 }
1557 }
1558 }
1559
expectRange(const UnicodeString & label,const UnicodeSet & set,UChar32 start,UChar32 end)1560 void UnicodeSetTest::expectRange(const UnicodeString& label,
1561 const UnicodeSet& set,
1562 UChar32 start, UChar32 end) {
1563 UnicodeSet exp(start, end);
1564 UnicodeString pat;
1565 if (set == exp) {
1566 logln(label + " => " + set.toPattern(pat, true));
1567 } else {
1568 UnicodeString xpat;
1569 errln((UnicodeString)"FAIL: " + label + " => " +
1570 set.toPattern(pat, true) +
1571 ", expected " + exp.toPattern(xpat, true));
1572 }
1573 }
1574
TestInvalidCodePoint()1575 void UnicodeSetTest::TestInvalidCodePoint() {
1576
1577 const UChar32 DATA[] = {
1578 // Test range Expected range
1579 0, 0x10FFFF, 0, 0x10FFFF,
1580 (UChar32)-1, 8, 0, 8,
1581 8, 0x110000, 8, 0x10FFFF
1582 };
1583 const int32_t DATA_LENGTH = UPRV_LENGTHOF(DATA);
1584
1585 UnicodeString pat;
1586 int32_t i;
1587
1588 for (i=0; i<DATA_LENGTH; i+=4) {
1589 UChar32 start = DATA[i];
1590 UChar32 end = DATA[i+1];
1591 UChar32 xstart = DATA[i+2];
1592 UChar32 xend = DATA[i+3];
1593
1594 // Try various API using the test code points
1595
1596 UnicodeSet set(start, end);
1597 expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1598 set, xstart, xend);
1599
1600 set.clear();
1601 set.set(start, end);
1602 expectRange((UnicodeString)"set(" + start + "," + end + ")",
1603 set, xstart, xend);
1604
1605 UBool b = set.contains(start);
1606 b = set.contains(start, end);
1607 b = set.containsNone(start, end);
1608 b = set.containsSome(start, end);
1609 (void)b; // Suppress set but not used warning.
1610
1611 /*int32_t index = set.indexOf(start);*/
1612
1613 set.clear();
1614 set.add(start);
1615 set.add(start, end);
1616 expectRange((UnicodeString)"add(" + start + "," + end + ")",
1617 set, xstart, xend);
1618
1619 set.set(0, 0x10FFFF);
1620 set.retain(start, end);
1621 expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1622 set, xstart, xend);
1623 set.retain(start);
1624
1625 set.set(0, 0x10FFFF);
1626 set.remove(start);
1627 set.remove(start, end);
1628 set.complement();
1629 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1630 set, xstart, xend);
1631
1632 set.set(0, 0x10FFFF);
1633 set.complement(start, end);
1634 set.complement();
1635 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1636 set, xstart, xend);
1637 set.complement(start);
1638 }
1639
1640 const UChar32 DATA2[] = {
1641 0,
1642 0x10FFFF,
1643 (UChar32)-1,
1644 0x110000
1645 };
1646 const int32_t DATA2_LENGTH = UPRV_LENGTHOF(DATA2);
1647
1648 for (i=0; i<DATA2_LENGTH; ++i) {
1649 UChar32 c = DATA2[i], end = 0x10FFFF;
1650 UBool valid = (c >= 0 && c <= 0x10FFFF);
1651
1652 UnicodeSet set(0, 0x10FFFF);
1653
1654 // For single-codepoint contains, invalid codepoints are NOT contained
1655 UBool b = set.contains(c);
1656 if (b == valid) {
1657 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1658 ") = " + b);
1659 } else {
1660 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1661 ") = " + b);
1662 }
1663
1664 // For codepoint range contains, containsNone, and containsSome,
1665 // invalid or empty (start > end) ranges have UNDEFINED behavior.
1666 b = set.contains(c, end);
1667 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1668 "," + end + ") = " + b);
1669
1670 b = set.containsNone(c, end);
1671 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1672 "," + end + ") = " + b);
1673
1674 b = set.containsSome(c, end);
1675 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1676 "," + end + ") = " + b);
1677
1678 int32_t index = set.indexOf(c);
1679 if ((index >= 0) == valid) {
1680 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1681 ") = " + index);
1682 } else {
1683 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1684 ") = " + index);
1685 }
1686 }
1687 }
1688
1689 // Used by TestSymbolTable
1690 class TokenSymbolTable : public SymbolTable {
1691 public:
1692 Hashtable contents;
1693
TokenSymbolTable(UErrorCode & ec)1694 TokenSymbolTable(UErrorCode& ec) : contents(false, ec) {
1695 contents.setValueDeleter(uprv_deleteUObject);
1696 }
1697
~TokenSymbolTable()1698 ~TokenSymbolTable() {}
1699
1700 /**
1701 * (Non-SymbolTable API) Add the given variable and value to
1702 * the table. Variable should NOT contain leading '$'.
1703 */
add(const UnicodeString & var,const UnicodeString & value,UErrorCode & ec)1704 void add(const UnicodeString& var, const UnicodeString& value,
1705 UErrorCode& ec) {
1706 if (U_SUCCESS(ec)) {
1707 contents.put(var, new UnicodeString(value), ec);
1708 }
1709 }
1710
1711 /**
1712 * SymbolTable API
1713 */
lookup(const UnicodeString & s) const1714 virtual const UnicodeString* lookup(const UnicodeString& s) const override {
1715 return static_cast<const UnicodeString*>(contents.get(s));
1716 }
1717
1718 /**
1719 * SymbolTable API
1720 */
lookupMatcher(UChar32) const1721 virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const override {
1722 return nullptr;
1723 }
1724
1725 /**
1726 * SymbolTable API
1727 */
parseReference(const UnicodeString & text,ParsePosition & pos,int32_t limit) const1728 virtual UnicodeString parseReference(const UnicodeString& text,
1729 ParsePosition& pos, int32_t limit) const override {
1730 int32_t start = pos.getIndex();
1731 int32_t i = start;
1732 UnicodeString result;
1733 while (i < limit) {
1734 char16_t c = text.charAt(i);
1735 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1736 break;
1737 }
1738 ++i;
1739 }
1740 if (i == start) { // No valid name chars
1741 return result; // Indicate failure with empty string
1742 }
1743 pos.setIndex(i);
1744 text.extractBetween(start, i, result);
1745 return result;
1746 }
1747 };
1748
TestSymbolTable()1749 void UnicodeSetTest::TestSymbolTable() {
1750 // Multiple test cases can be set up here. Each test case
1751 // is terminated by null:
1752 // var, value, var, value,..., input pat., exp. output pat., null
1753 const char* DATA[] = {
1754 "us", "a-z", "[0-1$us]", "[0-1a-z]", nullptr,
1755 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", nullptr,
1756 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", nullptr,
1757 nullptr
1758 };
1759
1760 for (int32_t i=0; DATA[i]!=nullptr; ++i) {
1761 UErrorCode ec = U_ZERO_ERROR;
1762 TokenSymbolTable sym(ec);
1763 if (U_FAILURE(ec)) {
1764 errln("FAIL: couldn't construct TokenSymbolTable");
1765 continue;
1766 }
1767
1768 // Set up variables
1769 while (DATA[i+2] != nullptr) {
1770 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1771 if (U_FAILURE(ec)) {
1772 errln("FAIL: couldn't add to TokenSymbolTable");
1773 continue;
1774 }
1775 i += 2;
1776 }
1777
1778 // Input pattern and expected output pattern
1779 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1780 i += 2;
1781
1782 ParsePosition pos(0);
1783 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1784 if (U_FAILURE(ec)) {
1785 errln("FAIL: couldn't construct UnicodeSet");
1786 continue;
1787 }
1788
1789 // results
1790 if (pos.getIndex() != inpat.length()) {
1791 errln((UnicodeString)"Failed to read to end of string \""
1792 + inpat + "\": read to "
1793 + pos.getIndex() + ", length is "
1794 + inpat.length());
1795 }
1796
1797 UnicodeSet us2(exppat, ec);
1798 if (U_FAILURE(ec)) {
1799 errln("FAIL: couldn't construct expected UnicodeSet");
1800 continue;
1801 }
1802
1803 UnicodeString a, b;
1804 if (us != us2) {
1805 errln((UnicodeString)"Failed, got " + us.toPattern(a, true) +
1806 ", expected " + us2.toPattern(b, true));
1807 } else {
1808 logln((UnicodeString)"Ok, got " + us.toPattern(a, true));
1809 }
1810 }
1811 }
1812
TestSurrogate()1813 void UnicodeSetTest::TestSurrogate() {
1814 const char* DATA[] = {
1815 // These should all behave identically
1816 "[abc\\uD800\\uDC00]",
1817 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1818 "[abc\\U00010000]",
1819 0
1820 };
1821 for (int i=0; DATA[i] != 0; ++i) {
1822 UErrorCode ec = U_ZERO_ERROR;
1823 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1824 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1825 UnicodeSet set(str, ec);
1826 if (U_FAILURE(ec)) {
1827 errln("FAIL: UnicodeSet constructor");
1828 continue;
1829 }
1830 expectContainment(set,
1831 CharsToUnicodeString("abc\\U00010000"),
1832 CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1833 if (set.size() != 4) {
1834 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1835 set.size() + ", expected 4");
1836 }
1837
1838 {
1839 UErrorCode subErr = U_ZERO_ERROR;
1840 checkRoundTrip(set);
1841 checkSerializeRoundTrip(set, subErr);
1842 }
1843 }
1844 }
1845
TestExhaustive()1846 void UnicodeSetTest::TestExhaustive() {
1847 // exhaustive tests. Simulate UnicodeSets with integers.
1848 // That gives us very solid tests (except for large memory tests).
1849
1850 int32_t limit = 128;
1851
1852 UnicodeSet x, y, z, aa;
1853
1854 for (int32_t i = 0; i < limit; ++i) {
1855 bitsToSet(i, x);
1856 logln((UnicodeString)"Testing " + i + ", " + x);
1857 _testComplement(i, x, y);
1858
1859 UnicodeSet &toTest = bitsToSet(i, aa);
1860
1861 // AS LONG AS WE ARE HERE, check roundtrip
1862 checkRoundTrip(toTest);
1863 UErrorCode ec = U_ZERO_ERROR;
1864 checkSerializeRoundTrip(toTest, ec);
1865
1866 for (int32_t j = 0; j < limit; ++j) {
1867 _testAdd(i,j, x,y,z);
1868 _testXor(i,j, x,y,z);
1869 _testRetain(i,j, x,y,z);
1870 _testRemove(i,j, x,y,z);
1871 }
1872 }
1873 }
1874
_testComplement(int32_t a,UnicodeSet & x,UnicodeSet & z)1875 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1876 bitsToSet(a, x);
1877 z = x;
1878 z.complement();
1879 int32_t c = setToBits(z);
1880 if (c != (~a)) {
1881 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);
1882 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1883 }
1884 checkCanonicalRep(z, (UnicodeString)"complement " + a);
1885 }
1886
_testAdd(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1887 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1888 bitsToSet(a, x);
1889 bitsToSet(b, y);
1890 z = x;
1891 z.addAll(y);
1892 int32_t c = setToBits(z);
1893 if (c != (a | b)) {
1894 errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1895 errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1896 }
1897 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1898 }
1899
_testRetain(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1900 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1901 bitsToSet(a, x);
1902 bitsToSet(b, y);
1903 z = x;
1904 z.retainAll(y);
1905 int32_t c = setToBits(z);
1906 if (c != (a & b)) {
1907 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1908 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1909 }
1910 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1911 }
1912
_testRemove(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1913 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1914 bitsToSet(a, x);
1915 bitsToSet(b, y);
1916 z = x;
1917 z.removeAll(y);
1918 int32_t c = setToBits(z);
1919 if (c != (a &~ b)) {
1920 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1921 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1922 }
1923 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1924 }
1925
_testXor(int32_t a,int32_t b,UnicodeSet & x,UnicodeSet & y,UnicodeSet & z)1926 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1927 bitsToSet(a, x);
1928 bitsToSet(b, y);
1929 z = x;
1930 z.complementAll(y);
1931 int32_t c = setToBits(z);
1932 if (c != (a ^ b)) {
1933 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1934 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1935 }
1936 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1937 }
1938
1939 /**
1940 * Check that ranges are monotonically increasing and non-
1941 * overlapping.
1942 */
checkCanonicalRep(const UnicodeSet & set,const UnicodeString & msg)1943 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1944 int32_t n = set.getRangeCount();
1945 if (n < 0) {
1946 errln((UnicodeString)"FAIL result of " + msg +
1947 ": range count should be >= 0 but is " +
1948 n /*+ " for " + set.toPattern())*/);
1949 return;
1950 }
1951 UChar32 last = 0;
1952 for (int32_t i=0; i<n; ++i) {
1953 UChar32 start = set.getRangeStart(i);
1954 UChar32 end = set.getRangeEnd(i);
1955 if (start > end) {
1956 errln((UnicodeString)"FAIL result of " + msg +
1957 ": range " + (i+1) +
1958 " start > end: " + (int)start + ", " + (int)end +
1959 " for " + set);
1960 }
1961 if (i > 0 && start <= last) {
1962 errln((UnicodeString)"FAIL result of " + msg +
1963 ": range " + (i+1) +
1964 " overlaps previous range: " + (int)start + ", " + (int)end +
1965 " for " + set);
1966 }
1967 last = end;
1968 }
1969 }
1970
1971 /**
1972 * Convert a bitmask to a UnicodeSet.
1973 */
bitsToSet(int32_t a,UnicodeSet & result)1974 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1975 result.clear();
1976 for (UChar32 i = 0; i < 32; ++i) {
1977 if ((a & (1<<i)) != 0) {
1978 result.add(i);
1979 }
1980 }
1981 return result;
1982 }
1983
1984 /**
1985 * Convert a UnicodeSet to a bitmask. Only the characters
1986 * U+0000 to U+0020 are represented in the bitmask.
1987 */
setToBits(const UnicodeSet & x)1988 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1989 int32_t result = 0;
1990 for (int32_t i = 0; i < 32; ++i) {
1991 if (x.contains((UChar32)i)) {
1992 result |= (1<<i);
1993 }
1994 }
1995 return result;
1996 }
1997
1998 /**
1999 * Return the representation of an inversion list based UnicodeSet
2000 * as a pairs list. Ranges are listed in ascending Unicode order.
2001 * For example, the set [a-zA-M3] is represented as "33AMaz".
2002 */
getPairs(const UnicodeSet & set)2003 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
2004 UnicodeString pairs;
2005 for (int32_t i=0; i<set.getRangeCount(); ++i) {
2006 UChar32 start = set.getRangeStart(i);
2007 UChar32 end = set.getRangeEnd(i);
2008 if (end > 0xFFFF) {
2009 end = 0xFFFF;
2010 i = set.getRangeCount(); // Should be unnecessary
2011 }
2012 pairs.append((char16_t)start).append((char16_t)end);
2013 }
2014 return pairs;
2015 }
2016
2017 /**
2018 * Basic consistency check for a few items.
2019 * That the iterator works, and that we can create a pattern and
2020 * get the same thing back
2021 */
checkRoundTrip(const UnicodeSet & s)2022 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
2023 {
2024 UnicodeSet t(s);
2025 checkEqual(s, t, "copy ct");
2026 }
2027
2028 {
2029 UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten
2030 t = s;
2031 checkEqual(s, t, "operator=");
2032 }
2033
2034 {
2035 UnicodeSet t;
2036 copyWithIterator(t, s, false);
2037 checkEqual(s, t, "iterator roundtrip");
2038 }
2039
2040 {
2041 UnicodeSet t;
2042 copyWithIterator(t, s, true); // try range
2043 checkEqual(s, t, "iterator roundtrip");
2044 }
2045
2046 {
2047 UnicodeSet t;
2048 UnicodeString pat;
2049 UErrorCode ec = U_ZERO_ERROR;
2050 s.toPattern(pat, false);
2051 t.applyPattern(pat, ec);
2052 if (U_FAILURE(ec)) {
2053 errln("FAIL: toPattern(escapeUnprintable=false), applyPattern - %s", u_errorName(ec));
2054 return;
2055 } else {
2056 checkEqual(s, t, "toPattern(false)");
2057 }
2058 }
2059
2060 {
2061 UnicodeSet t;
2062 UnicodeString pat;
2063 UErrorCode ec = U_ZERO_ERROR;
2064 s.toPattern(pat, true);
2065 t.applyPattern(pat, ec);
2066 if (U_FAILURE(ec)) {
2067 errln("FAIL: toPattern(escapeUnprintable=true), applyPattern - %s", u_errorName(ec));
2068 return;
2069 } else {
2070 checkEqual(s, t, "toPattern(true)");
2071 }
2072 }
2073 }
2074
checkSerializeRoundTrip(const UnicodeSet & t,UErrorCode & status)2075 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
2076 if(U_FAILURE(status)) return;
2077 int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2078 if(status == U_BUFFER_OVERFLOW_ERROR) {
2079 status = U_ZERO_ERROR;
2080 serializeBuffer.resize(len);
2081 len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
2082 // let 2nd error stand
2083 }
2084 if(U_FAILURE(status)) {
2085 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
2086 return;
2087 }
2088 UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
2089 if(U_FAILURE(status)) {
2090 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
2091 return;
2092 }
2093
2094 checkEqual(t, deserialized, "Set was unequal when deserialized");
2095 }
2096
copyWithIterator(UnicodeSet & t,const UnicodeSet & s,UBool withRange)2097 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
2098 t.clear();
2099 UnicodeSetIterator it(s);
2100 if (withRange) {
2101 while (it.nextRange()) {
2102 if (it.isString()) {
2103 t.add(it.getString());
2104 } else {
2105 t.add(it.getCodepoint(), it.getCodepointEnd());
2106 }
2107 }
2108 } else {
2109 while (it.next()) {
2110 if (it.isString()) {
2111 t.add(it.getString());
2112 } else {
2113 t.add(it.getCodepoint());
2114 }
2115 }
2116 }
2117 }
2118
checkEqual(const UnicodeSet & s,const UnicodeSet & t,const char * message)2119 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
2120 assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
2121 assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
2122 UnicodeString source; s.toPattern(source, true);
2123 UnicodeString result; t.toPattern(result, true);
2124 if (s != t) {
2125 errln((UnicodeString)"FAIL: " + message
2126 + "; source = " + source
2127 + "; result = " + result
2128 );
2129 return false;
2130 } else {
2131 logln((UnicodeString)"Ok: " + message
2132 + "; source = " + source
2133 + "; result = " + result
2134 );
2135 }
2136 return true;
2137 }
2138
2139 void
expectContainment(const UnicodeString & pat,const UnicodeString & charsIn,const UnicodeString & charsOut)2140 UnicodeSetTest::expectContainment(const UnicodeString& pat,
2141 const UnicodeString& charsIn,
2142 const UnicodeString& charsOut) {
2143 UErrorCode ec = U_ZERO_ERROR;
2144 UnicodeSet set(pat, ec);
2145 if (U_FAILURE(ec)) {
2146 dataerrln((UnicodeString)"FAIL: pattern \"" +
2147 pat + "\" => " + u_errorName(ec));
2148 return;
2149 }
2150 expectContainment(set, pat, charsIn, charsOut);
2151 }
2152
2153 void
expectContainment(const UnicodeSet & set,const UnicodeString & charsIn,const UnicodeString & charsOut)2154 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2155 const UnicodeString& charsIn,
2156 const UnicodeString& charsOut) {
2157 UnicodeString pat;
2158 set.toPattern(pat);
2159 expectContainment(set, pat, charsIn, charsOut);
2160 }
2161
2162 void
expectContainment(const UnicodeSet & set,const UnicodeString & setName,const UnicodeString & charsIn,const UnicodeString & charsOut)2163 UnicodeSetTest::expectContainment(const UnicodeSet& set,
2164 const UnicodeString& setName,
2165 const UnicodeString& charsIn,
2166 const UnicodeString& charsOut) {
2167 UnicodeString bad;
2168 UChar32 c;
2169 int32_t i;
2170
2171 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2172 c = charsIn.char32At(i);
2173 if (!set.contains(c)) {
2174 bad.append(c);
2175 }
2176 }
2177 if (bad.length() > 0) {
2178 errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2179 ", expected containment of " + prettify(charsIn));
2180 } else {
2181 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2182 }
2183
2184 bad.truncate(0);
2185 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2186 c = charsOut.char32At(i);
2187 if (set.contains(c)) {
2188 bad.append(c);
2189 }
2190 }
2191 if (bad.length() > 0) {
2192 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2193 ", expected non-containment of " + prettify(charsOut));
2194 } else {
2195 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2196 }
2197 }
2198
2199 void
expectPattern(UnicodeSet & set,const UnicodeString & pattern,const UnicodeString & expectedPairs)2200 UnicodeSetTest::expectPattern(UnicodeSet& set,
2201 const UnicodeString& pattern,
2202 const UnicodeString& expectedPairs){
2203 UErrorCode status = U_ZERO_ERROR;
2204 set.applyPattern(pattern, status);
2205 if (U_FAILURE(status)) {
2206 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2207 "\") failed");
2208 return;
2209 } else {
2210 if (getPairs(set) != expectedPairs ) {
2211 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2212 "\") => pairs \"" +
2213 escape(getPairs(set)) + "\", expected \"" +
2214 escape(expectedPairs) + "\"");
2215 } else {
2216 logln(UnicodeString("Ok: applyPattern(\"") + pattern +
2217 "\") => pairs \"" +
2218 escape(getPairs(set)) + "\"");
2219 }
2220 }
2221 // the result of calling set.toPattern(), which is the string representation of
2222 // this set(set), is passed to a UnicodeSet constructor, and tested that it
2223 // will produce another set that is equal to this one.
2224 UnicodeString temppattern;
2225 set.toPattern(temppattern);
2226 UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2227 if (U_FAILURE(status)) {
2228 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2229 return;
2230 }
2231 if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2232 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2233 escape(getPairs(set)) + "\""));
2234 } else{
2235 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2236 }
2237
2238 delete tempset;
2239
2240 }
2241
2242 void
expectPairs(const UnicodeSet & set,const UnicodeString & expectedPairs)2243 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2244 if (getPairs(set) != expectedPairs) {
2245 errln(UnicodeString("FAIL: Expected pair list \"") +
2246 escape(expectedPairs) + "\", got \"" +
2247 escape(getPairs(set)) + "\"");
2248 }
2249 }
2250
expectToPattern(const UnicodeSet & set,const UnicodeString & expPat,const char ** expStrings)2251 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2252 const UnicodeString& expPat,
2253 const char** expStrings) {
2254 UnicodeString pat;
2255 set.toPattern(pat, true);
2256 if (pat == expPat) {
2257 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");
2258 } else {
2259 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2260 return;
2261 }
2262 if (expStrings == nullptr) {
2263 return;
2264 }
2265 UBool in = true;
2266 for (int32_t i=0; expStrings[i] != nullptr; ++i) {
2267 if (expStrings[i] == NOT) { // sic; pointer comparison
2268 in = false;
2269 continue;
2270 }
2271 UnicodeString s = CharsToUnicodeString(expStrings[i]);
2272 UBool contained = set.contains(s);
2273 if (contained == in) {
2274 logln((UnicodeString)"Ok: " + expPat +
2275 (contained ? " contains {" : " does not contain {") +
2276 escape(expStrings[i]) + "}");
2277 } else {
2278 errln((UnicodeString)"FAIL: " + expPat +
2279 (contained ? " contains {" : " does not contain {") +
2280 escape(expStrings[i]) + "}");
2281 }
2282 }
2283 }
2284
// Map a value to a single uppercase hexadecimal digit:
// values below 10 are offset from u'0', all other values from u'A' - 10.
// Callers in this file pass values in the range 0..15.
static char16_t toHexString(int32_t i) {
    const int32_t base = (i < 10) ? u'0' : (u'A' - 10);
    return static_cast<char16_t>(i + base);
}
2286
2287 void
doAssert(UBool condition,const char * message)2288 UnicodeSetTest::doAssert(UBool condition, const char *message)
2289 {
2290 if (!condition) {
2291 errln(UnicodeString("ERROR : ") + message);
2292 }
2293 }
2294
2295 UnicodeString
escape(const UnicodeString & s)2296 UnicodeSetTest::escape(const UnicodeString& s) {
2297 UnicodeString buf;
2298 for (int32_t i=0; i<s.length(); )
2299 {
2300 UChar32 c = s.char32At(i);
2301 if (0x0020 <= c && c <= 0x007F) {
2302 buf += c;
2303 } else {
2304 if (c <= 0xFFFF) {
2305 buf += u"\\u";
2306 } else {
2307 buf += u"\\U";
2308 buf += toHexString((c & 0xF0000000) >> 28);
2309 buf += toHexString((c & 0x0F000000) >> 24);
2310 buf += toHexString((c & 0x00F00000) >> 20);
2311 buf += toHexString((c & 0x000F0000) >> 16);
2312 }
2313 buf += toHexString((c & 0xF000) >> 12);
2314 buf += toHexString((c & 0x0F00) >> 8);
2315 buf += toHexString((c & 0x00F0) >> 4);
2316 buf += toHexString(c & 0x000F);
2317 }
2318 i += U16_LENGTH(c);
2319 }
2320 return buf;
2321 }
2322
TestFreezable()2323 void UnicodeSetTest::TestFreezable() {
2324 UErrorCode errorCode=U_ZERO_ERROR;
2325 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2326 UnicodeSet idSet(idPattern, errorCode);
2327 if(U_FAILURE(errorCode)) {
2328 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2329 return;
2330 }
2331
2332 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2333 UnicodeSet wsSet(wsPattern, errorCode);
2334 if(U_FAILURE(errorCode)) {
2335 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2336 return;
2337 }
2338
2339 idSet.add(idPattern);
2340 UnicodeSet frozen(idSet);
2341 frozen.freeze();
2342
2343 if(idSet.isFrozen() || !frozen.isFrozen()) {
2344 errln("FAIL: isFrozen() is wrong");
2345 }
2346 if(frozen!=idSet || !(frozen==idSet)) {
2347 errln("FAIL: a copy-constructed frozen set differs from its original");
2348 }
2349
2350 frozen=wsSet;
2351 if(frozen!=idSet || !(frozen==idSet)) {
2352 errln("FAIL: a frozen set was modified by operator=");
2353 }
2354
2355 UnicodeSet frozen2(frozen);
2356 if(frozen2!=frozen || frozen2!=idSet) {
2357 errln("FAIL: a copied frozen set differs from its frozen original");
2358 }
2359 if(!frozen2.isFrozen()) {
2360 errln("FAIL: copy-constructing a frozen set results in a thawed one");
2361 }
2362 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.
2363 if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2364 errln("FAIL: UnicodeSet(5, 55) failed");
2365 }
2366 frozen3=frozen;
2367 if(!frozen3.isFrozen()) {
2368 errln("FAIL: copying a frozen set results in a thawed one");
2369 }
2370
2371 UnicodeSet *cloned=frozen.clone();
2372 if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2373 errln("FAIL: clone() failed");
2374 }
2375 cloned->add(0xd802, 0xd805);
2376 if(cloned->containsSome(0xd802, 0xd805)) {
2377 errln("FAIL: unable to modify clone");
2378 }
2379 delete cloned;
2380
2381 UnicodeSet *thawed=frozen.cloneAsThawed();
2382 if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2383 errln("FAIL: cloneAsThawed() failed");
2384 }
2385 thawed->add(0xd802, 0xd805);
2386 if(!thawed->contains(0xd802, 0xd805)) {
2387 errln("FAIL: unable to modify thawed clone");
2388 }
2389 delete thawed;
2390
2391 frozen.set(5, 55);
2392 if(frozen!=idSet || !(frozen==idSet)) {
2393 errln("FAIL: UnicodeSet::set() modified a frozen set");
2394 }
2395
2396 frozen.clear();
2397 if(frozen!=idSet || !(frozen==idSet)) {
2398 errln("FAIL: UnicodeSet::clear() modified a frozen set");
2399 }
2400
2401 frozen.closeOver(USET_CASE_INSENSITIVE);
2402 if(frozen!=idSet || !(frozen==idSet)) {
2403 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2404 }
2405
2406 frozen.compact();
2407 if(frozen!=idSet || !(frozen==idSet)) {
2408 errln("FAIL: UnicodeSet::compact() modified a frozen set");
2409 }
2410
2411 ParsePosition pos;
2412 frozen.
2413 applyPattern(wsPattern, errorCode).
2414 applyPattern(wsPattern, USET_IGNORE_SPACE, nullptr, errorCode).
2415 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, nullptr, errorCode).
2416 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2417 applyPropertyAlias(u"Assigned", UnicodeString(), errorCode);
2418 if(frozen!=idSet || !(frozen==idSet)) {
2419 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2420 }
2421
2422 frozen.
2423 add(0xd800).
2424 add(0xd802, 0xd805).
2425 add(wsPattern).
2426 addAll(idPattern).
2427 addAll(wsSet);
2428 if(frozen!=idSet || !(frozen==idSet)) {
2429 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2430 }
2431
2432 frozen.
2433 retain(0x62).
2434 retain(0x64, 0x69).
2435 retainAll(wsPattern).
2436 retainAll(wsSet);
2437 if(frozen!=idSet || !(frozen==idSet)) {
2438 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2439 }
2440
2441 frozen.
2442 remove(0x62).
2443 remove(0x64, 0x69).
2444 remove(idPattern).
2445 removeAll(idPattern).
2446 removeAll(idSet);
2447 if(frozen!=idSet || !(frozen==idSet)) {
2448 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2449 }
2450
2451 frozen.
2452 complement().
2453 complement(0x62).
2454 complement(0x64, 0x69).
2455 complement(idPattern).
2456 complementAll(idPattern).
2457 complementAll(idSet);
2458 if(frozen!=idSet || !(frozen==idSet)) {
2459 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2460 }
2461 }
2462
2463 // Test span() etc. -------------------------------------------------------- ***
2464
2465 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2466 static int32_t
appendUTF8(const char16_t * s,int32_t length,char * t,int32_t capacity)2467 appendUTF8(const char16_t *s, int32_t length, char *t, int32_t capacity) {
2468 UErrorCode errorCode=U_ZERO_ERROR;
2469 int32_t length8=0;
2470 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2471 if(U_SUCCESS(errorCode)) {
2472 return length8;
2473 } else {
2474 // The string contains an unpaired surrogate.
2475 // Ignore this string.
2476 return 0;
2477 }
2478 }
2479
2480 class UnicodeSetWithStringsIterator;
2481
2482 // Make the strings in a UnicodeSet easily accessible.
2483 class UnicodeSetWithStrings {
2484 public:
UnicodeSetWithStrings(const UnicodeSet & normalSet)2485 UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2486 set(normalSet), stringsLength(0), hasSurrogates(false) {
2487 int32_t size=set.size();
2488 if(size>0 && set.charAt(size-1)<0) {
2489 // If a set's last element is not a code point, then it must contain strings.
2490 // Iterate over the set, skip all code point ranges, and cache the strings.
2491 // Convert them to UTF-8 for spanUTF8().
2492 UnicodeSetIterator iter(set);
2493 const UnicodeString *s;
2494 char *s8=utf8;
2495 int32_t length8, utf8Count=0;
2496 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2497 if(iter.isString()) {
2498 // Store the pointer to the set's string element
2499 // which we happen to know is a stable pointer.
2500 strings[stringsLength]=s=&iter.getString();
2501 utf8Count+=
2502 utf8Lengths[stringsLength]=length8=
2503 appendUTF8(s->getBuffer(), s->length(),
2504 s8, (int32_t)(sizeof(utf8)-utf8Count));
2505 if(length8==0) {
2506 hasSurrogates=true; // Contains unpaired surrogates.
2507 }
2508 s8+=length8;
2509 ++stringsLength;
2510 }
2511 }
2512 }
2513 }
2514
getSet() const2515 const UnicodeSet &getSet() const {
2516 return set;
2517 }
2518
hasStrings() const2519 UBool hasStrings() const {
2520 return (UBool)(stringsLength>0);
2521 }
2522
hasStringsWithSurrogates() const2523 UBool hasStringsWithSurrogates() const {
2524 return hasSurrogates;
2525 }
2526
2527 private:
2528 friend class UnicodeSetWithStringsIterator;
2529
2530 const UnicodeSet &set;
2531
2532 const UnicodeString *strings[20];
2533 int32_t stringsLength;
2534 UBool hasSurrogates;
2535
2536 char utf8[1024];
2537 int32_t utf8Lengths[20];
2538 };
2539
2540 class UnicodeSetWithStringsIterator {
2541 public:
UnicodeSetWithStringsIterator(const UnicodeSetWithStrings & set)2542 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2543 fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2544 }
2545
reset()2546 void reset() {
2547 nextStringIndex=nextUTF8Start=0;
2548 }
2549
nextString()2550 const UnicodeString *nextString() {
2551 if(nextStringIndex<fSet.stringsLength) {
2552 return fSet.strings[nextStringIndex++];
2553 } else {
2554 return nullptr;
2555 }
2556 }
2557
2558 // Do not mix with calls to nextString().
nextUTF8(int32_t & length)2559 const char *nextUTF8(int32_t &length) {
2560 if(nextStringIndex<fSet.stringsLength) {
2561 const char *s8=fSet.utf8+nextUTF8Start;
2562 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2563 return s8;
2564 } else {
2565 length=0;
2566 return nullptr;
2567 }
2568 }
2569
2570 private:
2571 const UnicodeSetWithStrings &fSet;
2572 int32_t nextStringIndex;
2573 int32_t nextUTF8Start;
2574 };
2575
2576 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
2577 // at code point boundaries.
2578 // That is, each edge of a match must not be in the middle of a surrogate pair.
2579 static inline UBool
matches16CPB(const char16_t * s,int32_t start,int32_t limit,const UnicodeString & t)2580 matches16CPB(const char16_t *s, int32_t start, int32_t limit, const UnicodeString &t) {
2581 s+=start;
2582 limit-=start;
2583 int32_t length=t.length();
2584 return 0==t.compare(s, length) &&
2585 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2586 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2587 }
2588
// Implement span() with contains() for comparison.
//
// Forward span over the UTF-16 string s[0..length[.
// Returns the limit (exclusive end index) of the span that starts at index 0.
//
// - If the set has no cached strings, the span covers the leading code points
//   whose contains() result equals spanCondition (any condition other than
//   USET_SPAN_NOT_CONTAINED is treated as USET_SPAN_CONTAINED).
// - With strings and USET_SPAN_NOT_CONTAINED, the span stops at the first
//   position where the code point is in the set or one of the strings matches.
// - With strings and USET_SPAN_CONTAINED or USET_SPAN_SIMPLE, the span is
//   extended by contained code points and by matching strings; see the
//   comments in that branch for how competing matches are handled.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const char16_t *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev remembers where the current code point starts, so that the
        // code point which ends the span is not included in the result.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        // start: position being examined; next: position after its code point.
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; a string matching here ends the span as well.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=nullptr) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=true;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: the farthest limit reached by any recursive call.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "nothing matched at start yet".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=nullptr) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=true;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2687
// Backward counterpart of containsSpanUTF16(), implemented with contains().
//
// Spans backward from the end of the UTF-16 string s[0..length[ and returns
// the start index of the span that ends at length. Returns 0 for an empty
// string.
//
// The three branches correspond to those of containsSpanUTF16():
// no strings, strings with USET_SPAN_NOT_CONTAINED, and strings with
// USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
// Note that the length parameter is reused as the moving position.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const char16_t *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        // prev remembers the limit of the current code point, so that the
        // code point which ends the span is not included in the result.
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original string length for matches16CPB().
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; a string that ends at prev ends the span as well.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=nullptr) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=true;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: the smallest start reached by any recursive call.
        // length0 keeps the original string length for matches16CPB().
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "nothing matched before prev yet".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=nullptr) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=true;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2786
// UTF-8 counterpart of containsSpanUTF16(), implemented with contains().
//
// Forward span over the UTF-8 string s[0..length[.
// Returns the limit (exclusive end index, in bytes) of the span that starts
// at index 0.
//
// Code points are read with U8_NEXT_OR_FFFD(). Strings are compared bytewise
// against the UTF-8 forms cached in the UnicodeSetWithStrings; cached forms
// of length 0 are skipped.
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev remembers where the current code point starts, so that the
        // code point which ends the span is not included in the result.
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        // start: position being examined; next: position after its code point.
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; a string matching here ends the span as well.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=nullptr) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=true;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: the farthest limit reached by any recursive call.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "nothing matched at start yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=nullptr) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=true;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2886
// UTF-8 counterpart of containsSpanBackUTF16(), implemented with contains().
//
// Spans backward from the end of the UTF-8 string s[0..length[ and returns
// the start index (in bytes) of the span that ends at length. Returns 0 for
// an empty string.
//
// Code points are read with U8_PREV_OR_FFFD(). Strings are compared bytewise
// against the UTF-8 forms cached in the UnicodeSetWithStrings; cached forms
// of length 0 are skipped.
// Note that the length parameter is reused as the moving position.
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        // prev remembers the limit of the current code point, so that the
        // code point which ends the span is not included in the result.
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; a string that ends at prev ends the span as well.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=nullptr) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=true;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: the smallest start reached by any recursive call.
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "nothing matched before prev yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=nullptr) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=true;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2987
// Bit flags selecting which span variants are performed and compared.
// Every group has one bit per option plus a mask that covers the whole group.
enum {
    // Text encoding of the input.
    SPAN_UTF16      = 1 << 0,
    SPAN_UTF8       = 1 << 1,
    SPAN_UTFS       = SPAN_UTF16 | SPAN_UTF8,

    // Original set versus complemented set.
    SPAN_SET        = 1 << 2,
    SPAN_COMPLEMENT = 1 << 3,
    SPAN_POLARITY   = SPAN_SET | SPAN_COMPLEMENT,

    // Iteration direction.
    SPAN_FWD        = 1 << 4,
    SPAN_BACK       = 1 << 5,
    SPAN_DIRS       = SPAN_FWD | SPAN_BACK,

    // Which "contained" span condition alternates with "not contained".
    SPAN_CONTAINED  = 1 << 8,
    SPAN_SIMPLE     = 1 << 9,
    SPAN_CONDITION  = SPAN_CONTAINED | SPAN_SIMPLE,

    // Everything enabled.
    SPAN_ALL        = SPAN_UTFS | SPAN_POLARITY | SPAN_DIRS | SPAN_CONDITION
};
3008
invertSpanCondition(USetSpanCondition spanCondition,USetSpanCondition contained)3009 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
3010 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
3011 }
3012
slen(const void * s,UBool isUTF16)3013 static inline int32_t slen(const void *s, UBool isUTF16) {
3014 return isUTF16 ? u_strlen((const char16_t *)s) : static_cast<int32_t>(strlen((const char *)s));
3015 }
3016
3017 /*
3018 * Count spans on a string with the method according to type and set the span limits.
3019 * The set may be the complement of the original.
3020 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
3021 * according to the expected number of spans.
3022 * Sets typeName to an empty string if there is no such type.
3023 * Returns -1 if the span option is filtered out.
3024 */
getSpans(const UnicodeSetWithStrings & set,UBool isComplement,const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int type,const char * & typeName,int32_t limits[],int32_t limitsCapacity,int32_t expectCount)3025 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
3026 const void *s, int32_t length, UBool isUTF16,
3027 uint32_t whichSpans,
3028 int type, const char *&typeName,
3029 int32_t limits[], int32_t limitsCapacity,
3030 int32_t expectCount) {
3031 const UnicodeSet &realSet(set.getSet());
3032 int32_t start, count;
3033 USetSpanCondition spanCondition, firstSpanCondition, contained;
3034 UBool isForward;
3035
3036 if(type<0 || 7<type) {
3037 typeName="";
3038 return 0;
3039 }
3040
3041 static const char *const typeNames16[]={
3042 "contains", "contains(LM)",
3043 "span", "span(LM)",
3044 "containsBack", "containsBack(LM)",
3045 "spanBack", "spanBack(LM)"
3046 };
3047
3048 static const char *const typeNames8[]={
3049 "containsUTF8", "containsUTF8(LM)",
3050 "spanUTF8", "spanUTF8(LM)",
3051 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
3052 "spanBackUTF8", "spanBackUTF8(LM)"
3053 };
3054
3055 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
3056
3057 // filter span options
3058 if(type<=3) {
3059 // span forward
3060 if((whichSpans&SPAN_FWD)==0) {
3061 return -1;
3062 }
3063 isForward=true;
3064 } else {
3065 // span backward
3066 if((whichSpans&SPAN_BACK)==0) {
3067 return -1;
3068 }
3069 isForward=false;
3070 }
3071 if((type&1)==0) {
3072 // use USET_SPAN_CONTAINED
3073 if((whichSpans&SPAN_CONTAINED)==0) {
3074 return -1;
3075 }
3076 contained=USET_SPAN_CONTAINED;
3077 } else {
3078 // use USET_SPAN_SIMPLE
3079 if((whichSpans&SPAN_SIMPLE)==0) {
3080 return -1;
3081 }
3082 contained=USET_SPAN_SIMPLE;
3083 }
3084
3085 // Default first span condition for going forward with an uncomplemented set.
3086 spanCondition=USET_SPAN_NOT_CONTAINED;
3087 if(isComplement) {
3088 spanCondition=invertSpanCondition(spanCondition, contained);
3089 }
3090
3091 // First span condition for span(), used to terminate the spanBack() iteration.
3092 firstSpanCondition=spanCondition;
3093
3094 // spanBack(): Its initial span condition is span()'s last span condition,
3095 // which is the opposite of span()'s first span condition
3096 // if we expect an even number of spans.
3097 // (The loop inverts spanCondition (expectCount-1) times
3098 // before the expectCount'th span() call.)
3099 // If we do not compare forward and backward directions, then we do not have an
3100 // expectCount and just start with firstSpanCondition.
3101 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
3102 spanCondition=invertSpanCondition(spanCondition, contained);
3103 }
3104
3105 count=0;
3106 switch(type) {
3107 case 0:
3108 case 1:
3109 start=0;
3110 if(length<0) {
3111 length=slen(s, isUTF16);
3112 }
3113 for(;;) {
3114 start+= isUTF16 ? containsSpanUTF16(set, (const char16_t *)s+start, length-start, spanCondition) :
3115 containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
3116 if(count<limitsCapacity) {
3117 limits[count]=start;
3118 }
3119 ++count;
3120 if(start>=length) {
3121 break;
3122 }
3123 spanCondition=invertSpanCondition(spanCondition, contained);
3124 }
3125 break;
3126 case 2:
3127 case 3:
3128 start=0;
3129 for(;;) {
3130 start+= isUTF16 ? realSet.span((const char16_t *)s+start, length>=0 ? length-start : length, spanCondition) :
3131 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
3132 if(count<limitsCapacity) {
3133 limits[count]=start;
3134 }
3135 ++count;
3136 if(length>=0 ? start>=length :
3137 isUTF16 ? ((const char16_t *)s)[start]==0 :
3138 ((const char *)s)[start]==0
3139 ) {
3140 break;
3141 }
3142 spanCondition=invertSpanCondition(spanCondition, contained);
3143 }
3144 break;
3145 case 4:
3146 case 5:
3147 if(length<0) {
3148 length=slen(s, isUTF16);
3149 }
3150 for(;;) {
3151 ++count;
3152 if(count<=limitsCapacity) {
3153 limits[limitsCapacity-count]=length;
3154 }
3155 length= isUTF16 ? containsSpanBackUTF16(set, (const char16_t *)s, length, spanCondition) :
3156 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3157 if(length==0 && spanCondition==firstSpanCondition) {
3158 break;
3159 }
3160 spanCondition=invertSpanCondition(spanCondition, contained);
3161 }
3162 if(count<limitsCapacity) {
3163 memmove(limits, limits+(limitsCapacity-count), count*4);
3164 }
3165 break;
3166 case 6:
3167 case 7:
3168 for(;;) {
3169 ++count;
3170 if(count<=limitsCapacity) {
3171 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3172 }
3173 // Note: Length<0 is tested only for the first spanBack().
3174 // If we wanted to keep length<0 for all spanBack()s, we would have to
3175 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3176 length= isUTF16 ? realSet.spanBack((const char16_t *)s, length, spanCondition) :
3177 realSet.spanBackUTF8((const char *)s, length, spanCondition);
3178 if(length==0 && spanCondition==firstSpanCondition) {
3179 break;
3180 }
3181 spanCondition=invertSpanCondition(spanCondition, contained);
3182 }
3183 if(count<limitsCapacity) {
3184 memmove(limits, limits+(limitsCapacity-count), count*4);
3185 }
3186 break;
3187 default:
3188 typeName="";
3189 return -1;
3190 }
3191
3192 return count;
3193 }
3194
// Indexes of the set variants under test.
// An even index is an original set, an odd index is its complement.
enum {
    SLOW     = 0,
    SLOW_NOT = 1,
    FAST     = 2,
    FAST_NOT = 3,
    SET_COUNT
};

// Names used in failure messages, indexed by the constants above.
static const char *const setNames[SET_COUNT]={
    "slow", "slow.not",
    "fast", "fast.not"
};
3210
3211 /*
3212 * Verify that we get the same results whether we look at text with contains(),
3213 * span() or spanBack(), using unfrozen or frozen versions of the set,
3214 * and using the set or its complement (switching the spanConditions accordingly).
3215 * The latter verifies that
3216 * set.span(spanCondition) == set.complement().span(!spanCondition).
3217 *
3218 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3219 * or returned to the caller (with an input expectCount<0).
3220 */
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,int32_t expectLimits[],int32_t & expectCount,const char * testName,int32_t index)3221 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3222 const void *s, int32_t length, UBool isUTF16,
3223 uint32_t whichSpans,
3224 int32_t expectLimits[], int32_t &expectCount,
3225 const char *testName, int32_t index) {
3226 int32_t limits[500];
3227 int32_t limitsCount;
3228 int i, j;
3229
3230 const char *typeName;
3231 int type;
3232
3233 for(i=0; i<SET_COUNT; ++i) {
3234 if((i&1)==0) {
3235 // Even-numbered sets are original, uncomplemented sets.
3236 if((whichSpans&SPAN_SET)==0) {
3237 continue;
3238 }
3239 } else {
3240 // Odd-numbered sets are complemented.
3241 if((whichSpans&SPAN_COMPLEMENT)==0) {
3242 continue;
3243 }
3244 }
3245 for(type=0;; ++type) {
3246 limitsCount=getSpans(*sets[i], (UBool)(i&1),
3247 s, length, isUTF16,
3248 whichSpans,
3249 type, typeName,
3250 limits, UPRV_LENGTHOF(limits), expectCount);
3251 if(typeName[0]==0) {
3252 break; // All types tried.
3253 }
3254 if(limitsCount<0) {
3255 continue; // Span option filtered out.
3256 }
3257 if(expectCount<0) {
3258 expectCount=limitsCount;
3259 if(limitsCount>UPRV_LENGTHOF(limits)) {
3260 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3261 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3262 return;
3263 }
3264 memcpy(expectLimits, limits, limitsCount*4);
3265 } else if(limitsCount!=expectCount) {
3266 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3267 testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3268 } else {
3269 for(j=0; j<limitsCount; ++j) {
3270 if(limits[j]!=expectLimits[j]) {
3271 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3272 testName, (long)index, setNames[i], typeName, (long)limitsCount,
3273 j, (long)limits[j], (long)expectLimits[j]);
3274 break;
3275 }
3276 }
3277 }
3278 }
3279 }
3280
3281 // Compare span() with containsAll()/containsNone(),
3282 // but only if we have expectLimits[] from the uncomplemented set.
3283 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3284 const char16_t *s16=(const char16_t *)s;
3285 UnicodeString string;
3286 int32_t prev=0, limit, length;
3287 for(i=0; i<expectCount; ++i) {
3288 limit=expectLimits[i];
3289 length=limit-prev;
3290 if(length>0) {
3291 string.setTo(false, s16+prev, length); // read-only alias
3292 if(i&1) {
3293 if(!sets[SLOW]->getSet().containsAll(string)) {
3294 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==false contradicts span()",
3295 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3296 return;
3297 }
3298 if(!sets[FAST]->getSet().containsAll(string)) {
3299 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==false contradicts span()",
3300 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3301 return;
3302 }
3303 } else {
3304 if(!sets[SLOW]->getSet().containsNone(string)) {
3305 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==false contradicts span()",
3306 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3307 return;
3308 }
3309 if(!sets[FAST]->getSet().containsNone(string)) {
3310 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==false contradicts span()",
3311 testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3312 return;
3313 }
3314 }
3315 }
3316 prev=limit;
3317 }
3318 }
3319 }
3320
3321 // Specifically test either UTF-16 or UTF-8.
testSpan(const UnicodeSetWithStrings * sets[4],const void * s,int32_t length,UBool isUTF16,uint32_t whichSpans,const char * testName,int32_t index)3322 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3323 const void *s, int32_t length, UBool isUTF16,
3324 uint32_t whichSpans,
3325 const char *testName, int32_t index) {
3326 int32_t expectLimits[500];
3327 int32_t expectCount=-1;
3328 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3329 }
3330
stringContainsUnpairedSurrogate(const char16_t * s,int32_t length)3331 UBool stringContainsUnpairedSurrogate(const char16_t *s, int32_t length) {
3332 char16_t c, c2;
3333
3334 if(length>=0) {
3335 while(length>0) {
3336 c=*s++;
3337 --length;
3338 if(0xd800<=c && c<0xe000) {
3339 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3340 return true;
3341 }
3342 --length;
3343 }
3344 }
3345 } else {
3346 while((c=*s++)!=0) {
3347 if(0xd800<=c && c<0xe000) {
3348 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3349 return true;
3350 }
3351 }
3352 }
3353 }
3354 return false;
3355 }
3356
3357 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3358 // unless either UTF is turned off in whichSpans.
3359 // Testing UTF-16 and UTF-8 together requires that surrogate code points
3360 // have the same contains(c) value as U+FFFD.
testSpanBothUTFs(const UnicodeSetWithStrings * sets[4],const char16_t * s16,int32_t length16,uint32_t whichSpans,const char * testName,int32_t index)3361 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3362 const char16_t *s16, int32_t length16,
3363 uint32_t whichSpans,
3364 const char *testName, int32_t index) {
3365 int32_t expectLimits[500];
3366 int32_t expectCount;
3367
3368 expectCount=-1; // Get expectLimits[] from testSpan().
3369
3370 if((whichSpans&SPAN_UTF16)!=0) {
3371 testSpan(sets, s16, length16, true, whichSpans, expectLimits, expectCount, testName, index);
3372 }
3373 if((whichSpans&SPAN_UTF8)==0) {
3374 return;
3375 }
3376
3377 // Convert s16[] and expectLimits[] to UTF-8.
3378 uint8_t s8[3000];
3379 int32_t offsets[3000];
3380
3381 const char16_t *s16Limit=s16+length16;
3382 char *t=(char *)s8;
3383 char *tLimit=t+sizeof(s8);
3384 int32_t *o=offsets;
3385 UErrorCode errorCode=U_ZERO_ERROR;
3386
3387 // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3388 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, true, &errorCode);
3389 if(U_FAILURE(errorCode)) {
3390 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3391 testName, (long)index, u_errorName(errorCode));
3392 ucnv_resetFromUnicode(utf8Cnv);
3393 return;
3394 }
3395 int32_t length8=(int32_t)(t-(char *)s8);
3396
3397 // Convert expectLimits[].
3398 int32_t i, j, expect;
3399 for(i=j=0; i<expectCount; ++i) {
3400 expect=expectLimits[i];
3401 if(expect==length16) {
3402 expectLimits[i]=length8;
3403 } else {
3404 while(offsets[j]<expect) {
3405 ++j;
3406 }
3407 expectLimits[i]=j;
3408 }
3409 }
3410
3411 testSpan(sets, s8, length8, false, whichSpans, expectLimits, expectCount, testName, index);
3412 }
3413
nextCodePoint(UChar32 c)3414 static UChar32 nextCodePoint(UChar32 c) {
3415 // Skip some large and boring ranges.
3416 switch(c) {
3417 case 0x3441:
3418 return 0x4d7f;
3419 case 0x5100:
3420 return 0x9f00;
3421 case 0xb040:
3422 return 0xd780;
3423 case 0xe041:
3424 return 0xf8fe;
3425 case 0x10100:
3426 return 0x20000;
3427 case 0x20041:
3428 return 0xe0000;
3429 case 0xe0101:
3430 return 0x10fffd;
3431 default:
3432 return c+1;
3433 }
3434 }
3435
3436 // Verify that all implementations represent the same set.
testSpanContents(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3437 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3438 // contains(U+FFFD) is inconsistent with contains(some surrogates),
3439 // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3440 // Skip the UTF-8 part of the test - if the string contains surrogates -
3441 // because it is likely to produce a different result.
3442 UBool inconsistentSurrogates=
3443 (!(sets[0]->getSet().contains(0xfffd) ?
3444 sets[0]->getSet().contains(0xd800, 0xdfff) :
3445 sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3446 sets[0]->hasStringsWithSurrogates());
3447
3448 char16_t s[1000];
3449 int32_t length=0;
3450 uint32_t localWhichSpans;
3451
3452 UChar32 c, first;
3453 for(first=c=0;; c=nextCodePoint(c)) {
3454 if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3455 localWhichSpans=whichSpans;
3456 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3457 localWhichSpans&=~SPAN_UTF8;
3458 }
3459 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3460 if(c>0x10ffff) {
3461 break;
3462 }
3463 length=0;
3464 first=c;
3465 }
3466 U16_APPEND_UNSAFE(s, length, c);
3467 }
3468 }
3469
3470 // Test with a particular, interesting string.
3471 // Specify length and try NUL-termination.
testSpanUTF16String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3472 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3473 static const char16_t s[]={
3474 0x61, 0x62, 0x20, // Latin, space
3475 0x3b1, 0x3b2, 0x3b3, // Greek
3476 0xd900, // lead surrogate
3477 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
3478 0xdc05, // trail surrogate
3479 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
3480 0xd900, 0xdc05, // unassigned supplementary
3481 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
3482 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wrong order, LS
3483 0 // NUL
3484 };
3485
3486 if((whichSpans&SPAN_UTF16)==0) {
3487 return;
3488 }
3489 testSpan(sets, s, -1, true, (whichSpans&~SPAN_UTF8), testName, 0);
3490 testSpan(sets, s, UPRV_LENGTHOF(s)-1, true, (whichSpans&~SPAN_UTF8), testName, 1);
3491 }
3492
testSpanUTF8String(const UnicodeSetWithStrings * sets[4],uint32_t whichSpans,const char * testName)3493 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3494 static const char s[]={
3495 "abc" // Latin
3496
3497 /* trail byte in lead position */
3498 "\x80"
3499
3500 " " // space
3501
3502 /* truncated multi-byte sequences */
3503 "\xd0"
3504 "\xe0"
3505 "\xe1"
3506 "\xed"
3507 "\xee"
3508 "\xf0"
3509 "\xf1"
3510 "\xf4"
3511 "\xf8"
3512 "\xfc"
3513
3514 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek
3515
3516 /* trail byte in lead position */
3517 "\x80"
3518
3519 "\xe0\x80"
3520 "\xe0\xa0"
3521 "\xe1\x80"
3522 "\xed\x80"
3523 "\xed\xa0"
3524 "\xee\x80"
3525 "\xf0\x80"
3526 "\xf0\x90"
3527 "\xf1\x80"
3528 "\xf4\x80"
3529 "\xf4\x90"
3530 "\xf8\x80"
3531 "\xfc\x80"
3532
3533 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana
3534
3535 /* trail byte in lead position */
3536 "\x80"
3537
3538 "\xf0\x80\x80"
3539 "\xf0\x90\x80"
3540 "\xf1\x80\x80"
3541 "\xf4\x80\x80"
3542 "\xf4\x90\x80"
3543 "\xf8\x80\x80"
3544 "\xfc\x80\x80"
3545
3546 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul
3547
3548 /* trail byte in lead position */
3549 "\x80"
3550
3551 "\xf8\x80\x80\x80"
3552 "\xfc\x80\x80\x80"
3553
3554 "\xF1\x90\x80\x85" // unassigned supplementary
3555
3556 /* trail byte in lead position */
3557 "\x80"
3558
3559 "\xfc\x80\x80\x80\x80"
3560
3561 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary
3562
3563 /* trail byte in lead position */
3564 "\x80"
3565
3566 /* complete sequences but non-shortest forms or out of range etc. */
3567 "\xc0\x80"
3568 "\xe0\x80\x80"
3569 "\xed\xa0\x80"
3570 "\xf0\x80\x80\x80"
3571 "\xf4\x90\x80\x80"
3572 "\xf8\x80\x80\x80\x80"
3573 "\xfc\x80\x80\x80\x80\x80"
3574 "\xfe"
3575 "\xff"
3576
3577 /* trail byte in lead position */
3578 "\x80"
3579
3580 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminated
3581 };
3582
3583 if((whichSpans&SPAN_UTF8)==0) {
3584 return;
3585 }
3586 testSpan(sets, s, -1, false, (whichSpans&~SPAN_UTF16), testName, 0);
3587 testSpan(sets, s, UPRV_LENGTHOF(s)-1, false, (whichSpans&~SPAN_UTF16), testName, 1);
3588 }
3589
// Multiplies a list of span-option words so that each resulting word carries
// exactly one of the alternatives a, b and c.
// Every existing word is first reduced with mask. The word itself then gets a,
// a copy at [count+i] gets b, and a copy at [2*count+i] gets c.
// If b==0, the words are only modified with mask and a (c is ignored).
// If b!=0 and c==0, only the a and b variants are produced.
// The caller must provide room for the additional words.
// Returns the new number of words.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    const bool haveB= b!=0;
    const bool haveC= haveB && c!=0;

    for(int32_t k=0; k<whichSpansCount; ++k) {
        const uint32_t kept=whichSpans[k]&mask;
        whichSpans[k]=kept|a;
        if(haveB) {
            whichSpans[whichSpansCount+k]=kept|b;
        }
        if(haveC) {
            whichSpans[2*whichSpansCount+k]=kept|c;
        }
    }

    if(!haveB) {
        return whichSpansCount;
    }
    return (haveC ? 3 : 2)*whichSpansCount;
}
3612
3613 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3614 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
3615 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3616 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3617
TestSpan()3618 void UnicodeSetTest::TestSpan() {
3619 // "[...]" is a UnicodeSet pattern.
3620 // "*" performs tests on all Unicode code points and on a selection of
3621 // malformed UTF-8/16 strings.
3622 // "-options" limits the scope of testing for the current set.
3623 // By default, the test verifies that equivalent boundaries are found
3624 // for UTF-16 and UTF-8, going forward and backward,
3625 // alternating USET_SPAN_NOT_CONTAINED with
3626 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3627 // Single-character options:
3628 // 8 -- UTF-16 and UTF-8 boundaries may differ.
3629 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3630 // or the set contains strings with unpaired surrogates
3631 // which do not translate to valid UTF-8.
3632 // c -- set.span() and set.complement().span() boundaries may differ.
3633 // Cause: Set strings are not complemented.
3634 // b -- span() and spanBack() boundaries may differ.
3635 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3636 // and spanBack(USET_SPAN_SIMPLE) are defined to
3637 // match with non-overlapping substrings.
3638 // For example, with a set containing "ab" and "ba",
3639 // span() of "aba" yields boundaries { 0, 2, 3 }
3640 // because the initial "ab" matches from 0 to 2,
3641 // while spanBack() yields boundaries { 0, 1, 3 }
3642 // because the final "ba" matches from 1 to 3.
3643 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3644 // Cause: Strings in the set overlap, and a longer match may
3645 // require a sequence including non-longest substrings.
3646 // For example, with a set containing "ab", "abc" and "cd",
3647 // span(contained) of "abcd" spans the entire string
3648 // but span(longest match) only spans the first 3 characters.
3649 // Each "-options" first resets all options and then applies the specified options.
3650 // A "-" without options resets the options.
3651 // The options are also reset for each new set.
3652 // Other strings will be spanned.
3653 static const char *const testdata[]={
3654 "[:ID_Continue:]",
3655 "*",
3656 "[:White_Space:]",
3657 "*",
3658 "[]",
3659 "*",
3660 "[\\u0000-\\U0010FFFF]",
3661 "*",
3662 "[\\u0000\\u0080\\u0800\\U00010000]",
3663 "*",
3664 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3665 "*",
3666 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3667 "-c",
3668 "*",
3669 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3670 "-c",
3671 "*",
3672
3673 // Overlapping strings cause overlapping attempts to match.
3674 "[x{xy}{xya}{axy}{ax}]",
3675 "-cl",
3676
3677 // More repetitions of "xya" would take too long with the recursive
3678 // reference implementation.
3679 // containsAll()=false
3680 // test_string 0x14
3681 "xx"
3682 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.
3683 "xx" // set.complement().span(contained) will stop between the two 'x'es.
3684 "xyaxyaxyaxya"
3685 "xx"
3686 "xyaxyaxyaxya" // span() ends here.
3687 "aaa",
3688
3689 // containsAll()=true
3690 // test_string 0x15
3691 "xx"
3692 "xyaxyaxyaxya"
3693 "xx"
3694 "xyaxyaxyaxya"
3695 "xx"
3696 "xyaxyaxyaxy",
3697
3698 "-bc",
3699 // test_string 0x17
3700 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
3701 "-c",
3702 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
3703 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
3704 "-",
3705 "byaya", // span() -> { 5 }
3706 "byay", // span() -> { 4 }
3707 "bya", // span() -> { 3 }
3708
3709 // span(longest match) will not span the whole string.
3710 "[a{ab}{bc}]",
3711 "-cl",
3712 // test_string 0x21
3713 "abc",
3714
3715 "[a{ab}{abc}{cd}]",
3716 "-cl",
3717 "acdabcdabccd",
3718
3719 // spanBack(longest match) will not span the whole string.
3720 "[c{ab}{bc}]",
3721 "-cl",
3722 "abc",
3723
3724 "[d{cd}{bcd}{ab}]",
3725 "-cl",
3726 "abbcdabcdabd",
3727
3728 // Test with non-ASCII set strings - test proper handling of surrogate pairs
3729 // and UTF-8 trail bytes.
3730 // Copies of above test sets and strings, but transliterated to have
3731 // different code points with similar trail units.
3732 // Previous: a b c d
3733 // Unicode: 042B 30AB 200AB 204AB
3734 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
3735 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
3736 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3737 "-cl",
3738 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3739
3740 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3741 "-cl",
3742 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3743
3744 // Stress bookkeeping and recursion.
3745 // The following strings are barely doable with the recursive
3746 // reference implementation.
3747 // The not-contained character at the end prevents an early exit from the span().
3748 "[b{bb}]",
3749 "-c",
3750 // test_string 0x33
3751 "bbbbbbbbbbbbbbbbbbbbbbbb-",
3752 // On complement sets, span() and spanBack() get different results
3753 // because b is not in the complement set and there is an odd number of b's
3754 // in the test string.
3755 "-bc",
3756 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3757
3758 // Test with set strings with an initial or final code point span
3759 // longer than 254.
3760 "[a{" _64_a _64_a _64_a _64_a "b}"
3761 "{a" _64_b _64_b _64_b _64_b "}]",
3762 "-c",
3763 _64_a _64_a _64_a _63_a "b",
3764 _64_a _64_a _64_a _64_a "b",
3765 _64_a _64_a _64_a _64_a "aaaabbbb",
3766 "a" _64_b _64_b _64_b _63_b,
3767 "a" _64_b _64_b _64_b _64_b,
3768 "aaaabbbb" _64_b _64_b _64_b _64_b,
3769
3770 // Test with strings containing unpaired surrogates.
3771 // They are not representable in UTF-8, and a leading trail surrogate
3772 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3773 // U+20001 == \\uD840\\uDC01
3774 // U+20400 == \\uD841\\uDC00
3775 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3776 "-8cl",
3777 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3778 };
3779 uint32_t whichSpans[96]={ SPAN_ALL };
3780 int32_t whichSpansCount=1;
3781
3782 UnicodeSet *sets[SET_COUNT]={ nullptr };
3783 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ nullptr };
3784
3785 char testName[1024];
3786 char *testNameLimit=testName;
3787
3788 int32_t i, j;
3789 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3790 const char *s=testdata[i];
3791 if(s[0]=='[') {
3792 // Create new test sets from this pattern.
3793 for(j=0; j<SET_COUNT; ++j) {
3794 delete sets_with_str[j];
3795 delete sets[j];
3796 }
3797 UErrorCode errorCode=U_ZERO_ERROR;
3798 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3799 if(U_FAILURE(errorCode)) {
3800 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3801 break;
3802 }
3803 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3804 sets[SLOW_NOT]->complement();
3805 // Intermediate set: Test cloning of a frozen set.
3806 UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3807 fast->freeze();
3808 sets[FAST]=fast->clone();
3809 delete fast;
3810 UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3811 fastNot->freeze();
3812 sets[FAST_NOT]=fastNot->clone();
3813 delete fastNot;
3814
3815 for(j=0; j<SET_COUNT; ++j) {
3816 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3817 }
3818
3819 strcpy(testName, s);
3820 testNameLimit=strchr(testName, 0);
3821 *testNameLimit++=':';
3822 *testNameLimit=0;
3823
3824 whichSpans[0]=SPAN_ALL;
3825 whichSpansCount=1;
3826 } else if(s[0]=='-') {
3827 whichSpans[0]=SPAN_ALL;
3828 whichSpansCount=1;
3829
3830 while(*++s!=0) {
3831 switch(*s) {
3832 case 'c':
3833 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3834 ~SPAN_POLARITY,
3835 SPAN_SET,
3836 SPAN_COMPLEMENT,
3837 0);
3838 break;
3839 case 'b':
3840 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3841 ~SPAN_DIRS,
3842 SPAN_FWD,
3843 SPAN_BACK,
3844 0);
3845 break;
3846 case 'l':
3847 // test USET_SPAN_CONTAINED FWD & BACK, and separately
3848 // USET_SPAN_SIMPLE only FWD, and separately
3849 // USET_SPAN_SIMPLE only BACK
3850 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3851 ~(SPAN_DIRS|SPAN_CONDITION),
3852 SPAN_DIRS|SPAN_CONTAINED,
3853 SPAN_FWD|SPAN_SIMPLE,
3854 SPAN_BACK|SPAN_SIMPLE);
3855 break;
3856 case '8':
3857 whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3858 ~SPAN_UTFS,
3859 SPAN_UTF16,
3860 SPAN_UTF8,
3861 0);
3862 break;
3863 default:
3864 errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3865 break;
3866 }
3867 }
3868 } else if(0==strcmp(s, "*")) {
3869 strcpy(testNameLimit, "bad_string");
3870 for(j=0; j<whichSpansCount; ++j) {
3871 if(whichSpansCount>1) {
3872 snprintf(testNameLimit+10 /* strlen("bad_string") */,
3873 sizeof(testName) - (testNameLimit+10-testName),
3874 "%%0x%3x",
3875 whichSpans[j]);
3876 }
3877 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3878 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3879 }
3880
3881 strcpy(testNameLimit, "contents");
3882 for(j=0; j<whichSpansCount; ++j) {
3883 if(whichSpansCount>1) {
3884 snprintf(testNameLimit+8 /* strlen("contents") */,
3885 sizeof(testName) - (testNameLimit+8-testName),
3886 "%%0x%3x",
3887 whichSpans[j]);
3888 }
3889 testSpanContents(sets_with_str, whichSpans[j], testName);
3890 }
3891 } else {
3892 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3893 strcpy(testNameLimit, "test_string");
3894 for(j=0; j<whichSpansCount; ++j) {
3895 if(whichSpansCount>1) {
3896 snprintf(testNameLimit+11 /* strlen("test_string") */,
3897 sizeof(testName) - (testNameLimit+11-testName),
3898 "%%0x%3x",
3899 whichSpans[j]);
3900 }
3901 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3902 }
3903 }
3904 }
3905 for(j=0; j<SET_COUNT; ++j) {
3906 delete sets_with_str[j];
3907 delete sets[j];
3908 }
3909 }
3910
3911 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
TestStringSpan()3912 void UnicodeSetTest::TestStringSpan() {
3913 static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3914 static const char *const string=
3915 "xx"
3916 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3917 "xx"
3918 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3919 "xx"
3920 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3921 "aaaa";
3922
3923 UErrorCode errorCode=U_ZERO_ERROR;
3924 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3925 UnicodeSet set(pattern16, errorCode);
3926 if(U_FAILURE(errorCode)) {
3927 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3928 return;
3929 }
3930
3931 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3932
3933 if(set.containsAll(string16)) {
3934 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be false", pattern, string);
3935 }
3936
3937 // Remove trailing "aaaa".
3938 string16.truncate(string16.length()-4);
3939 if(!set.containsAll(string16)) {
3940 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be true", pattern, string);
3941 }
3942
3943 string16=u"byayaxya";
3944 const char16_t *s16=string16.getBuffer();
3945 int32_t length16=string16.length();
3946 (void)length16; // Suppress set but not used warning.
3947 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3948 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3949 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3950 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3951 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3952 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3953 ) {
3954 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3955 }
3956
3957 pattern="[a{ab}{abc}{cd}]";
3958 pattern16=UnicodeString(pattern, -1, US_INV);
3959 set.applyPattern(pattern16, errorCode);
3960 if(U_FAILURE(errorCode)) {
3961 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3962 return;
3963 }
3964 string16=u"acdabcdabccd";
3965 s16=string16.getBuffer();
3966 length16=string16.length();
3967 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3968 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3969 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3970 ) {
3971 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3972 }
3973
3974 pattern="[d{cd}{bcd}{ab}]";
3975 pattern16=UnicodeString(pattern, -1, US_INV);
3976 set.applyPattern(pattern16, errorCode).freeze();
3977 if(U_FAILURE(errorCode)) {
3978 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3979 return;
3980 }
3981 string16=u"abbcdabcdabd";
3982 s16=string16.getBuffer();
3983 length16=string16.length();
3984 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3985 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3986 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3987 ) {
3988 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3989 }
3990 }
3991
TestPatternWithSurrogates()3992 void UnicodeSetTest::TestPatternWithSurrogates() {
3993 IcuTestErrorCode errorCode(*this, "TestPatternWithSurrogates");
3994 // Regression test for ICU-11891
3995 UnicodeSet surrogates;
3996 surrogates.add(0xd000, 0xd82f); // a range ending with a lead surrogate code point
3997 surrogates.add(0xd83a); // a lead surrogate
3998 surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates
3999 UnicodeString pat;
4000 surrogates.toPattern(pat, false); // bad if U+D83A is immediately followed by U+DC00
4001 UnicodeSet s2;
4002 // was: U_MALFORMED_SET
4003 // Java: IllegalArgumentException: Error: Invalid range at "[...\U0001E800-\uDFFF|...]"
4004 s2.applyPattern(pat, errorCode);
4005 if (errorCode.errIfFailureAndReset("surrogates (1) to/from pattern")) { return; }
4006 checkEqual(surrogates, s2, "surrogates (1) to/from pattern");
4007
4008 // create a range of DBFF-DC00, and in the complement form a range of DC01-DC03
4009 surrogates.add(0xdbff).remove(0xdc01, 0xdc03);
4010 // add a beyond-surrogates range, up to the last code point
4011 surrogates.add(0x10affe, 0x10ffff);
4012 surrogates.toPattern(pat, false); // bad if U+DBFF is immediately followed by U+DC00
4013 s2.applyPattern(pat, errorCode);
4014 if (errorCode.errIfFailureAndReset("surrogates (2) to/from pattern")) { return; }
4015 checkEqual(surrogates, s2, "surrogates (2) to/from pattern");
4016
4017 // Test the toPattern() code path when the pattern is shorter in complement form:
4018 // [^opposite-ranges]
4019 surrogates.add(0, 0x6789);
4020 surrogates.toPattern(pat, false);
4021 s2.applyPattern(pat, errorCode);
4022 if (errorCode.errIfFailureAndReset("surrogates (3) to/from pattern")) { return; }
4023 checkEqual(surrogates, s2, "surrogates (3) to/from pattern");
4024
4025 // Start with a pattern, in case the original pattern is kept but
4026 // without the extra white space.
4027 surrogates.applyPattern(u"[\\uD83A \\uDC00-\\uDFFF]", errorCode);
4028 if (errorCode.errIfFailureAndReset("surrogates from pattern")) { return; }
4029 surrogates.toPattern(pat, false);
4030 s2.applyPattern(pat, errorCode);
4031 if (errorCode.errIfFailureAndReset("surrogates from/to/from pattern")) { return; }
4032 checkEqual(surrogates, s2, "surrogates from/to/from pattern");
4033 }
4034
TestIntOverflow()4035 void UnicodeSetTest::TestIntOverflow() {
4036 // This test triggers undefined double->int conversion behavior
4037 // if the implementation is not careful.
4038 IcuTestErrorCode errorCode(*this, "TestIntOverflow");
4039 UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
4040 assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
4041 assertEquals("[:ccc=int_overflow:] -> illegal argument",
4042 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4043 }
4044
TestUnusedCcc()4045 void UnicodeSetTest::TestUnusedCcc() {
4046 #if !UCONFIG_NO_NORMALIZATION
4047 // All numeric ccc values 0..255 are valid, but many are unused.
4048 IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
4049 UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
4050 assertSuccess("[:ccc=2:]", errorCode);
4051 assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
4052
4053 UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
4054 assertSuccess("[:ccc=255:]", errorCode);
4055 assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
4056
4057 // Non-integer values and values outside 0..255 are invalid.
4058 UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
4059 assertEquals("[:ccc=-1:] -> illegal argument",
4060 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4061 assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
4062
4063 UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
4064 assertEquals("[:ccc=256:] -> illegal argument",
4065 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4066 assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
4067
4068 UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
4069 assertEquals("[:ccc=1.1:] -> illegal argument",
4070 U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
4071 assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
4072 #endif
4073 }
4074
TestDeepPattern()4075 void UnicodeSetTest::TestDeepPattern() {
4076 IcuTestErrorCode errorCode(*this, "TestDeepPattern");
4077 // Nested ranges are parsed via recursion which can use a lot of stack space.
4078 // After a reasonable limit, we should get an error.
4079 constexpr int32_t DEPTH = 20000;
4080 UnicodeString pattern, suffix;
4081 for (int32_t i = 0; i < DEPTH; ++i) {
4082 pattern.append(u"[a", 2);
4083 suffix.append(']');
4084 }
4085 pattern.append(suffix);
4086 UnicodeSet set(pattern, errorCode);
4087 assertTrue("[a[a[a...1000s...]]] -> error", errorCode.isFailure());
4088 errorCode.reset();
4089 }
4090
TestEmptyString()4091 void UnicodeSetTest::TestEmptyString() {
4092 IcuTestErrorCode errorCode(*this, "TestEmptyString");
4093 // Starting with ICU 69, the empty string is allowed in UnicodeSet. ICU-13702
4094 UnicodeSet set(u"[{}]", errorCode);
4095 if (!assertSuccess("set from pattern with {}", errorCode)) { return; }
4096 assertTrue("set from pattern with {}", set.contains(u""));
4097 assertEquals("set from pattern with {}: size", 1, set.size());
4098 assertFalse("set from pattern with {}: isEmpty", set.isEmpty());
4099
4100 // Remove, add back, ...
4101 assertFalse("remove empty string", set.remove(u"").contains(u""));
4102 assertEquals("remove empty string: size", 0, set.size());
4103 assertTrue("remove empty string: isEmpty", set.isEmpty());
4104 assertTrue("add empty string", set.add(u"").contains(u""));
4105 // missing API -- assertTrue("retain empty string", set.retain(u"").contains(u""));
4106 assertFalse("complement-remove empty string", set.complement(u"").contains(u""));
4107 assertTrue("complement-add empty string", set.complement(u"").contains(u""));
4108
4109 assertFalse("clear", set.clear().contains(u""));
4110 assertTrue("add empty string 2", set.add(u"").contains(u""));
4111 assertFalse("removeAllStrings", set.removeAllStrings().contains(u""));
4112 assertTrue("add empty string 3", set.add(u"").contains(u""));
4113 // Note that this leaves the set containing exactly the empty string.
4114
4115 // strings() access and iteration
4116 // no C++ equivalent for Java strings() -- assertTrue("strings()", set.strings().contains(u""));
4117 UnicodeSetIterator sit(set);
4118 assertTrue("set iterator.next()", sit.next());
4119 assertTrue("set iterator has empty string", sit.isString() && sit.getString().isEmpty());
4120
4121 // The empty string is ignored in matching.
4122 set.add(u'a').add(u'c');
4123 assertEquals("span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4124 assertEquals("spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4125 assertTrue("containsNone", set.containsNone(u"def"));
4126 assertFalse("containsSome", set.containsSome(u"def"));
4127 set.freeze();
4128 assertEquals("frozen span", 1, set.span(u"abc", 3, USET_SPAN_SIMPLE));
4129 assertEquals("frozen spanBack", 2, set.spanBack(u"abc", 3, USET_SPAN_SIMPLE));
4130 assertTrue("frozen containsNone", set.containsNone(u"def"));
4131 assertFalse("frozen containsSome", set.containsSome(u"def"));
4132 }
4133
assertNext(UnicodeSetIterator & iter,const UnicodeString & expected)4134 void UnicodeSetTest::assertNext(UnicodeSetIterator &iter, const UnicodeString &expected) {
4135 assertTrue(expected + ".next()", iter.next());
4136 assertEquals(expected + ".getString()", expected, iter.getString());
4137 }
4138
TestSkipToStrings()4139 void UnicodeSetTest::TestSkipToStrings() {
4140 IcuTestErrorCode errorCode(*this, "TestSkipToStrings");
4141 UnicodeSet set(u"[0189{}{ch}]", errorCode);
4142 UnicodeSetIterator iter(set);
4143 assertNext(iter.skipToStrings(), u"");
4144 assertNext(iter, u"ch");
4145 assertFalse("no next", iter.next());
4146
4147 iter.reset();
4148 assertNext(iter, u"0");
4149 assertNext(iter, u"1");
4150 assertNext(iter, u"8");
4151 assertNext(iter, u"9");
4152 assertNext(iter, u"");
4153 assertNext(iter, u"ch");
4154 assertFalse("no next", iter.next());
4155
4156 iter.reset();
4157 assertNext(iter, u"0");
4158 iter.skipToStrings();
4159 assertNext(iter, u"");
4160 assertNext(iter, u"ch");
4161 assertFalse("no next", iter.next());
4162
4163 iter.reset();
4164 iter.nextRange();
4165 assertNext(iter, u"8");
4166 iter.skipToStrings();
4167 assertNext(iter, u"");
4168 assertNext(iter, u"ch");
4169 assertFalse("no next", iter.next());
4170
4171 iter.reset();
4172 iter.nextRange();
4173 iter.nextRange();
4174 iter.nextRange();
4175 iter.skipToStrings();
4176 assertNext(iter, u"ch");
4177 assertFalse("no next", iter.next());
4178 }
4179
TestPatternCodePointComplement()4180 void UnicodeSetTest::TestPatternCodePointComplement() {
4181 IcuTestErrorCode errorCode(*this, "TestPatternCodePointComplement");
4182 // ICU-21524 changes pattern ^ and equivalent functions to perform a "code point complement".
4183 // [^abc{ch}] = [[:Any:]-[abc{ch}]] which removes all strings.
4184 {
4185 UnicodeSet simple(u"[^abc{ch}]", errorCode);
4186 assertEquals("[^abc{ch}] --> lots of elements", 0x110000 - 3, simple.size());
4187 assertFalse("[^abc{ch}] --> no strings", simple.hasStrings());
4188 assertFalse("[^abc{ch}] --> no 'a'", simple.contains(u'a'));
4189 }
4190
4191 {
4192 UnicodeSet notBasic(u"[:^Basic_Emoji:]", errorCode);
4193 if (errorCode.errDataIfFailureAndReset("[:^Basic_Emoji:]")) {
4194 return;
4195 }
4196 assertTrue("[:^Basic_Emoji:] --> lots of elements", notBasic.size() > 1000);
4197 assertFalse("[:^Basic_Emoji:] --> no strings", notBasic.hasStrings());
4198 assertFalse("[:^Basic_Emoji:] --> no bicycle", notBasic.contains(U''));
4199 }
4200
4201 {
4202 UnicodeSet notBasic(u"[:Basic_Emoji=No:]", errorCode);
4203 assertTrue("[:Basic_Emoji=No:] --> lots of elements", notBasic.size() > 1000);
4204 assertFalse("[:Basic_Emoji=No:] --> no strings", notBasic.hasStrings());
4205 assertFalse("[:Basic_Emoji=No:] --> no bicycle", notBasic.contains(U''));
4206 }
4207
4208 {
4209 UnicodeSet notBasic;
4210 notBasic.applyIntPropertyValue(UCHAR_BASIC_EMOJI, 0, errorCode);
4211 assertTrue("[].applyIntPropertyValue(Basic_Emoji, 0) --> lots of elements",
4212 notBasic.size() > 1000);
4213 assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no strings",
4214 notBasic.hasStrings());
4215 assertFalse("[].applyIntPropertyValue(Basic_Emoji, 0) --> no bicycle",
4216 notBasic.contains(U''));
4217 }
4218
4219 {
4220 UnicodeSet notBasic;
4221 notBasic.applyPropertyAlias("Basic_Emoji", "No", errorCode);
4222 assertTrue("[].applyPropertyAlias(Basic_Emoji, No) --> lots of elements",
4223 notBasic.size() > 1000);
4224 assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no strings",
4225 notBasic.hasStrings());
4226 assertFalse("[].applyPropertyAlias(Basic_Emoji, No) --> no bicycle",
4227 notBasic.contains(U''));
4228 }
4229
4230 // When there are strings, we must not use the complement for a more compact toPattern().
4231 {
4232 UnicodeSet set;
4233 set.add(0, u'Y').add(u'b', u'q').add(u'x', 0x10ffff);
4234 UnicodeString pattern;
4235 set.toPattern(pattern, true);
4236 UnicodeSet set2(pattern, errorCode);
4237 checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
4238 assertEquals("set(with 0 & max, only code points).toPattern()", u"[^Z-ar-w]", pattern);
4239
4240 set.add("ch").add("ss");
4241 set.toPattern(pattern, true);
4242 set2 = UnicodeSet(pattern, errorCode);
4243 checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
4244 assertEquals("set(with 0 & max, with strings).toPattern()",
4245 u"[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
4246 }
4247
4248 // The complement() API behavior does not change under this ticket.
4249 {
4250 UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
4251 notBasic.complement();
4252 assertTrue("[:Basic_Emoji:].complement() --> lots of elements", notBasic.size() > 1000);
4253 assertTrue("[:Basic_Emoji:].complement() --> has strings", notBasic.hasStrings());
4254 assertTrue("[:Basic_Emoji:].complement().contains(chipmunk+emoji)",
4255 notBasic.contains(u"\uFE0F"));
4256 assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains(U''));
4257 }
4258 }
4259