1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9 #include "unicode/uset.h"
10 #include "unicode/ustring.h"
11 #include "cintltst.h"
12 #include "cmemory.h"
13 #include <stdlib.h>
14 #include <string.h>
15
16 #define TEST(x) addTest(root, &x, "uset/" # x)
17
18 static void TestAPI(void);
19 static void Testj2269(void);
20 static void TestSerialized(void);
21 static void TestNonInvariantPattern(void);
22 static void TestBadPattern(void);
23 static void TestFreezable(void);
24 static void TestSpan(void);
25
26 void addUSetTest(TestNode** root);
27
28 static void expect(const USet* set,
29 const char* inList,
30 const char* outList,
31 UErrorCode* ec);
32 static void expectContainment(const USet* set,
33 const char* list,
34 UBool isIn);
35 static char oneUCharToChar(UChar32 c);
36 static void expectItems(const USet* set,
37 const char* items);
38
39 void
addUSetTest(TestNode ** root)40 addUSetTest(TestNode** root) {
41 TEST(TestAPI);
42 TEST(Testj2269);
43 TEST(TestSerialized);
44 TEST(TestNonInvariantPattern);
45 TEST(TestBadPattern);
46 TEST(TestFreezable);
47 TEST(TestSpan);
48 }
49
50 /*------------------------------------------------------------------
51 * Tests
52 *------------------------------------------------------------------*/
53
Testj2269()54 static void Testj2269() {
55 UErrorCode status = U_ZERO_ERROR;
56 UChar a[4] = { 0x61, 0x62, 0x63, 0 };
57 USet *s = uset_open(1, 0);
58 uset_addString(s, a, 3);
59 a[0] = 0x63; a[1] = 0x63;
60 expect(s, "{abc}", "{ccc}", &status);
61 uset_close(s);
62 }
63
64 static const UChar PAT[] = {91,97,45,99,123,97,98,125,93,0}; /* "[a-c{ab}]" */
65 static const int32_t PAT_LEN = UPRV_LENGTHOF(PAT) - 1;
66
67 static const UChar PAT_lb[] = {0x6C, 0x62, 0}; /* "lb" */
68 static const int32_t PAT_lb_LEN = UPRV_LENGTHOF(PAT_lb) - 1;
69
70 static const UChar VAL_SP[] = {0x53, 0x50, 0}; /* "SP" */
71 static const int32_t VAL_SP_LEN = UPRV_LENGTHOF(VAL_SP) - 1;
72
73 static const UChar STR_bc[] = {98,99,0}; /* "bc" */
74 static const int32_t STR_bc_LEN = UPRV_LENGTHOF(STR_bc) - 1;
75
76 static const UChar STR_ab[] = {97,98,0}; /* "ab" */
77 static const int32_t STR_ab_LEN = UPRV_LENGTHOF(STR_ab) - 1;
78
79 /**
80 * Basic API test for uset.x
81 */
TestAPI()82 static void TestAPI() {
83 USet* set;
84 USet* set2;
85 UErrorCode ec;
86
87 /* [] */
88 set = uset_openEmpty();
89 expect(set, "", "abc{ab}", NULL);
90 uset_close(set);
91
92 set = uset_open(1, 0);
93 expect(set, "", "abc{ab}", NULL);
94 uset_close(set);
95
96 set = uset_open(1, 1);
97 uset_clear(set);
98 expect(set, "", "abc{ab}", NULL);
99 uset_close(set);
100
101 /* [ABC] */
102 set = uset_open(0x0041, 0x0043);
103 expect(set, "ABC", "DEF{ab}", NULL);
104 uset_close(set);
105
106 /* [a-c{ab}] */
107 ec = U_ZERO_ERROR;
108 set = uset_openPattern(PAT, PAT_LEN, &ec);
109 if(U_FAILURE(ec)) {
110 log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec));
111 return;
112 }
113 if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
114 log_err("uset_resemblesPattern of PAT failed\n");
115 }
116 expect(set, "abc{ab}", "def{bc}", &ec);
117
118 /* [a-d{ab}] */
119 uset_add(set, 0x64);
120 expect(set, "abcd{ab}", "ef{bc}", NULL);
121
122 /* [acd{ab}{bc}] */
123 uset_remove(set, 0x62);
124 uset_addString(set, STR_bc, STR_bc_LEN);
125 expect(set, "acd{ab}{bc}", "bef{cd}", NULL);
126
127 /* [acd{bc}] */
128 uset_removeString(set, STR_ab, STR_ab_LEN);
129 expect(set, "acd{bc}", "bfg{ab}", NULL);
130
131 /* [^acd{bc}] */
132 uset_complement(set);
133 expect(set, "bef{bc}", "acd{ac}", NULL);
134
135 /* [a-e{bc}] */
136 uset_complement(set);
137 uset_addRange(set, 0x0062, 0x0065);
138 expect(set, "abcde{bc}", "fg{ab}", NULL);
139
140 /* [de{bc}] */
141 uset_removeRange(set, 0x0050, 0x0063);
142 expect(set, "de{bc}", "bcfg{ab}", NULL);
143
144 /* [g-l] */
145 uset_set(set, 0x0067, 0x006C);
146 expect(set, "ghijkl", "de{bc}", NULL);
147
148 if (uset_indexOf(set, 0x0067) != 0) {
149 log_err("uset_indexOf failed finding correct index of 'g'\n");
150 }
151
152 if (uset_charAt(set, 0) != 0x0067) {
153 log_err("uset_charAt failed finding correct char 'g' at index 0\n");
154 }
155
156 /* How to test this one...? */
157 uset_compact(set);
158
159 /* [g-i] */
160 uset_retain(set, 0x0067, 0x0069);
161 expect(set, "ghi", "dejkl{bc}", NULL);
162
163 /* UCHAR_ASCII_HEX_DIGIT */
164 uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec);
165 if(U_FAILURE(ec)) {
166 log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec));
167 return;
168 }
169 expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);
170
171 /* [ab] */
172 uset_clear(set);
173 uset_addAllCodePoints(set, STR_ab, STR_ab_LEN);
174 expect(set, "ab", "def{ab}", NULL);
175 if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){
176 log_err("set should not conatin all characters of \"bc\" \n");
177 }
178
179 /* [] */
180 set2 = uset_open(1, 1);
181 uset_clear(set2);
182
183 /* space */
184 uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec);
185 expect(set2, " ", "abcdefghi{bc}", NULL);
186
187 /* [a-c] */
188 uset_set(set2, 0x0061, 0x0063);
189 /* [g-i] */
190 uset_set(set, 0x0067, 0x0069);
191
192 /* [a-c g-i] */
193 if (uset_containsSome(set, set2)) {
194 log_err("set should not contain some of set2 yet\n");
195 }
196 uset_complementAll(set, set2);
197 if (!uset_containsSome(set, set2)) {
198 log_err("set should contain some of set2\n");
199 }
200 expect(set, "abcghi", "def{bc}", NULL);
201
202 /* [g-i] */
203 uset_removeAll(set, set2);
204 expect(set, "ghi", "abcdef{bc}", NULL);
205
206 /* [a-c g-i] */
207 uset_addAll(set2, set);
208 expect(set2, "abcghi", "def{bc}", NULL);
209
210 /* [g-i] */
211 uset_retainAll(set2, set);
212 expect(set2, "ghi", "abcdef{bc}", NULL);
213
214 uset_close(set);
215 uset_close(set2);
216 }
217
218 /*------------------------------------------------------------------
219 * Support
220 *------------------------------------------------------------------*/
221
222 /**
223 * Verifies that the given set contains the characters and strings in
224 * inList, and does not contain those in outList. Also verifies that
225 * 'set' is not NULL and that 'ec' succeeds.
226 * @param set the set to test, or NULL (on error)
227 * @param inList list of set contents, in iteration order. Format is
228 * list of individual strings, in iteration order, followed by sorted
229 * list of strings, delimited by {}. This means we do not test
230 * characters '{' or '}' and we do not test strings containing those
231 * characters either.
232 * @param outList list of things not in the set. Same format as
233 * inList.
234 * @param ec an error code, checked for success. May be NULL in which
235 * case it is ignored.
236 */
expect(const USet * set,const char * inList,const char * outList,UErrorCode * ec)237 static void expect(const USet* set,
238 const char* inList,
239 const char* outList,
240 UErrorCode* ec) {
241 if (ec!=NULL && U_FAILURE(*ec)) {
242 log_err("FAIL: %s\n", u_errorName(*ec));
243 return;
244 }
245 if (set == NULL) {
246 log_err("FAIL: USet is NULL\n");
247 return;
248 }
249 expectContainment(set, inList, TRUE);
250 expectContainment(set, outList, FALSE);
251 expectItems(set, inList);
252 }
253
expectContainment(const USet * set,const char * list,UBool isIn)254 static void expectContainment(const USet* set,
255 const char* list,
256 UBool isIn) {
257 const char* p = list;
258 UChar ustr[4096];
259 char *pat;
260 UErrorCode ec;
261 int32_t rangeStart = -1, rangeEnd = -1, length;
262
263 ec = U_ZERO_ERROR;
264 length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
265 if(U_FAILURE(ec)) {
266 log_err("FAIL: uset_toPattern() fails in expectContainment() - %s\n", u_errorName(ec));
267 return;
268 }
269 pat=aescstrdup(ustr, length);
270
271 while (*p) {
272 if (*p=='{') {
273 const char* stringStart = ++p;
274 int32_t stringLength = 0;
275 char strCopy[64];
276
277 while (*p++ != '}') {
278 }
279 stringLength = (int32_t)(p - stringStart - 1);
280 strncpy(strCopy, stringStart, stringLength);
281 strCopy[stringLength] = 0;
282
283 u_charsToUChars(stringStart, ustr, stringLength);
284
285 if (uset_containsString(set, ustr, stringLength) == isIn) {
286 log_verbose("Ok: %s %s \"%s\"\n", pat,
287 (isIn ? "contains" : "does not contain"),
288 strCopy);
289 } else {
290 log_data_err("FAIL: %s %s \"%s\" (Are you missing data?)\n", pat,
291 (isIn ? "does not contain" : "contains"),
292 strCopy);
293 }
294 }
295
296 else {
297 UChar32 c;
298
299 u_charsToUChars(p, ustr, 1);
300 c = ustr[0];
301
302 if (uset_contains(set, c) == isIn) {
303 log_verbose("Ok: %s %s '%c'\n", pat,
304 (isIn ? "contains" : "does not contain"),
305 *p);
306 } else {
307 log_data_err("FAIL: %s %s '%c' (Are you missing data?)\n", pat,
308 (isIn ? "does not contain" : "contains"),
309 *p);
310 }
311
312 /* Test the range API too by looking for ranges */
313 if (c == rangeEnd+1) {
314 rangeEnd = c;
315 } else {
316 if (rangeStart >= 0) {
317 if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) {
318 log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat,
319 (isIn ? "contains" : "does not contain"),
320 rangeStart, rangeEnd);
321 } else {
322 log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat,
323 (isIn ? "does not contain" : "contains"),
324 rangeStart, rangeEnd);
325 }
326 }
327 rangeStart = rangeEnd = c;
328 }
329
330 ++p;
331 }
332 }
333
334 if (rangeStart >= 0) {
335 if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) {
336 log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat,
337 (isIn ? "contains" : "does not contain"),
338 rangeStart, rangeEnd);
339 } else {
340 log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat,
341 (isIn ? "does not contain" : "contains"),
342 rangeStart, rangeEnd);
343 }
344 }
345 }
346
347 /* This only works for invariant BMP chars */
oneUCharToChar(UChar32 c)348 static char oneUCharToChar(UChar32 c) {
349 UChar ubuf[1];
350 char buf[1];
351 ubuf[0] = (UChar) c;
352 u_UCharsToChars(ubuf, buf, 1);
353 return buf[0];
354 }
355
expectItems(const USet * set,const char * items)356 static void expectItems(const USet* set,
357 const char* items) {
358 const char* p = items;
359 UChar ustr[4096], itemStr[4096];
360 char buf[4096];
361 char *pat;
362 UErrorCode ec;
363 int32_t expectedSize = 0;
364 int32_t itemCount = uset_getItemCount(set);
365 int32_t itemIndex = 0;
366 UChar32 start = 1, end = 0;
367 int32_t itemLen = 0, length;
368
369 ec = U_ZERO_ERROR;
370 length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
371 if (U_FAILURE(ec)) {
372 log_err("FAIL: uset_toPattern => %s\n", u_errorName(ec));
373 return;
374 }
375 pat=aescstrdup(ustr, length);
376
377 if (uset_isEmpty(set) != (strlen(items)==0)) {
378 log_data_err("FAIL: %s should return %s from isEmpty (Are you missing data?)\n",
379 pat,
380 strlen(items)==0 ? "TRUE" : "FALSE");
381 }
382
383 /* Don't test patterns starting with "[^" */
384 if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) {
385 return;
386 }
387
388 while (*p) {
389
390 ++expectedSize;
391
392 if (start > end || start == -1) {
393 /* Fetch our next item */
394 if (itemIndex >= itemCount) {
395 log_data_err("FAIL: ran out of items iterating %s (Are you missing data?)\n", pat);
396 return;
397 }
398
399 itemLen = uset_getItem(set, itemIndex, &start, &end,
400 itemStr, sizeof(itemStr), &ec);
401 if (U_FAILURE(ec) || itemLen < 0) {
402 log_err("FAIL: uset_getItem => %s\n", u_errorName(ec));
403 return;
404 }
405
406 if (itemLen == 0) {
407 log_verbose("Ok: %s item %d is %c-%c\n", pat,
408 itemIndex, oneUCharToChar(start),
409 oneUCharToChar(end));
410 } else {
411 itemStr[itemLen] = 0;
412 u_UCharsToChars(itemStr, buf, itemLen+1);
413 log_verbose("Ok: %s item %d is \"%s\"\n", pat, itemIndex, buf);
414 }
415
416 ++itemIndex;
417 }
418
419 if (*p=='{') {
420 const char* stringStart = ++p;
421 int32_t stringLength = 0;
422 char strCopy[64];
423
424 while (*p++ != '}') {
425 }
426 stringLength = (int32_t)(p - stringStart - 1);
427 strncpy(strCopy, stringStart, stringLength);
428 strCopy[stringLength] = 0;
429
430 u_charsToUChars(stringStart, ustr, stringLength);
431 ustr[stringLength] = 0;
432
433 if (itemLen == 0) {
434 log_err("FAIL: for %s expect \"%s\" next, but got a char\n",
435 pat, strCopy);
436 return;
437 }
438
439 if (u_strcmp(ustr, itemStr) != 0) {
440 log_err("FAIL: for %s expect \"%s\" next\n",
441 pat, strCopy);
442 return;
443 }
444 }
445
446 else {
447 UChar32 c;
448
449 u_charsToUChars(p, ustr, 1);
450 c = ustr[0];
451
452 if (itemLen != 0) {
453 log_err("FAIL: for %s expect '%c' next, but got a string\n",
454 pat, *p);
455 return;
456 }
457
458 if (c != start++) {
459 log_err("FAIL: for %s expect '%c' next\n",
460 pat, *p);
461 return;
462 }
463
464 ++p;
465 }
466 }
467
468 if (uset_size(set) == expectedSize) {
469 log_verbose("Ok: %s size is %d\n", pat, expectedSize);
470 } else {
471 log_err("FAIL: %s size is %d, expected %d\n",
472 pat, uset_size(set), expectedSize);
473 }
474 }
475
476 static void
TestSerialized()477 TestSerialized() {
478 uint16_t buffer[1000];
479 USerializedSet sset;
480 USet *set;
481 UErrorCode errorCode;
482 UChar32 c;
483 int32_t length;
484
485 /* use a pattern that generates both BMP and supplementary code points */
486 U_STRING_DECL(pattern, "[:Cf:]", 6);
487 U_STRING_INIT(pattern, "[:Cf:]", 6);
488
489 errorCode=U_ZERO_ERROR;
490 set=uset_openPattern(pattern, -1, &errorCode);
491 if(U_FAILURE(errorCode)) {
492 log_data_err("uset_openPattern([:Cf:]) failed - %s (Are you missing data?)\n", u_errorName(errorCode));
493 return;
494 }
495
496 length=uset_serialize(set, buffer, UPRV_LENGTHOF(buffer), &errorCode);
497 if(U_FAILURE(errorCode)) {
498 log_err("unable to uset_serialize([:Cf:]) - %s\n", u_errorName(errorCode));
499 uset_close(set);
500 return;
501 }
502
503 uset_getSerializedSet(&sset, buffer, length);
504 for(c=0; c<=0x10ffff; ++c) {
505 if(uset_contains(set, c)!=uset_serializedContains(&sset, c)) {
506 log_err("uset_contains(U+%04x)!=uset_serializedContains(U+%04x)\n", c);
507 break;
508 }
509 }
510
511 uset_close(set);
512 }
513
514 /**
515 * Make sure that when non-invariant chars are passed to uset_openPattern
516 * they do not cause an ugly failure mode (e.g. assertion failure).
517 * JB#3795.
518 */
519 static void
TestNonInvariantPattern()520 TestNonInvariantPattern() {
521 UErrorCode ec = U_ZERO_ERROR;
522 /* The critical part of this test is that the following pattern
523 must contain a non-invariant character. */
524 static const char *pattern = "[:ccc!=0:]";
525 UChar buf[256];
526 int32_t len = u_unescape(pattern, buf, 256);
527 /* This test 'fails' by having an assertion failure within the
528 following call. It passes by running to completion with no
529 assertion failure. */
530 USet *set = uset_openPattern(buf, len, &ec);
531 uset_close(set);
532 }
533
TestBadPattern(void)534 static void TestBadPattern(void) {
535 UErrorCode status = U_ZERO_ERROR;
536 USet *pat;
537 U_STRING_DECL(pattern, "[", 1);
538 U_STRING_INIT(pattern, "[", 1);
539 pat = uset_openPatternOptions(pattern, u_strlen(pattern), 0, &status);
540 if (pat != NULL || U_SUCCESS(status)) {
541 log_err("uset_openPatternOptions did not fail as expected %s\n", u_errorName(status));
542 }
543 }
544
openIDSet()545 static USet *openIDSet() {
546 UErrorCode errorCode = U_ZERO_ERROR;
547 U_STRING_DECL(pattern, "[:ID_Continue:]", 15);
548 U_STRING_INIT(pattern, "[:ID_Continue:]", 15);
549 return uset_openPattern(pattern, 15, &errorCode);
550 }
551
TestFreezable()552 static void TestFreezable() {
553 USet *idSet;
554 USet *frozen;
555 USet *thawed;
556
557 idSet=openIDSet();
558
559 if (idSet == NULL) {
560 log_data_err("openIDSet() returned NULL. (Are you missing data?)\n");
561 uset_close(idSet);
562 return;
563 }
564
565 frozen=uset_clone(idSet);
566
567 if (frozen == NULL) {
568 log_err("uset_Clone() returned NULL\n");
569 return;
570 }
571
572 if(!uset_equals(frozen, idSet)) {
573 log_err("uset_clone() did not make an equal copy\n");
574 }
575
576 uset_freeze(frozen);
577 uset_addRange(frozen, 0xd802, 0xd805);
578
579 if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) {
580 log_err("uset_freeze() or uset_isFrozen() does not work\n");
581 }
582
583 thawed=uset_cloneAsThawed(frozen);
584
585 if (thawed == NULL) {
586 log_err("uset_cloneAsThawed(frozen) returned NULL");
587 uset_close(frozen);
588 uset_close(idSet);
589 return;
590 }
591
592 uset_addRange(thawed, 0xd802, 0xd805);
593
594 if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) {
595 log_err("uset_cloneAsThawed() does not work\n");
596 }
597
598 uset_close(idSet);
599 uset_close(frozen);
600 uset_close(thawed);
601 }
602
TestSpan()603 static void TestSpan() {
604 static const UChar s16[2]={ 0xe01, 0x3000 };
605 static const char* s8="\xE0\xB8\x81\xE3\x80\x80";
606
607 USet *idSet=openIDSet();
608
609 if (idSet == NULL) {
610 log_data_err("openIDSet() returned NULL (Are you missing data?)\n");
611 return;
612 }
613
614 if(
615 1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
616 0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
617 2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
618 1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
619 ) {
620 log_err("uset_span() or uset_spanBack() does not work\n");
621 }
622
623 if(
624 3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
625 0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
626 6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
627 3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
628 ) {
629 log_err("uset_spanUTF8() or uset_spanBackUTF8() does not work\n");
630 }
631
632 uset_freeze(idSet);
633
634 if(
635 1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
636 0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
637 2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
638 1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
639 ) {
640 log_err("uset_span(frozen) or uset_spanBack(frozen) does not work\n");
641 }
642
643 if(
644 3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
645 0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
646 6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
647 3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
648 ) {
649 log_err("uset_spanUTF8(frozen) or uset_spanBackUTF8(frozen) does not work\n");
650 }
651
652 uset_close(idSet);
653 }
654
655 /*eof*/
656