1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 1997-2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
7
8 #include "unicode/ustring.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/uniset.h"
12 #include "unicode/putil.h"
13 #include "unicode/uscript.h"
14 #include "unicode/uset.h"
15 #include "charstr.h"
16 #include "cstring.h"
17 #include "hash.h"
18 #include "patternprops.h"
19 #include "ppucd.h"
20 #include "normalizer2impl.h"
21 #include "testutil.h"
22 #include "uparse.h"
23 #include "ucdtest.h"
24
// Property names that are deliberately not checked by this test.
// The UnicodeTest constructor pre-registers each of them in
// unknownPropertyNames, so derivedPropsLineFn() finds them there and
// does not report them as unknown.
static const char *ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC",
    "NFC_QC",
    "NFKD_QC",
    "NFKC_QC",
    "Expands_On_NFD",
    "Expands_On_NFC",
    "Expands_On_NFKD",
    "Expands_On_NFKC",
    "InCB",
    "NFKC_CF",
    "NFKC_SCF"
};
39
UnicodeTest()40 UnicodeTest::UnicodeTest()
41 {
42 UErrorCode errorCode=U_ZERO_ERROR;
43 unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
44 if(U_FAILURE(errorCode)) {
45 delete unknownPropertyNames;
46 unknownPropertyNames=nullptr;
47 }
48 // Ignore some property names altogether.
49 for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) {
50 unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
51 }
52 }
53
/**
 * Releases the table of property names allocated by the constructor.
 * The pointer may be nullptr if the constructor failed to create the table.
 */
UnicodeTest::~UnicodeTest()
{
    delete unknownPropertyNames;
}
58
/**
 * Test dispatcher for this suite: lists every test method of UnicodeTest.
 *
 * @param index selects a test case
 * @param exec  when true, the suite banner is logged first
 * @param name  in/out test name, handled by the TESTCASE_AUTO macros
 *
 * NOTE(review): the TESTCASE_AUTO* macros are defined outside this file;
 * presumably each entry gets the next index in the order listed here, so
 * reordering entries would renumber the tests -- confirm against the macro
 * definitions.
 */
void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if(exec) {
        logln("TestSuite UnicodeTest: ");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestAdditionalProperties);
    TESTCASE_AUTO(TestBinaryValues);
    TESTCASE_AUTO(TestConsistency);
    TESTCASE_AUTO(TestPatternProperties);
    TESTCASE_AUTO(TestScriptMetadata);
    TESTCASE_AUTO(TestBidiPairedBracketType);
    TESTCASE_AUTO(TestEmojiProperties);
    TESTCASE_AUTO(TestEmojiPropertiesOfStrings);
    TESTCASE_AUTO(TestIndicPositionalCategory);
    TESTCASE_AUTO(TestIndicSyllabicCategory);
    TESTCASE_AUTO(TestVerticalOrientation);
    TESTCASE_AUTO(TestDefaultScriptExtensions);
    TESTCASE_AUTO(TestInvalidCodePointFolding);
    // These two tests are only listed when normalization is compiled in.
#if !UCONFIG_NO_NORMALIZATION
    TESTCASE_AUTO(TestBinaryCharacterProperties);
    TESTCASE_AUTO(TestIntCharacterProperties);
#endif
    TESTCASE_AUTO(TestPropertyNames);
    TESTCASE_AUTO(TestIDSUnaryOperator);
    TESTCASE_AUTO(TestIDCompatMath);
    TESTCASE_AUTO(TestBinaryPropertyUsingPpucd);
    TESTCASE_AUTO(TestIDStatus);
    TESTCASE_AUTO(TestIDType);
    TESTCASE_AUTO_END;
}
90
91 //====================================================
92 // private data used by the tests
93 //====================================================
94
95 // test DerivedCoreProperties.txt -------------------------------------------
96
97 // copied from genprops.c
98 static int32_t
getTokenIndex(const char * const tokens[],int32_t countTokens,const char * s)99 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
100 const char *t, *z;
101 int32_t i, j;
102
103 s=u_skipWhitespace(s);
104 for(i=0; i<countTokens; ++i) {
105 t=tokens[i];
106 if(t!=nullptr) {
107 for(j=0;; ++j) {
108 if(t[j]!=0) {
109 if(s[j]!=t[j]) {
110 break;
111 }
112 } else {
113 z=u_skipWhitespace(s+j);
114 if(*z==';' || *z==0) {
115 return i;
116 } else {
117 break;
118 }
119 }
120 }
121 }
122 }
123 return -1;
124 }
125
// Names of the derived binary properties that TestAdditionalProperties() checks.
// derivedPropsLineFn() matches the property-name field of each parsed line
// against this list. The list is parallel to derivedPropsIndex[]: entry i here
// names the property whose UProperty constant is derivedPropsIndex[i].
static const char *const
derivedPropsNames[]={
    "Math",
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "ID_Start",
    "ID_Continue",
    "XID_Start",
    "XID_Continue",
    "Default_Ignorable_Code_Point",
    "Full_Composition_Exclusion",
    "Grapheme_Extend",
    "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
    "Grapheme_Base",
    "Cased",
    "Case_Ignorable",
    "Changes_When_Lowercased",
    "Changes_When_Uppercased",
    "Changes_When_Titlecased",
    "Changes_When_Casefolded",
    "Changes_When_Casemapped",
    "Changes_When_NFKC_Casefolded"
};
150
// UProperty constants passed to u_hasBinaryProperty() by
// TestAdditionalProperties(). Must stay in the same order and have the same
// length as derivedPropsNames[]; the test verifies the lengths at runtime.
static const UProperty
derivedPropsIndex[]={
    UCHAR_MATH,
    UCHAR_ALPHABETIC,
    UCHAR_LOWERCASE,
    UCHAR_UPPERCASE,
    UCHAR_ID_START,
    UCHAR_ID_CONTINUE,
    UCHAR_XID_START,
    UCHAR_XID_CONTINUE,
    UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
    UCHAR_FULL_COMPOSITION_EXCLUSION,
    UCHAR_GRAPHEME_EXTEND,
    UCHAR_GRAPHEME_LINK,
    UCHAR_GRAPHEME_BASE,
    UCHAR_CASED,
    UCHAR_CASE_IGNORABLE,
    UCHAR_CHANGES_WHEN_LOWERCASED,
    UCHAR_CHANGES_WHEN_UPPERCASED,
    UCHAR_CHANGES_WHEN_TITLECASED,
    UCHAR_CHANGES_WHEN_CASEFOLDED,
    UCHAR_CHANGES_WHEN_CASEMAPPED,
    UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
};
175
// Number of mismatches reported so far for each derived property, indexed
// like derivedPropsIndex[]. File-static, so the counts persist for the
// lifetime of the process.
static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 };

// Once a property has this many reported mismatches,
// TestAdditionalProperties() stops checking it.
enum { MAX_ERRORS=50 };
179
180 U_CFUNC void U_CALLCONV
derivedPropsLineFn(void * context,char * fields[][2],int32_t,UErrorCode * pErrorCode)181 derivedPropsLineFn(void *context,
182 char *fields[][2], int32_t /* fieldCount */,
183 UErrorCode *pErrorCode)
184 {
185 UnicodeTest *me=static_cast<UnicodeTest*>(context);
186 uint32_t start, end;
187 int32_t i;
188
189 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
190 if(U_FAILURE(*pErrorCode)) {
191 me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
192 return;
193 }
194
195 /* parse derived binary property name, ignore unknown names */
196 i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]);
197 if(i<0) {
198 UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
199 propName.trim();
200 if(me->unknownPropertyNames->find(propName)==nullptr) {
201 UErrorCode errorCode=U_ZERO_ERROR;
202 me->unknownPropertyNames->puti(propName, 1, errorCode);
203 me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
204 }
205 return;
206 }
207
208 me->derivedProps[i].add(start, end);
209 }
210
TestAdditionalProperties()211 void UnicodeTest::TestAdditionalProperties() {
212 #if !UCONFIG_NO_NORMALIZATION
213 // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
214 if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) {
215 errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
216 UPRV_LENGTHOF(derivedPropsNames));
217 return;
218 }
219 if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) {
220 errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n");
221 return;
222 }
223
224 char path[500];
225 if(getUnidataPath(path) == nullptr) {
226 errln("unable to find path to source/data/unidata/");
227 return;
228 }
229 char *basename=strchr(path, 0);
230 strcpy(basename, "DerivedCoreProperties.txt");
231
232 char *fields[2][2];
233 UErrorCode errorCode=U_ZERO_ERROR;
234 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
235 if(U_FAILURE(errorCode)) {
236 errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
237 return;
238 }
239
240 strcpy(basename, "DerivedNormalizationProps.txt");
241 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
242 if(U_FAILURE(errorCode)) {
243 errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
244 return;
245 }
246
247 // now we have all derived core properties in the UnicodeSets
248 // run them all through the API
249 int32_t rangeCount, range;
250 uint32_t i;
251 UChar32 start, end;
252
253 // test all true properties
254 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
255 rangeCount=derivedProps[i].getRangeCount();
256 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
257 start=derivedProps[i].getRangeStart(range);
258 end=derivedProps[i].getRangeEnd(range);
259 for(; start<=end; ++start) {
260 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
261 dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==false is wrong", start, derivedPropsNames[i]);
262 if(++numErrors[i]>=MAX_ERRORS) {
263 dataerrln("Too many errors, moving to the next test");
264 break;
265 }
266 }
267 }
268 }
269 }
270
271 // invert all properties
272 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
273 derivedProps[i].complement();
274 }
275
276 // test all false properties
277 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
278 rangeCount=derivedProps[i].getRangeCount();
279 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
280 start=derivedProps[i].getRangeStart(range);
281 end=derivedProps[i].getRangeEnd(range);
282 for(; start<=end; ++start) {
283 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
284 errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==true is wrong\n", start, derivedPropsNames[i]);
285 if(++numErrors[i]>=MAX_ERRORS) {
286 errln("Too many errors, moving to the next test");
287 break;
288 }
289 }
290 }
291 }
292 }
293 #endif /* !UCONFIG_NO_NORMALIZATION */
294 }
295
TestBinaryValues()296 void UnicodeTest::TestBinaryValues() {
297 /*
298 * Unicode 5.1 explicitly defines binary property value aliases.
299 * Verify that they are all recognized.
300 */
301 UErrorCode errorCode=U_ZERO_ERROR;
302 UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
303 if(U_FAILURE(errorCode)) {
304 dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
305 return;
306 }
307
308 static const char *const falseValues[]={ "N", "No", "F", "False" };
309 static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
310 int32_t i;
311 for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) {
312 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
313 pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
314 errorCode=U_ZERO_ERROR;
315 UnicodeSet set(pattern, errorCode);
316 if(U_FAILURE(errorCode)) {
317 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
318 continue;
319 }
320 set.complement();
321 if(set!=alpha) {
322 errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
323 }
324 }
325 for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) {
326 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
327 pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
328 errorCode=U_ZERO_ERROR;
329 UnicodeSet set(pattern, errorCode);
330 if(U_FAILURE(errorCode)) {
331 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
332 continue;
333 }
334 if(set!=alpha) {
335 errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
336 }
337 }
338 }
339
TestConsistency()340 void UnicodeTest::TestConsistency() {
341 #if !UCONFIG_NO_NORMALIZATION
342 /*
343 * Test for an example that getCanonStartSet() delivers
344 * all characters that compose from the input one,
345 * even in multiple steps.
346 * For example, the set for "I" (0049) should contain both
347 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
348 * In general, the set for the middle such character should be a subset
349 * of the set for the first.
350 */
351 IcuTestErrorCode errorCode(*this, "TestConsistency");
352 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
353 const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
354 if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) {
355 dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
356 errorCode.errorName());
357 errorCode.reset();
358 return;
359 }
360
361 UnicodeSet set1, set2;
362 if (nfcImpl->getCanonStartSet(0x49, set1)) {
363 /* enumerate all characters that are plausible to be latin letters */
364 for(char16_t start=0xa0; start<0x2000; ++start) {
365 UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
366 if(decomp.length()>1 && decomp[0]==0x49) {
367 set2.add(start);
368 }
369 }
370
371 if (set1!=set2) {
372 errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
373 }
374 // This was available in cucdtst.c but the test had to move to intltest
375 // because the new internal normalization functions are in C++.
376 //compareUSets(set1, set2,
377 // "[canon start set of 0049]", "[all c with canon decomp with 0049]",
378 // true);
379 } else {
380 errln("NFC.getCanonStartSet() returned false");
381 }
382 #endif
383 }
384
385 /**
386 * Test various implementations of Pattern_Syntax & Pattern_White_Space.
387 */
TestPatternProperties()388 void UnicodeTest::TestPatternProperties() {
389 IcuTestErrorCode errorCode(*this, "TestPatternProperties()");
390 UnicodeSet syn_pp;
391 UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode);
392 UnicodeSet syn_list(
393 "[!-/\\:-@\\[-\\^`\\{-~"
394 "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7"
395 "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775"
396 "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode);
397 UnicodeSet ws_pp;
398 UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode);
399 UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode);
400 UnicodeSet syn_ws_pp;
401 UnicodeSet syn_ws_prop(syn_prop);
402 syn_ws_prop.addAll(ws_prop);
403 for(UChar32 c=0; c<=0xffff; ++c) {
404 if(PatternProps::isSyntax(c)) {
405 syn_pp.add(c);
406 }
407 if(PatternProps::isWhiteSpace(c)) {
408 ws_pp.add(c);
409 }
410 if(PatternProps::isSyntaxOrWhiteSpace(c)) {
411 syn_ws_pp.add(c);
412 }
413 }
414 compareUSets(syn_pp, syn_prop,
415 "PatternProps.isSyntax()", "[:Pattern_Syntax:]", true);
416 compareUSets(syn_pp, syn_list,
417 "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", true);
418 compareUSets(ws_pp, ws_prop,
419 "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", true);
420 compareUSets(ws_pp, ws_list,
421 "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", true);
422 compareUSets(syn_ws_pp, syn_ws_prop,
423 "PatternProps.isSyntaxOrWhiteSpace()",
424 "[[:Pattern_Syntax:][:Pattern_White_Space:]]", true);
425 }
426
427 // So far only minimal port of Java & cucdtst.c compareUSets().
428 UBool
compareUSets(const UnicodeSet & a,const UnicodeSet & b,const char * a_name,const char * b_name,UBool diffIsError)429 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b,
430 const char *a_name, const char *b_name,
431 UBool diffIsError) {
432 UBool same= a==b;
433 if(!same && diffIsError) {
434 errln("Sets are different: %s vs. %s\n", a_name, b_name);
435 }
436 return same;
437 }
438
439 namespace {
440
441 /**
442 * Maps a special script code to the most common script of its encoded characters.
443 */
getCharScript(UScriptCode script)444 UScriptCode getCharScript(UScriptCode script) {
445 switch(script) {
446 case USCRIPT_HAN_WITH_BOPOMOFO:
447 case USCRIPT_SIMPLIFIED_HAN:
448 case USCRIPT_TRADITIONAL_HAN:
449 return USCRIPT_HAN;
450 case USCRIPT_JAPANESE:
451 return USCRIPT_HIRAGANA;
452 case USCRIPT_JAMO:
453 case USCRIPT_KOREAN:
454 return USCRIPT_HANGUL;
455 case USCRIPT_SYMBOLS_EMOJI:
456 return USCRIPT_SYMBOLS;
457 default:
458 return script;
459 }
460 }
461
462 } // namespace
463
TestScriptMetadata()464 void UnicodeTest::TestScriptMetadata() {
465 IcuTestErrorCode errorCode(*this, "TestScriptMetadata()");
466 UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode);
467 // So far, sample characters are uppercase.
468 // Georgian is special.
469 UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode);
470 for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) {
471 UScriptCode sc = (UScriptCode)sci;
472 // Run the test with -v to see which script has failures:
473 // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 6 FAIL
474 logln(uscript_getShortName(sc));
475 UScriptUsage usage = uscript_getUsage(sc);
476 UnicodeString sample = uscript_getSampleUnicodeString(sc);
477 UnicodeSet scriptSet;
478 scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode);
479 if(usage == USCRIPT_USAGE_NOT_ENCODED) {
480 assertTrue("not encoded, no sample", sample.isEmpty());
481 assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc));
482 assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc));
483 assertFalse("not encoded, not cased", uscript_isCased(sc));
484 assertTrue("not encoded, no characters", scriptSet.isEmpty());
485 } else {
486 assertFalse("encoded, has a sample character", sample.isEmpty());
487 UChar32 firstChar = sample.char32At(0);
488 UScriptCode charScript = getCharScript(sc);
489 assertEquals("script(sample(script))",
490 (int32_t)charScript, (int32_t)uscript_getScript(firstChar, errorCode));
491 assertEquals("RTL vs. set", (UBool)rtl.contains(firstChar), (UBool)uscript_isRightToLeft(sc));
492 assertEquals("cased vs. set", (UBool)cased.contains(firstChar), (UBool)uscript_isCased(sc));
493 assertEquals("encoded, has characters", (UBool)(sc == charScript), (UBool)(!scriptSet.isEmpty()));
494 if(uscript_isRightToLeft(sc)) {
495 rtl.removeAll(scriptSet);
496 }
497 if(uscript_isCased(sc)) {
498 cased.removeAll(scriptSet);
499 }
500 }
501 }
502 UnicodeString pattern;
503 assertEquals("no remaining RTL characters",
504 UnicodeString("[]"), rtl.toPattern(pattern));
505 assertEquals("no remaining cased characters",
506 UnicodeString("[]"), cased.toPattern(pattern));
507
508 assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN));
509 assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI));
510 assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN));
511 }
512
TestBidiPairedBracketType()513 void UnicodeTest::TestBidiPairedBracketType() {
514 // BidiBrackets-6.3.0.txt says:
515 //
516 // The set of code points listed in this file was originally derived
517 // using the character properties General_Category (gc), Bidi_Class (bc),
518 // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows:
519 // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe,
520 // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket
521 // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type
522 // property values are Open and Close, respectively.
523 IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()");
524 UnicodeSet bpt("[:^bpt=n:]", errorCode);
525 assertTrue("bpt!=None is not empty", !bpt.isEmpty());
526 // The following should always be true.
527 UnicodeSet mirrored("[:Bidi_M:]", errorCode);
528 UnicodeSet other_neutral("[:bc=ON:]", errorCode);
529 assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt));
530 assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt));
531 // The following are true at least initially in Unicode 6.3.
532 UnicodeSet bpt_open("[:bpt=o:]", errorCode);
533 UnicodeSet bpt_close("[:bpt=c:]", errorCode);
534 UnicodeSet ps("[:Ps:]", errorCode);
535 UnicodeSet pe("[:Pe:]", errorCode);
536 assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open));
537 assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close));
538 }
539
TestEmojiProperties()540 void UnicodeTest::TestEmojiProperties() {
541 assertFalse("space is not Emoji", u_hasBinaryProperty(0x20, UCHAR_EMOJI));
542 assertTrue("shooting star is Emoji", u_hasBinaryProperty(0x1F320, UCHAR_EMOJI));
543 IcuTestErrorCode errorCode(*this, "TestEmojiProperties()");
544 UnicodeSet emoji("[:Emoji:]", errorCode);
545 assertTrue("lots of Emoji", emoji.size() > 700);
546
547 assertTrue("shooting star is Emoji_Presentation",
548 u_hasBinaryProperty(0x1F320, UCHAR_EMOJI_PRESENTATION));
549 assertTrue("Fitzpatrick 6 is Emoji_Modifier",
550 u_hasBinaryProperty(0x1F3FF, UCHAR_EMOJI_MODIFIER));
551 assertTrue("happy person is Emoji_Modifier_Base",
552 u_hasBinaryProperty(0x1F64B, UCHAR_EMOJI_MODIFIER_BASE));
553 assertTrue("asterisk is Emoji_Component",
554 u_hasBinaryProperty(0x2A, UCHAR_EMOJI_COMPONENT));
555 assertTrue("copyright is Extended_Pictographic",
556 u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC));
557 }
558
559 namespace {
560
hbp(const char16_t * s,int32_t length,UProperty which)561 UBool hbp(const char16_t *s, int32_t length, UProperty which) {
562 return u_stringHasBinaryProperty(s, length, which);
563 }
564
hbp(const char16_t * s,UProperty which)565 UBool hbp(const char16_t *s, UProperty which) {
566 return u_stringHasBinaryProperty(s, -1, which);
567 }
568
569 } // namespace
570
TestEmojiPropertiesOfStrings()571 void UnicodeTest::TestEmojiPropertiesOfStrings() {
572 // Property of code points, for coverage
573 assertFalse("null is not Ideographic", hbp(nullptr, 1, UCHAR_IDEOGRAPHIC));
574 assertFalse("null/0 is not Ideographic", hbp(nullptr, -1, UCHAR_IDEOGRAPHIC));
575 assertFalse("empty string is not Ideographic", hbp(u"", 0, UCHAR_IDEOGRAPHIC));
576 assertFalse("empty string/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
577 assertFalse("L is not Ideographic", hbp(u"L", 1, UCHAR_IDEOGRAPHIC));
578 assertFalse("L/0 is not Ideographic", hbp(u"L", -1, UCHAR_IDEOGRAPHIC));
579 assertTrue("U+4E02 is Ideographic", hbp(u"丂", 1, UCHAR_IDEOGRAPHIC));
580 assertTrue("U+4E02/0 is Ideographic", hbp(u"丂", -1, UCHAR_IDEOGRAPHIC));
581 assertFalse("2*U+4E02 is not Ideographic", hbp(u"丂丂", 2, UCHAR_IDEOGRAPHIC));
582 assertFalse("2*U+4E02/0 is not Ideographic", hbp(u"丂丂", -1, UCHAR_IDEOGRAPHIC));
583 assertFalse("bicycle is not Ideographic", hbp(u"", 2, UCHAR_IDEOGRAPHIC));
584 assertFalse("bicycle/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
585 assertTrue("U+23456 is Ideographic", hbp(u"\U00023456", 2, UCHAR_IDEOGRAPHIC));
586 assertTrue("U+23456/0 is Ideographic", hbp(u"\U00023456", -1, UCHAR_IDEOGRAPHIC));
587
588 // Property of (code points and) strings
589 assertFalse("null is not Basic_Emoji", hbp(nullptr, 1, UCHAR_BASIC_EMOJI));
590 assertFalse("null/0 is not Basic_Emoji", hbp(nullptr, -1, UCHAR_BASIC_EMOJI));
591 assertFalse("empty string is not Basic_Emoji", hbp(u"", 0, UCHAR_BASIC_EMOJI));
592 assertFalse("empty string/0 is not Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
593 assertFalse("L is not Basic_Emoji", hbp(u"L", 1, UCHAR_BASIC_EMOJI));
594 assertFalse("L/0 is not Basic_Emoji", hbp(u"L", -1, UCHAR_BASIC_EMOJI));
595 assertFalse("U+4E02 is not Basic_Emoji", hbp(u"丂", 1, UCHAR_BASIC_EMOJI));
596 assertFalse("U+4E02/0 is not Basic_Emoji", hbp(u"丂", -1, UCHAR_BASIC_EMOJI));
597 assertTrue("bicycle is Basic_Emoji", hbp(u"", 2, UCHAR_BASIC_EMOJI));
598 assertTrue("bicycle/0 is Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
599 assertFalse("2*bicycle is Basic_Emoji", hbp(u"", 4, UCHAR_BASIC_EMOJI));
600 assertFalse("2*bicycle/0 is Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
601 assertFalse("U+23456 is not Basic_Emoji", hbp(u"\U00023456", 2, UCHAR_BASIC_EMOJI));
602 assertFalse("U+23456/0 is not Basic_Emoji", hbp(u"\U00023456", -1, UCHAR_BASIC_EMOJI));
603
604 assertFalse("stopwatch is not Basic_Emoji", hbp(u"⏱", 1, UCHAR_BASIC_EMOJI));
605 assertFalse("stopwatch/0 is not Basic_Emoji", hbp(u"⏱", -1, UCHAR_BASIC_EMOJI));
606 assertTrue("stopwatch+emoji is Basic_Emoji", hbp(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI));
607 assertTrue("stopwatch+emoji/0 is Basic_Emoji", hbp(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI));
608
609 assertFalse("chipmunk is not Basic_Emoji", hbp(u"", UCHAR_BASIC_EMOJI));
610 assertTrue("chipmunk+emoji is Basic_Emoji", hbp(u"\uFE0F", UCHAR_BASIC_EMOJI));
611 assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp(u"\uFE0F\uFE0F", UCHAR_BASIC_EMOJI));
612
613 // Properties of strings (only)
614 assertFalse("4+emoji is not Emoji_Keycap_Sequence",
615 hbp(u"4\uFE0F", UCHAR_EMOJI_KEYCAP_SEQUENCE));
616 assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence",
617 hbp(u"4\uFE0F\u20E3", UCHAR_EMOJI_KEYCAP_SEQUENCE));
618
619 assertFalse("[B] is not RGI_Emoji_Flag_Sequence",
620 hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
621 assertTrue("[BE] is RGI_Emoji_Flag_Sequence",
622 hbp(u"", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
623
624 assertFalse("[flag] is not RGI_Emoji_Tag_Sequence",
625 hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
626 assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence",
627 hbp(u"", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
628
629 assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence",
630 hbp(u"", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
631 assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence",
632 hbp(u"\U0001F3FD", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
633
634 assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence",
635 hbp(u"\U0001F3FF\u200D", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
636 assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence",
637 hbp(u"\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
638
639 // RGI_Emoji = all of the above
640 assertFalse("stopwatch is not RGI_Emoji", hbp(u"⏱", UCHAR_RGI_EMOJI));
641 assertTrue("stopwatch+emoji is RGI_Emoji", hbp(u"⏱\uFE0F", UCHAR_RGI_EMOJI));
642
643 assertFalse("chipmunk is not RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
644 assertTrue("chipmunk+emoji is RGI_Emoji", hbp(u"\uFE0F", UCHAR_RGI_EMOJI));
645
646 assertFalse("4+emoji is not RGI_Emoji", hbp(u"4\uFE0F", UCHAR_RGI_EMOJI));
647 assertTrue("4+emoji+keycap is RGI_Emoji", hbp(u"4\uFE0F\u20E3", UCHAR_RGI_EMOJI));
648
649 assertFalse("[B] is not RGI_Emoji", hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI));
650 assertTrue("[BE] is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
651
652 assertTrue("[flag] is RGI_Emoji", hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI));
653 assertTrue("[Scotland] is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
654
655 assertTrue("bicyclist is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
656 assertTrue("bicyclist+medium is RGI_Emoji", hbp(u"\U0001F3FD", UCHAR_RGI_EMOJI));
657
658 assertFalse("woman+dark+ZWJ is not RGI_Emoji", hbp(u"\U0001F3FF\u200D", UCHAR_RGI_EMOJI));
659 assertTrue("woman pilot: dark skin tone is RGI_Emoji",
660 hbp(u"\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI));
661
662 // UnicodeSet with properties of strings
663 IcuTestErrorCode errorCode(*this, "TestEmojiPropertiesOfStrings()");
664 UnicodeSet basic("[:Basic_Emoji:]", errorCode);
665 UnicodeSet keycaps("[:Emoji_Keycap_Sequence:]", errorCode);
666 UnicodeSet modified("[:RGI_Emoji_Modifier_Sequence:]", errorCode);
667 UnicodeSet flags("[:RGI_Emoji_Flag_Sequence:]", errorCode);
668 UnicodeSet tags("[:RGI_Emoji_Tag_Sequence:]", errorCode);
669 UnicodeSet combos("[:RGI_Emoji_ZWJ_Sequence:]", errorCode);
670 UnicodeSet rgi("[:RGI_Emoji:]", errorCode);
671 if (errorCode.errDataIfFailureAndReset("UnicodeSets")) {
672 return;
673 }
674
675 // union of all sets except for "rgi" -- should be the same as "rgi"
676 UnicodeSet all(basic);
677 all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos);
678
679 UnicodeSet basicOnlyCp(basic);
680 basicOnlyCp.removeAllStrings();
681
682 UnicodeSet rgiOnlyCp(rgi);
683 rgiOnlyCp.removeAllStrings();
684
685 assertTrue("lots of Basic_Emoji", basic.size() > 1000);
686 assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size());
687 assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600);
688 assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250);
689 assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3);
690 assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300);
691 assertTrue("lots of RGI_Emoji", rgi.size() > 3000);
692
693 assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000);
694 assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings());
695 assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount());
696 assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount());
697 assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount());
698 assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount());
699 assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount());
700
701 assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000);
702 assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings());
703 assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()",
704 rgiOnlyCp.size(), basicOnlyCp.size());
705 assertTrue("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp == basicOnlyCp);
706 assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size());
707 assertTrue("RGI_Emoji == union", rgi == all);
708
709 assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains(u"⏱\uFE0F"));
710 assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains(u"\uFE0F"));
711 assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)",
712 keycaps.contains(u"4\uFE0F\u20E3"));
713 assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains(u""));
714 assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains(u""));
715 assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)",
716 modified.contains(u"\U0001F3FD"));
717 assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)",
718 combos.contains(u"\U0001F3FF\u200D✈\uFE0F"));
719 assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains(u"⏱\uFE0F"));
720 assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains(u"\uFE0F"));
721 assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains(u"4\uFE0F\u20E3"));
722 assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains(u""));
723 assertTrue("RGI_Emoji.contains([flag])", rgi.contains(u"\U0001F3F4"));
724 assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains(u""));
725 assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains(u""));
726 assertTrue("RGI_Emoji.contains(bicyclist+medium)", rgi.contains(u"\U0001F3FD"));
727 assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)", rgi.contains(u"\U0001F3FF\u200D✈\uFE0F"));
728 }
729
TestIndicPositionalCategory()730 void UnicodeTest::TestIndicPositionalCategory() {
731 IcuTestErrorCode errorCode(*this, "TestIndicPositionalCategory()");
732 UnicodeSet na(u"[:InPC=NA:]", errorCode);
733 assertTrue("mostly NA", 1000000 <= na.size() && na.size() <= UCHAR_MAX_VALUE - 500);
734 UnicodeSet vol(u"[:InPC=Visual_Order_Left:]", errorCode);
735 assertTrue("some Visual_Order_Left", 19 <= vol.size() && vol.size() <= 100);
736 assertEquals("U+08FF: NA", U_INPC_NA,
737 u_getIntPropertyValue(0x08FF, UCHAR_INDIC_POSITIONAL_CATEGORY));
738 assertEquals("U+0900: Top", U_INPC_TOP,
739 u_getIntPropertyValue(0x0900, UCHAR_INDIC_POSITIONAL_CATEGORY));
740 assertEquals("U+10A06: Overstruck", U_INPC_OVERSTRUCK,
741 u_getIntPropertyValue(0x10A06, UCHAR_INDIC_POSITIONAL_CATEGORY));
742 }
743
TestIndicSyllabicCategory()744 void UnicodeTest::TestIndicSyllabicCategory() {
745 IcuTestErrorCode errorCode(*this, "TestIndicSyllabicCategory()");
746 UnicodeSet other(u"[:InSC=Other:]", errorCode);
747 assertTrue("mostly Other", 1000000 <= other.size() && other.size() <= UCHAR_MAX_VALUE - 500);
748 UnicodeSet ava(u"[:InSC=Avagraha:]", errorCode);
749 assertTrue("some Avagraha", 16 <= ava.size() && ava.size() <= 100);
750 assertEquals("U+08FF: Other", U_INSC_OTHER,
751 u_getIntPropertyValue(0x08FF, UCHAR_INDIC_SYLLABIC_CATEGORY));
752 assertEquals("U+0900: Bindu", U_INSC_BINDU,
753 u_getIntPropertyValue(0x0900, UCHAR_INDIC_SYLLABIC_CATEGORY));
754 assertEquals("U+11065: Brahmi_Joining_Number", U_INSC_BRAHMI_JOINING_NUMBER,
755 u_getIntPropertyValue(0x11065, UCHAR_INDIC_SYLLABIC_CATEGORY));
756 }
757
TestVerticalOrientation()758 void UnicodeTest::TestVerticalOrientation() {
759 IcuTestErrorCode errorCode(*this, "TestVerticalOrientation()");
760 UnicodeSet r(u"[:vo=R:]", errorCode);
761 assertTrue("mostly R", 0xc0000 <= r.size() && r.size() <= 0xd0000);
762 UnicodeSet u(u"[:vo=U:]", errorCode);
763 assertTrue("much U", 0x40000 <= u.size() && u.size() <= 0x50000);
764 UnicodeSet tu(u"[:vo=Tu:]", errorCode);
765 assertTrue("some Tu", 147 <= tu.size() && tu.size() <= 300);
766 assertEquals("U+0E01: Rotated", U_VO_ROTATED,
767 u_getIntPropertyValue(0x0E01, UCHAR_VERTICAL_ORIENTATION));
768 assertEquals("U+3008: Transformed_Rotated", U_VO_TRANSFORMED_ROTATED,
769 u_getIntPropertyValue(0x3008, UCHAR_VERTICAL_ORIENTATION));
770 assertEquals("U+33333: Upright", U_VO_UPRIGHT,
771 u_getIntPropertyValue(0x33333, UCHAR_VERTICAL_ORIENTATION));
772 }
773
TestDefaultScriptExtensions()774 void UnicodeTest::TestDefaultScriptExtensions() {
775 // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii
776 // but some of its characters revert to scx=<script> which is usually Common.
777 IcuTestErrorCode errorCode(*this, "TestDefaultScriptExtensions()");
778 UScriptCode scx[20];
779 scx[0] = USCRIPT_INVALID_CODE;
780 assertEquals("U+3000 num scx", 1, // IDEOGRAPHIC SPACE
781 uscript_getScriptExtensions(0x3000, scx, UPRV_LENGTHOF(scx), errorCode));
782 assertEquals("U+3000 num scx[0]", USCRIPT_COMMON, scx[0]);
783 scx[0] = USCRIPT_INVALID_CODE;
784 assertEquals("U+3012 num scx", 1, // POSTAL MARK
785 uscript_getScriptExtensions(0x3012, scx, UPRV_LENGTHOF(scx), errorCode));
786 assertEquals("U+3012 num scx[0]", USCRIPT_COMMON, scx[0]);
787 }
788
TestInvalidCodePointFolding()789 void UnicodeTest::TestInvalidCodePointFolding() {
790 // Test behavior when an invalid code point is passed to u_foldCase
791 static const UChar32 invalidCodePoints[] = {
792 0xD800, // lead surrogate
793 0xDFFF, // trail surrogate
794 0xFDD0, // noncharacter
795 0xFFFF, // noncharacter
796 0x110000, // out of range
797 -1 // negative
798 };
799 for (int32_t i=0; i<UPRV_LENGTHOF(invalidCodePoints); ++i) {
800 UChar32 cp = invalidCodePoints[i];
801 assertEquals("Invalid code points should be echoed back",
802 cp, u_foldCase(cp, U_FOLD_CASE_DEFAULT));
803 assertEquals("Invalid code points should be echoed back",
804 cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I));
805 }
806 }
807
TestBinaryCharacterProperties()808 void UnicodeTest::TestBinaryCharacterProperties() {
809 #if !UCONFIG_NO_NORMALIZATION
810 IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()");
811 // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
812 for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) {
813 const USet *uset = u_getBinaryPropertySet((UProperty)prop, errorCode);
814 if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", (int)prop)) {
815 continue;
816 }
817 const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
818 int32_t count = set.getRangeCount();
819 if (count == 0) {
820 assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
821 u_hasBinaryProperty(0x20, (UProperty)prop));
822 assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
823 u_hasBinaryProperty(0x61, (UProperty)prop));
824 assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
825 u_hasBinaryProperty(0x4e00, (UProperty)prop));
826 } else {
827 UChar32 c = set.getRangeStart(0);
828 if (c > 0) {
829 assertFalse(
830 UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) +
831 u", " + prop + u")",
832 u_hasBinaryProperty(c - 1, (UProperty)prop));
833 }
834 assertTrue(
835 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
836 u", " + prop + u")",
837 u_hasBinaryProperty(c, (UProperty)prop));
838 c = set.getRangeEnd(count - 1);
839 assertTrue(
840 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
841 u", " + prop + u")",
842 u_hasBinaryProperty(c, (UProperty)prop));
843 if (c < 0x10ffff) {
844 assertFalse(
845 UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) +
846 u", " + prop + u")",
847 u_hasBinaryProperty(c + 1, (UProperty)prop));
848 }
849 }
850 }
851 #endif
852 }
853
TestIntCharacterProperties()854 void UnicodeTest::TestIntCharacterProperties() {
855 #if !UCONFIG_NO_NORMALIZATION
856 IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()");
857 // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
858 for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) {
859 const UCPMap *map = u_getIntPropertyMap((UProperty)prop, errorCode);
860 if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", (int)prop)) {
861 continue;
862 }
863 uint32_t value;
864 UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
865 assertTrue("int property first range", end >= 0);
866 UChar32 c = end / 2;
867 assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c),
868 u_getIntPropertyValue(c, (UProperty)prop), value);
869 end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
870 assertTrue("int property later range", end >= 0);
871 assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end),
872 u_getIntPropertyValue(end, (UProperty)prop), value);
873 // ucpmap_get() API coverage
874 // TODO: move to cucdtst.c
875 assertEquals(
876 "int property upcmap_get(U+0061)",
877 u_getIntPropertyValue(0x61, (UProperty)prop), ucpmap_get(map, 0x61));
878 }
879 #endif
880 }
881
882 namespace {
883
getPropName(UProperty property,int32_t nameChoice)884 const char *getPropName(UProperty property, int32_t nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
885 const char *name = u_getPropertyName(property, (UPropertyNameChoice)nameChoice);
886 return name != nullptr ? name : "null";
887 }
888
getValueName(UProperty property,int32_t value,int32_t nameChoice)889 const char *getValueName(UProperty property, int32_t value, int32_t nameChoice)
890 UPRV_NO_SANITIZE_UNDEFINED {
891 const char *name = u_getPropertyValueName(property, value, (UPropertyNameChoice)nameChoice);
892 return name != nullptr ? name : "null";
893 }
894
895 } // namespace
896
TestPropertyNames()897 void UnicodeTest::TestPropertyNames() {
898 IcuTestErrorCode errorCode(*this, "TestPropertyNames()");
899 // Test names of certain properties & values.
900 // The UPropertyNameChoice is really an integer with only a couple of named constants.
901 UProperty prop = UCHAR_WHITE_SPACE;
902 constexpr int32_t SHORT = U_SHORT_PROPERTY_NAME;
903 constexpr int32_t LONG = U_LONG_PROPERTY_NAME;
904 assertEquals("White_Space: index -1", "null", getPropName(prop, -1));
905 assertEquals("White_Space: short", "WSpace", getPropName(prop, SHORT));
906 assertEquals("White_Space: long", "White_Space", getPropName(prop, LONG));
907 assertEquals("White_Space: index 2", "space", getPropName(prop, 2));
908 assertEquals("White_Space: index 3", "null", getPropName(prop, 3));
909
910 prop = UCHAR_SIMPLE_CASE_FOLDING;
911 assertEquals("Simple_Case_Folding: index -1", "null", getPropName(prop, -1));
912 assertEquals("Simple_Case_Folding: short", "scf", getPropName(prop, SHORT));
913 assertEquals("Simple_Case_Folding: long", "Simple_Case_Folding", getPropName(prop, LONG));
914 assertEquals("Simple_Case_Folding: index 2", "sfc", getPropName(prop, 2));
915 assertEquals("Simple_Case_Folding: index 3", "null", getPropName(prop, 3));
916
917 prop = UCHAR_CASED;
918 assertEquals("Cased=Y: index -1", "null", getValueName(prop, 1, -1));
919 assertEquals("Cased=Y: short", "Y", getValueName(prop, 1, SHORT));
920 assertEquals("Cased=Y: long", "Yes", getValueName(prop, 1, LONG));
921 assertEquals("Cased=Y: index 2", "T", getValueName(prop, 1, 2));
922 assertEquals("Cased=Y: index 3", "True", getValueName(prop, 1, 3));
923 assertEquals("Cased=Y: index 4", "null", getValueName(prop, 1, 4));
924
925 prop = UCHAR_DECOMPOSITION_TYPE;
926 int32_t value = U_DT_NOBREAK;
927 assertEquals("dt=Nb: index -1", "null", getValueName(prop, value, -1));
928 assertEquals("dt=Nb: short", "Nb", getValueName(prop, value, SHORT));
929 assertEquals("dt=Nb: long", "Nobreak", getValueName(prop, value, LONG));
930 assertEquals("dt=Nb: index 2", "nb", getValueName(prop, value, 2));
931 assertEquals("dt=Nb: index 3", "null", getValueName(prop, value, 3));
932
933 // Canonical_Combining_Class:
934 // The UCD inserts the numeric values in the second filed of its PropertyValueAliases.txt lines.
935 // In ICU, we don't treat these as names,
936 // they are just the numeric values returned by u_getCombiningClass().
937 // We return the real short and long names for the usual choice constants.
938 prop = UCHAR_CANONICAL_COMBINING_CLASS;
939 assertEquals("ccc=230: index -1", "null", getValueName(prop, 230, -1));
940 assertEquals("ccc=230: short", "A", getValueName(prop, 230, SHORT));
941 assertEquals("ccc=230: long", "Above", getValueName(prop, 230, LONG));
942 assertEquals("ccc=230: index 2", "null", getValueName(prop, 230, 2));
943
944 prop = UCHAR_GENERAL_CATEGORY;
945 value = U_DECIMAL_DIGIT_NUMBER;
946 assertEquals("gc=Nd: index -1", "null", getValueName(prop, value, -1));
947 assertEquals("gc=Nd: short", "Nd", getValueName(prop, value, SHORT));
948 assertEquals("gc=Nd: long", "Decimal_Number", getValueName(prop, value, LONG));
949 assertEquals("gc=Nd: index 2", "digit", getValueName(prop, value, 2));
950 assertEquals("gc=Nd: index 3", "null", getValueName(prop, value, 3));
951
952 prop = UCHAR_GENERAL_CATEGORY_MASK;
953 value = U_GC_P_MASK;
954 assertEquals("gc=P mask: index -1", "null", getValueName(prop, value, -1));
955 assertEquals("gc=P mask: short", "P", getValueName(prop, value, SHORT));
956 assertEquals("gc=P mask: long", "Punctuation", getValueName(prop, value, LONG));
957 assertEquals("gc=P mask: index 2", "punct", getValueName(prop, value, 2));
958 assertEquals("gc=P mask: index 3", "null", getValueName(prop, value, 3));
959 }
960
TestIDSUnaryOperator()961 void UnicodeTest::TestIDSUnaryOperator() {
962 IcuTestErrorCode errorCode(*this, "TestIDSUnaryOperator()");
963 // New in Unicode 15.1 for just two characters.
964 assertFalse("U+2FFC IDSU", u_hasBinaryProperty(0x2ffc, UCHAR_IDS_UNARY_OPERATOR));
965 assertFalse("U+2FFD IDSU", u_hasBinaryProperty(0x2ffd, UCHAR_IDS_UNARY_OPERATOR));
966 assertTrue("U+2FFE IDSU", u_hasBinaryProperty(0x2ffe, UCHAR_IDS_UNARY_OPERATOR));
967 assertTrue("U+2FFF IDSU", u_hasBinaryProperty(0x2fff, UCHAR_IDS_UNARY_OPERATOR));
968 assertFalse("U+3000 IDSU", u_hasBinaryProperty(0x3000, UCHAR_IDS_UNARY_OPERATOR));
969 assertFalse("U+3001 IDSU", u_hasBinaryProperty(0x3001, UCHAR_IDS_UNARY_OPERATOR));
970
971 // Property name works and gets the correct set.
972 UnicodeSet idsu(u"[:IDS_Unary_Operator:]", errorCode);
973 assertEquals("IDSU set number of characters", 2, idsu.size());
974 assertFalse("idsu.contains(U+2FFD)", idsu.contains(0x2ffd));
975 assertTrue("idsu.contains(U+2FFE)", idsu.contains(0x2ffe));
976 assertTrue("idsu.contains(U+2FFF)", idsu.contains(0x2fff));
977 assertFalse("idsu.contains(U+3000)", idsu.contains(0x3000));
978 }
979
980 namespace {
981
isMathStart(UChar32 c)982 bool isMathStart(UChar32 c) {
983 return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_START);
984 }
985
isMathContinue(UChar32 c)986 bool isMathContinue(UChar32 c) {
987 return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_CONTINUE);
988 }
989
990 } // namespace
991
TestIDCompatMath()992 void UnicodeTest::TestIDCompatMath() {
993 IcuTestErrorCode errorCode(*this, "TestIDCompatMath()");
994 assertFalse("U+00B1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb1));
995 assertTrue("U+00B2 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb2));
996 assertTrue("U+00B3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb3));
997 assertFalse("U+00B4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb4));
998 assertFalse("U+207F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x207f));
999 assertTrue("U+2080 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2080));
1000 assertTrue("U+208E UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208e));
1001 assertFalse("U+208F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208f));
1002 assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2201));
1003 assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2202));
1004 assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D6C1));
1005 assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C3));
1006 assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C4));
1007
1008 assertFalse("U+00B2 UCHAR_ID_COMPAT_MATH_START", isMathStart(0xb2));
1009 assertFalse("U+2080 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2080));
1010 assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2201));
1011 assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2202));
1012 assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D6C1));
1013 assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C3));
1014 assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C4));
1015
1016 // Property names work and get the correct sets.
1017 UnicodeSet idcmStart(u"[:ID_Compat_Math_Start:]", errorCode);
1018 UnicodeSet idcmContinue(u"[:ID_Compat_Math_Continue:]", errorCode);
1019 assertEquals("ID_Compat_Math_Start set number of characters", 13, idcmStart.size());
1020 assertEquals("ID_Compat_Math_Continue set number of characters", 43, idcmContinue.size());
1021 assertTrue("ID_Compat_Math_Start is a subset of ID_Compat_Math_Continue",
1022 idcmContinue.containsAll(idcmStart));
1023 assertFalse("idcmContinue.contains(U+207F)", idcmContinue.contains(0x207f));
1024 assertTrue("idcmContinue.contains(U+2080)", idcmContinue.contains(0x2080));
1025 assertTrue("idcmContinue.contains(U+208E)", idcmContinue.contains(0x208e));
1026 assertFalse("idcmContinue.contains(U+208F)", idcmContinue.contains(0x208f));
1027 assertFalse("idcmStart.contains(U+2201)", idcmStart.contains(0x2201));
1028 assertTrue("idcmStart.contains(U+2202)", idcmStart.contains(0x2202));
1029 assertTrue("idcmStart.contains(U+1D7C3)", idcmStart.contains(0x1D7C3));
1030 assertFalse("idcmStart.contains(U+1D7C4)", idcmStart.contains(0x1D7C4));
1031 }
1032
1033 U_NAMESPACE_BEGIN
1034
1035 class BuiltInPropertyNames : public PropertyNames {
1036 public:
~BuiltInPropertyNames()1037 ~BuiltInPropertyNames() override {}
1038
getPropertyEnum(const char * name) const1039 int32_t getPropertyEnum(const char *name) const override {
1040 return u_getPropertyEnum(name);
1041 }
1042
getPropertyValueEnum(int32_t property,const char * name) const1043 int32_t getPropertyValueEnum(int32_t property, const char *name) const override {
1044 return u_getPropertyValueEnum((UProperty) property, name);
1045 }
1046 };
1047
1048 U_NAMESPACE_END
1049
TestBinaryPropertyUsingPpucd()1050 void UnicodeTest::TestBinaryPropertyUsingPpucd() {
1051 IcuTestErrorCode errorCode(*this, "TestBinaryPropertyUsingPpucd()");
1052
1053 // Initialize PPUCD parsing object using file in repo and using
1054 // property names present in built-in data in ICU
1055 char buffer[500];
1056 // get path to `source/data/unidata/` including trailing `/`
1057 char *unidataPath = getUnidataPath(buffer);
1058 if(unidataPath == nullptr) {
1059 errln("exiting early because unable to open ppucd.txt from ICU source tree");
1060 return;
1061 }
1062 CharString ppucdPath(unidataPath, errorCode);
1063 ppucdPath.appendPathPart("ppucd.txt", errorCode);
1064 PreparsedUCD ppucd(ppucdPath.data(), errorCode);
1065 if(errorCode.isFailure()) {
1066 errln("unable to open %s - %s\n",
1067 ppucdPath.data(), errorCode.errorName());
1068 return;
1069 }
1070 BuiltInPropertyNames builtInPropNames;
1071 ppucd.setPropertyNames(&builtInPropNames);
1072
1073 // Define which binary properties we want to compare
1074 constexpr UProperty propsUnderTest[] = {
1075 UCHAR_IDS_UNARY_OPERATOR,
1076 UCHAR_ID_COMPAT_MATH_START,
1077 UCHAR_ID_COMPAT_MATH_CONTINUE,
1078 };
1079
1080 // Allocate & initialize UnicodeSets per binary property from PPUCD data
1081 UnicodeSet ppucdPropSets[std::size(propsUnderTest)];
1082
1083 // Iterate through PPUCD file, accumulating each line's data into each UnicodeSet per property
1084 PreparsedUCD::LineType lineType;
1085 UnicodeSet newValues;
1086 while((lineType=ppucd.readLine(errorCode))!=PreparsedUCD::NO_LINE && errorCode.isSuccess()) {
1087 if(ppucd.lineHasPropertyValues()) {
1088 const UniProps *lineProps=ppucd.getProps(newValues, errorCode);
1089
1090 for(uint32_t i = 0; i < std::size(propsUnderTest); i++) {
1091 UProperty prop = propsUnderTest[i];
1092 if (!newValues.contains(prop)) {
1093 continue;
1094 }
1095 if (lineProps->binProps[prop]) {
1096 ppucdPropSets[i].add(lineProps->start, lineProps->end);
1097 } else {
1098 ppucdPropSets[i].remove(lineProps->start, lineProps->end);
1099 }
1100 }
1101 }
1102 }
1103
1104 if(errorCode.isFailure()) {
1105 errln("exiting early due to parsing error");
1106 return;
1107 }
1108
1109 // Assert that the PPUCD data and the ICU data are equivalent for all properties
1110 for(uint32_t i = 0; i < std::size(propsUnderTest); i++) {
1111 UnicodeSet icuPropSet;
1112 UProperty prop = propsUnderTest[i];
1113 icuPropSet.applyIntPropertyValue(prop, 1, errorCode);
1114 std::string msg =
1115 std::string()
1116 + "ICU & PPUCD versions of property "
1117 + u_getPropertyName(prop, U_LONG_PROPERTY_NAME);
1118 assertTrue(msg.c_str(), ppucdPropSets[i] == icuPropSet);
1119 }
1120 }
1121
1122 namespace {
1123
getIDStatus(UChar32 c)1124 int32_t getIDStatus(UChar32 c) {
1125 return u_getIntPropertyValue(c, UCHAR_IDENTIFIER_STATUS);
1126 }
1127
1128 } // namespace
1129
TestIDStatus()1130 void UnicodeTest::TestIDStatus() {
1131 IcuTestErrorCode errorCode(*this, "TestIDStatus()");
1132 assertEquals("ID_Status(slash)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x2F));
1133 assertEquals("ID_Status(digit 0)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x30));
1134 assertEquals("ID_Status(colon)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x3A));
1135 assertEquals("ID_Status(semicolon)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x3B));
1136 assertEquals("ID_Status(Greek small alpha)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x03B1));
1137 assertEquals("ID_Status(Greek small archaic koppa)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x03D9));
1138 assertEquals("ID_Status(Hangul syllable)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0xAC00));
1139 assertEquals("ID_Status(surrogate)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0xD800));
1140 assertEquals("ID_Status(Arabic tail fragment)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0xFE73));
1141 assertEquals("ID_Status(Hentaigana ko-3)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x1B03A));
1142 assertEquals("ID_Status(Katakana small ko)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x1B155));
1143 assertEquals("ID_Status(U+2EE5D)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x2EE5D));
1144 assertEquals("ID_Status(U+10FFFF)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x10FFFF));
1145
1146 // Property names work and get the correct sets.
1147 UnicodeSet idStatus(u"[:Identifier_Status=Allowed:]", errorCode);
1148 // Unicode 15.1: 112778 Allowed characters; normally grows over time
1149 assertTrue("Allowed number of characters", idStatus.size() >= 112778);
1150 assertFalse("Allowed.contains(slash)", idStatus.contains(0x2F));
1151 assertTrue("Allowed.contains(digit 0)", idStatus.contains(0x30));
1152 assertTrue("Allowed.contains(colon)", idStatus.contains(0x3A));
1153 assertFalse("Allowed.contains(semicolon)", idStatus.contains(0x3B));
1154 assertTrue("Allowed.contains(Greek small alpha)", idStatus.contains(0x03B1));
1155 assertFalse("Allowed.contains(Greek small archaic koppa)", idStatus.contains(0x03D9));
1156 assertTrue("Allowed.contains(Hangul syllable)", idStatus.contains(0xAC00));
1157 assertFalse("Allowed.contains(surrogate)", idStatus.contains(0xD800));
1158 assertFalse("Allowed.contains(Arabic tail fragment)", idStatus.contains(0xFE73));
1159 assertFalse("Allowed.contains(Hentaigana ko-3)", idStatus.contains(0x1B03A));
1160 assertTrue("Allowed.contains(Katakana small ko)", idStatus.contains(0x1B155));
1161 assertTrue("Allowed.contains(U+2EE5D)", idStatus.contains(0x2EE5D));
1162 assertFalse("Allowed.contains(U+10FFFF)", idStatus.contains(0x10FFFF));
1163 }
1164
1165 namespace {
1166
getIDTypes(UChar32 c)1167 UnicodeString getIDTypes(UChar32 c) {
1168 UErrorCode errorCode = U_ZERO_ERROR;
1169 UIdentifierType types[10];
1170 int32_t length = u_getIDTypes(c, types, UPRV_LENGTHOF(types), &errorCode);
1171 if (U_FAILURE(errorCode)) {
1172 return UnicodeString(u_errorName(errorCode), -1, US_INV);
1173 }
1174 // The order of values is undefined, but for simplicity we assume the order
1175 // that the current implementation yields. Otherwise we would have to sort the values.
1176 uint32_t typeBits = 0;
1177 UnicodeString result;
1178 for (int32_t i = 0; i < length; ++i) {
1179 if (i != 0) {
1180 result.append(u' ');
1181 }
1182 auto t = types[i];
1183 typeBits |= 1UL << t;
1184 const char *s = u_getPropertyValueName(UCHAR_IDENTIFIER_TYPE, t, U_LONG_PROPERTY_NAME);
1185 if (s != nullptr) {
1186 result.append(UnicodeString(s, -1, US_INV));
1187 } else {
1188 result.append(u"???");
1189 }
1190 }
1191 // Check that u_hasIDType() agrees.
1192 // Includes undefined behavior with t > largest enum constant.
1193 for (int32_t i = 0; i < 16; ++i) {
1194 UIdentifierType t = (UIdentifierType)i;
1195 bool expected = (typeBits & (1UL << i)) != 0;
1196 bool actual = u_hasIDType(c, t);
1197 if (actual != expected) {
1198 result.append(u" != u_hasIDType() ");
1199 result = result + i;
1200 break;
1201 }
1202 }
1203 return result;
1204 }
1205
1206 } // namespace
1207
TestIDType()1208 void UnicodeTest::TestIDType() {
1209 IcuTestErrorCode errorCode(*this, "TestIDType()");
1210 // Note: Types other than Recommended and Inclusion may well change over time.
1211 assertEquals("ID_Type(slash)", u"Not_XID", getIDTypes(0x2F));
1212 assertEquals("ID_Type(digit 0)", u"Recommended", getIDTypes(0x30));
1213 assertEquals("ID_Type(colon)", u"Inclusion", getIDTypes(0x3A));
1214 assertEquals("ID_Type(semicolon)", u"Not_XID", getIDTypes(0x3B));
1215 assertEquals("ID_Type(Greek small alpha)", u"Recommended", getIDTypes(0x03B1));
1216 assertEquals("ID_Type(Greek small archaic koppa)", u"Obsolete", getIDTypes(0x03D9));
1217 assertEquals("ID_Type(Hangul syllable)", u"Recommended", getIDTypes(0xAC00));
1218 assertEquals("ID_Type(surrogate)", u"Not_Character", getIDTypes(0xD800));
1219 assertEquals("ID_Type(Arabic tail fragment)", u"Technical", getIDTypes(0xFE73));
1220 assertEquals("ID_Type(Linear B syllable)", u"Exclusion", getIDTypes(0x10000));
1221 assertEquals("ID_Type(Hentaigana ko-3)", u"Obsolete", getIDTypes(0x1B03A));
1222 assertEquals("ID_Type(Katakana small ko)", u"Recommended", getIDTypes(0x1B155));
1223 assertEquals("ID_Type(U+2EE5D)", u"Recommended", getIDTypes(0x2EE5D));
1224 assertEquals("ID_Type(U+10FFFF)", u"Not_Character", getIDTypes(0x10FFFF));
1225
1226 assertEquals("ID_Type(CYRILLIC THOUSANDS SIGN)", u"Not_XID Obsolete", getIDTypes(0x0482));
1227 assertEquals("ID_Type(SYRIAC FEMININE DOT)", u"Technical Limited_Use", getIDTypes(0x0740));
1228 assertEquals("ID_Type(NKO LETTER JONA JA)", u"Obsolete Limited_Use", getIDTypes(0x07E8));
1229 assertEquals("ID_Type(SYRIAC END OF PARAGRAPH)", u"Not_XID Limited_Use", getIDTypes(0x0700));
1230 assertEquals("ID_Type(LATIN SMALL LETTER EZH)=", u"Technical Uncommon_Use", getIDTypes(0x0292));
1231 assertEquals("ID_Type(MUSICAL SYMBOL KIEVAN C CLEF)", u"Not_XID Technical Uncommon_Use", getIDTypes(0x1D1DE));
1232 assertEquals("ID_Type(MRO LETTER TA)", u"Exclusion Uncommon_Use", getIDTypes(0x16A40));
1233 assertEquals("ID_Type(GREEK MUSICAL LEIMMA)", u"Not_XID Obsolete", getIDTypes(0x1D245));
1234
1235 // error handling
1236 UIdentifierType types[2];
1237 UErrorCode failure = U_ZERO_ERROR;
1238 u_getIDTypes(0, types, -1, &failure);
1239 assertEquals("u_getIDTypes(capacity<0)", U_ILLEGAL_ARGUMENT_ERROR, failure);
1240
1241 failure = U_ZERO_ERROR;
1242 u_getIDTypes(0, nullptr, 1, &failure);
1243 assertEquals("u_getIDTypes(nullptr)", U_ILLEGAL_ARGUMENT_ERROR, failure);
1244
1245 failure = U_ZERO_ERROR;
1246 int32_t length = u_getIDTypes(0x30, types, 0, &failure);
1247 assertEquals("u_getIDTypes(digit 0, capacity 0) overflow", U_BUFFER_OVERFLOW_ERROR, failure);
1248 assertEquals("u_getIDTypes(digit 0, capacity 0) length", 1, length);
1249
1250 failure = U_ZERO_ERROR;
1251 length = u_getIDTypes(0x1D1DE, types, 0, &failure);
1252 assertEquals("u_getIDTypes(Kievan C clef, capacity 2) overflow", U_BUFFER_OVERFLOW_ERROR, failure);
1253 assertEquals("u_getIDTypes(Kievan C clef, capacity 2) length", 3, length);
1254
1255 // Property names work and get the correct sets.
1256 UnicodeSet rec(u"[:Identifier_Type=Recommended:]", errorCode);
1257 UnicodeSet incl(u"[:Identifier_Type=Inclusion:]", errorCode);
1258 UnicodeSet limited(u"[:Identifier_Type=Limited_Use:]", errorCode);
1259 UnicodeSet uncommon(u"[:Identifier_Type=Uncommon_Use:]", errorCode);
1260 UnicodeSet notChar(u"[:Identifier_Type=Not_Character:]", errorCode);
1261 // Unicode 15.1 set sizes; normally grows over time except Not_Character shrinks
1262 assertTrue("Recommended number of characters", rec.size() >= 112761);
1263 assertTrue("Inclusion number of characters", incl.size() >= 17);
1264 assertTrue("Limited_Use number of characters", limited.size() >= 5268);
1265 assertTrue("Uncommon_Use number of characters", uncommon.size() >= 398);
1266 assertTrue("Not_Character number of characters",
1267 800000 <= notChar.size() && notChar.size() <= 964293);
1268 assertFalse("Recommended.contains(slash)", rec.contains(0x2F));
1269 assertTrue("Recommended.contains(digit 0)", rec.contains(0x30));
1270 assertTrue("Inclusion.contains(colon)", incl.contains(0x3A));
1271 assertTrue("Recommended.contains(U+2EE5D)", rec.contains(0x2EE5D));
1272 assertTrue("Limited_Use.contains(SYRIAC FEMININE DOT)", limited.contains(0x0740));
1273 assertTrue("Limited_Use.contains(NKO LETTER JONA JA)", limited.contains(0x7E8));
1274 assertTrue("Not_Character.contains(surrogate)", notChar.contains(0xd800));
1275 assertTrue("Not_Character.contains(U+10FFFF)", notChar.contains(0x10FFFF));
1276 assertTrue("Uncommon_Use.contains(LATIN SMALL LETTER EZH)", uncommon.contains(0x0292));
1277 assertTrue("Uncommon_Use.contains(MUSICAL SYMBOL KIEVAN C CLEF)", uncommon.contains(0x1D1DE));
1278
1279 // More mutually exclusive types, including some otherwise combinable ones.
1280 UnicodeSet dep(u"[:Identifier_Type=Deprecated:]", errorCode);
1281 UnicodeSet di(u"[:Identifier_Type=Default_Ignorable:]", errorCode);
1282 UnicodeSet notNFKC(u"[:Identifier_Type=Not_NFKC:]", errorCode);
1283 UnicodeSet excl(u"[:Identifier_Type=Exclusion:]", errorCode);
1284 UnicodeSet allExclusive;
1285 allExclusive.addAll(rec).addAll(incl).addAll(limited).addAll(excl).
1286 addAll(notNFKC).addAll(di).addAll(dep).addAll(notChar);
1287 assertEquals("num chars in mutually exclusive types",
1288 rec.size() + incl.size() + limited.size() + excl.size() +
1289 notNFKC.size() + di.size() + dep.size() + notChar.size(),
1290 allExclusive.size());
1291 }
1292