• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * Copyright (c) 1997-2016, International Business Machines Corporation and
5  * others. All Rights Reserved.
6  ********************************************************************/
7 
8 #include "unicode/ustring.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/uniset.h"
12 #include "unicode/putil.h"
13 #include "unicode/uscript.h"
14 #include "unicode/uset.h"
15 #include "charstr.h"
16 #include "cstring.h"
17 #include "hash.h"
18 #include "patternprops.h"
19 #include "ppucd.h"
20 #include "normalizer2impl.h"
21 #include "testutil.h"
22 #include "uparse.h"
23 #include "ucdtest.h"
24 
/**
 * Property names that are deliberately not tested in this file.
 * The UnicodeTest constructor pre-registers each of them in
 * unknownPropertyNames, so derivedPropsLineFn() finds them there and
 * does not report them as unknown property names.
 *
 * Declared with const pointers, like derivedPropsNames[], since the
 * table is never modified.
 */
static const char *const ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC",
    "NFC_QC",
    "NFKD_QC",
    "NFKC_QC",
    "Expands_On_NFD",
    "Expands_On_NFC",
    "Expands_On_NFKD",
    "Expands_On_NFKC",
    "InCB",
    "NFKC_CF",
    "NFKC_SCF"
};
39 
UnicodeTest()40 UnicodeTest::UnicodeTest()
41 {
42     UErrorCode errorCode=U_ZERO_ERROR;
43     unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
44     if(U_FAILURE(errorCode)) {
45         delete unknownPropertyNames;
46         unknownPropertyNames=nullptr;
47     }
48     // Ignore some property names altogether.
49     for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) {
50         unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
51     }
52 }
53 
~UnicodeTest()54 UnicodeTest::~UnicodeTest()
55 {
56     delete unknownPropertyNames;
57 }
58 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)59 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
60 {
61     if(exec) {
62         logln("TestSuite UnicodeTest: ");
63     }
64     TESTCASE_AUTO_BEGIN;
65     TESTCASE_AUTO(TestAdditionalProperties);
66     TESTCASE_AUTO(TestBinaryValues);
67     TESTCASE_AUTO(TestConsistency);
68     TESTCASE_AUTO(TestPatternProperties);
69     TESTCASE_AUTO(TestScriptMetadata);
70     TESTCASE_AUTO(TestBidiPairedBracketType);
71     TESTCASE_AUTO(TestEmojiProperties);
72     TESTCASE_AUTO(TestEmojiPropertiesOfStrings);
73     TESTCASE_AUTO(TestIndicPositionalCategory);
74     TESTCASE_AUTO(TestIndicSyllabicCategory);
75     TESTCASE_AUTO(TestVerticalOrientation);
76     TESTCASE_AUTO(TestDefaultScriptExtensions);
77     TESTCASE_AUTO(TestInvalidCodePointFolding);
78 #if !UCONFIG_NO_NORMALIZATION
79     TESTCASE_AUTO(TestBinaryCharacterProperties);
80     TESTCASE_AUTO(TestIntCharacterProperties);
81 #endif
82     TESTCASE_AUTO(TestPropertyNames);
83     TESTCASE_AUTO(TestIDSUnaryOperator);
84     TESTCASE_AUTO(TestIDCompatMath);
85     TESTCASE_AUTO(TestBinaryPropertyUsingPpucd);
86     TESTCASE_AUTO(TestIDStatus);
87     TESTCASE_AUTO(TestIDType);
88     TESTCASE_AUTO_END;
89 }
90 
91 //====================================================
92 // private data used by the tests
93 //====================================================
94 
95 // test DerivedCoreProperties.txt -------------------------------------------
96 
97 // copied from genprops.c
98 static int32_t
getTokenIndex(const char * const tokens[],int32_t countTokens,const char * s)99 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
100     const char *t, *z;
101     int32_t i, j;
102 
103     s=u_skipWhitespace(s);
104     for(i=0; i<countTokens; ++i) {
105         t=tokens[i];
106         if(t!=nullptr) {
107             for(j=0;; ++j) {
108                 if(t[j]!=0) {
109                     if(s[j]!=t[j]) {
110                         break;
111                     }
112                 } else {
113                     z=u_skipWhitespace(s+j);
114                     if(*z==';' || *z==0) {
115                         return i;
116                     } else {
117                         break;
118                     }
119                 }
120             }
121         }
122     }
123     return -1;
124 }
125 
126 static const char *const
127 derivedPropsNames[]={
128     "Math",
129     "Alphabetic",
130     "Lowercase",
131     "Uppercase",
132     "ID_Start",
133     "ID_Continue",
134     "XID_Start",
135     "XID_Continue",
136     "Default_Ignorable_Code_Point",
137     "Full_Composition_Exclusion",
138     "Grapheme_Extend",
139     "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
140     "Grapheme_Base",
141     "Cased",
142     "Case_Ignorable",
143     "Changes_When_Lowercased",
144     "Changes_When_Uppercased",
145     "Changes_When_Titlecased",
146     "Changes_When_Casefolded",
147     "Changes_When_Casemapped",
148     "Changes_When_NFKC_Casefolded"
149 };
150 
151 static const UProperty
152 derivedPropsIndex[]={
153     UCHAR_MATH,
154     UCHAR_ALPHABETIC,
155     UCHAR_LOWERCASE,
156     UCHAR_UPPERCASE,
157     UCHAR_ID_START,
158     UCHAR_ID_CONTINUE,
159     UCHAR_XID_START,
160     UCHAR_XID_CONTINUE,
161     UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
162     UCHAR_FULL_COMPOSITION_EXCLUSION,
163     UCHAR_GRAPHEME_EXTEND,
164     UCHAR_GRAPHEME_LINK,
165     UCHAR_GRAPHEME_BASE,
166     UCHAR_CASED,
167     UCHAR_CASE_IGNORABLE,
168     UCHAR_CHANGES_WHEN_LOWERCASED,
169     UCHAR_CHANGES_WHEN_UPPERCASED,
170     UCHAR_CHANGES_WHEN_TITLECASED,
171     UCHAR_CHANGES_WHEN_CASEFOLDED,
172     UCHAR_CHANGES_WHEN_CASEMAPPED,
173     UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
174 };
175 
176 static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 };
177 
178 enum { MAX_ERRORS=50 };
179 
180 U_CFUNC void U_CALLCONV
derivedPropsLineFn(void * context,char * fields[][2],int32_t,UErrorCode * pErrorCode)181 derivedPropsLineFn(void *context,
182                    char *fields[][2], int32_t /* fieldCount */,
183                    UErrorCode *pErrorCode)
184 {
185     UnicodeTest *me=static_cast<UnicodeTest*>(context);
186     uint32_t start, end;
187     int32_t i;
188 
189     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
190     if(U_FAILURE(*pErrorCode)) {
191         me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
192         return;
193     }
194 
195     /* parse derived binary property name, ignore unknown names */
196     i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]);
197     if(i<0) {
198         UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
199         propName.trim();
200         if(me->unknownPropertyNames->find(propName)==nullptr) {
201             UErrorCode errorCode=U_ZERO_ERROR;
202             me->unknownPropertyNames->puti(propName, 1, errorCode);
203             me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
204         }
205         return;
206     }
207 
208     me->derivedProps[i].add(start, end);
209 }
210 
TestAdditionalProperties()211 void UnicodeTest::TestAdditionalProperties() {
212 #if !UCONFIG_NO_NORMALIZATION
213     // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
214     if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) {
215         errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
216               UPRV_LENGTHOF(derivedPropsNames));
217         return;
218     }
219     if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) {
220         errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n");
221         return;
222     }
223 
224     char path[500];
225     if(getUnidataPath(path) == nullptr) {
226         errln("unable to find path to source/data/unidata/");
227         return;
228     }
229     char *basename=strchr(path, 0);
230     strcpy(basename, "DerivedCoreProperties.txt");
231 
232     char *fields[2][2];
233     UErrorCode errorCode=U_ZERO_ERROR;
234     u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
235     if(U_FAILURE(errorCode)) {
236         errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
237         return;
238     }
239 
240     strcpy(basename, "DerivedNormalizationProps.txt");
241     u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
242     if(U_FAILURE(errorCode)) {
243         errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
244         return;
245     }
246 
247     // now we have all derived core properties in the UnicodeSets
248     // run them all through the API
249     int32_t rangeCount, range;
250     uint32_t i;
251     UChar32 start, end;
252 
253     // test all true properties
254     for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
255         rangeCount=derivedProps[i].getRangeCount();
256         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
257             start=derivedProps[i].getRangeStart(range);
258             end=derivedProps[i].getRangeEnd(range);
259             for(; start<=end; ++start) {
260                 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
261                     dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==false is wrong", start, derivedPropsNames[i]);
262                     if(++numErrors[i]>=MAX_ERRORS) {
263                       dataerrln("Too many errors, moving to the next test");
264                       break;
265                     }
266                 }
267             }
268         }
269     }
270 
271     // invert all properties
272     for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
273         derivedProps[i].complement();
274     }
275 
276     // test all false properties
277     for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
278         rangeCount=derivedProps[i].getRangeCount();
279         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
280             start=derivedProps[i].getRangeStart(range);
281             end=derivedProps[i].getRangeEnd(range);
282             for(; start<=end; ++start) {
283                 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
284                     errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==true is wrong\n", start, derivedPropsNames[i]);
285                     if(++numErrors[i]>=MAX_ERRORS) {
286                       errln("Too many errors, moving to the next test");
287                       break;
288                     }
289                 }
290             }
291         }
292     }
293 #endif /* !UCONFIG_NO_NORMALIZATION */
294 }
295 
TestBinaryValues()296 void UnicodeTest::TestBinaryValues() {
297     /*
298      * Unicode 5.1 explicitly defines binary property value aliases.
299      * Verify that they are all recognized.
300      */
301     UErrorCode errorCode=U_ZERO_ERROR;
302     UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
303     if(U_FAILURE(errorCode)) {
304         dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
305         return;
306     }
307 
308     static const char *const falseValues[]={ "N", "No", "F", "False" };
309     static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
310     int32_t i;
311     for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) {
312         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
313         pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
314         errorCode=U_ZERO_ERROR;
315         UnicodeSet set(pattern, errorCode);
316         if(U_FAILURE(errorCode)) {
317             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
318             continue;
319         }
320         set.complement();
321         if(set!=alpha) {
322             errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
323         }
324     }
325     for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) {
326         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
327         pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
328         errorCode=U_ZERO_ERROR;
329         UnicodeSet set(pattern, errorCode);
330         if(U_FAILURE(errorCode)) {
331             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
332             continue;
333         }
334         if(set!=alpha) {
335             errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
336         }
337     }
338 }
339 
TestConsistency()340 void UnicodeTest::TestConsistency() {
341 #if !UCONFIG_NO_NORMALIZATION
342     /*
343      * Test for an example that getCanonStartSet() delivers
344      * all characters that compose from the input one,
345      * even in multiple steps.
346      * For example, the set for "I" (0049) should contain both
347      * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
348      * In general, the set for the middle such character should be a subset
349      * of the set for the first.
350      */
351     IcuTestErrorCode errorCode(*this, "TestConsistency");
352     const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
353     const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
354     if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) {
355         dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
356                   errorCode.errorName());
357         errorCode.reset();
358         return;
359     }
360 
361     UnicodeSet set1, set2;
362     if (nfcImpl->getCanonStartSet(0x49, set1)) {
363         /* enumerate all characters that are plausible to be latin letters */
364         for(char16_t start=0xa0; start<0x2000; ++start) {
365             UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
366             if(decomp.length()>1 && decomp[0]==0x49) {
367                 set2.add(start);
368             }
369         }
370 
371         if (set1!=set2) {
372             errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
373         }
374         // This was available in cucdtst.c but the test had to move to intltest
375         // because the new internal normalization functions are in C++.
376         //compareUSets(set1, set2,
377         //             "[canon start set of 0049]", "[all c with canon decomp with 0049]",
378         //             true);
379     } else {
380         errln("NFC.getCanonStartSet() returned false");
381     }
382 #endif
383 }
384 
385 /**
386  * Test various implementations of Pattern_Syntax & Pattern_White_Space.
387  */
TestPatternProperties()388 void UnicodeTest::TestPatternProperties() {
389     IcuTestErrorCode errorCode(*this, "TestPatternProperties()");
390     UnicodeSet syn_pp;
391     UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode);
392     UnicodeSet syn_list(
393         "[!-/\\:-@\\[-\\^`\\{-~"
394         "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7"
395         "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775"
396         "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode);
397     UnicodeSet ws_pp;
398     UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode);
399     UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode);
400     UnicodeSet syn_ws_pp;
401     UnicodeSet syn_ws_prop(syn_prop);
402     syn_ws_prop.addAll(ws_prop);
403     for(UChar32 c=0; c<=0xffff; ++c) {
404         if(PatternProps::isSyntax(c)) {
405             syn_pp.add(c);
406         }
407         if(PatternProps::isWhiteSpace(c)) {
408             ws_pp.add(c);
409         }
410         if(PatternProps::isSyntaxOrWhiteSpace(c)) {
411             syn_ws_pp.add(c);
412         }
413     }
414     compareUSets(syn_pp, syn_prop,
415                  "PatternProps.isSyntax()", "[:Pattern_Syntax:]", true);
416     compareUSets(syn_pp, syn_list,
417                  "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", true);
418     compareUSets(ws_pp, ws_prop,
419                  "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", true);
420     compareUSets(ws_pp, ws_list,
421                  "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", true);
422     compareUSets(syn_ws_pp, syn_ws_prop,
423                  "PatternProps.isSyntaxOrWhiteSpace()",
424                  "[[:Pattern_Syntax:][:Pattern_White_Space:]]", true);
425 }
426 
427 // So far only minimal port of Java & cucdtst.c compareUSets().
428 UBool
compareUSets(const UnicodeSet & a,const UnicodeSet & b,const char * a_name,const char * b_name,UBool diffIsError)429 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b,
430                           const char *a_name, const char *b_name,
431                           UBool diffIsError) {
432     UBool same= a==b;
433     if(!same && diffIsError) {
434         errln("Sets are different: %s vs. %s\n", a_name, b_name);
435     }
436     return same;
437 }
438 
439 namespace {
440 
441 /**
442  * Maps a special script code to the most common script of its encoded characters.
443  */
getCharScript(UScriptCode script)444 UScriptCode getCharScript(UScriptCode script) {
445     switch(script) {
446     case USCRIPT_HAN_WITH_BOPOMOFO:
447     case USCRIPT_SIMPLIFIED_HAN:
448     case USCRIPT_TRADITIONAL_HAN:
449         return USCRIPT_HAN;
450     case USCRIPT_JAPANESE:
451         return USCRIPT_HIRAGANA;
452     case USCRIPT_JAMO:
453     case USCRIPT_KOREAN:
454         return USCRIPT_HANGUL;
455     case USCRIPT_SYMBOLS_EMOJI:
456         return USCRIPT_SYMBOLS;
457     default:
458         return script;
459     }
460 }
461 
462 }  // namespace
463 
TestScriptMetadata()464 void UnicodeTest::TestScriptMetadata() {
465     IcuTestErrorCode errorCode(*this, "TestScriptMetadata()");
466     UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode);
467     // So far, sample characters are uppercase.
468     // Georgian is special.
469     UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode);
470     for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) {
471         UScriptCode sc = (UScriptCode)sci;
472         // Run the test with -v to see which script has failures:
473         // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 6 FAIL
474         logln(uscript_getShortName(sc));
475         UScriptUsage usage = uscript_getUsage(sc);
476         UnicodeString sample = uscript_getSampleUnicodeString(sc);
477         UnicodeSet scriptSet;
478         scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode);
479         if(usage == USCRIPT_USAGE_NOT_ENCODED) {
480             assertTrue("not encoded, no sample", sample.isEmpty());
481             assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc));
482             assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc));
483             assertFalse("not encoded, not cased", uscript_isCased(sc));
484             assertTrue("not encoded, no characters", scriptSet.isEmpty());
485         } else {
486             assertFalse("encoded, has a sample character", sample.isEmpty());
487             UChar32 firstChar = sample.char32At(0);
488             UScriptCode charScript = getCharScript(sc);
489             assertEquals("script(sample(script))",
490                          (int32_t)charScript, (int32_t)uscript_getScript(firstChar, errorCode));
491             assertEquals("RTL vs. set", (UBool)rtl.contains(firstChar), (UBool)uscript_isRightToLeft(sc));
492             assertEquals("cased vs. set", (UBool)cased.contains(firstChar), (UBool)uscript_isCased(sc));
493             assertEquals("encoded, has characters", (UBool)(sc == charScript), (UBool)(!scriptSet.isEmpty()));
494             if(uscript_isRightToLeft(sc)) {
495                 rtl.removeAll(scriptSet);
496             }
497             if(uscript_isCased(sc)) {
498                 cased.removeAll(scriptSet);
499             }
500         }
501     }
502     UnicodeString pattern;
503     assertEquals("no remaining RTL characters",
504                  UnicodeString("[]"), rtl.toPattern(pattern));
505     assertEquals("no remaining cased characters",
506                  UnicodeString("[]"), cased.toPattern(pattern));
507 
508     assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN));
509     assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI));
510     assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN));
511 }
512 
TestBidiPairedBracketType()513 void UnicodeTest::TestBidiPairedBracketType() {
514     // BidiBrackets-6.3.0.txt says:
515     //
516     // The set of code points listed in this file was originally derived
517     // using the character properties General_Category (gc), Bidi_Class (bc),
518     // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows:
519     // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe,
520     // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket
521     // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type
522     // property values are Open and Close, respectively.
523     IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()");
524     UnicodeSet bpt("[:^bpt=n:]", errorCode);
525     assertTrue("bpt!=None is not empty", !bpt.isEmpty());
526     // The following should always be true.
527     UnicodeSet mirrored("[:Bidi_M:]", errorCode);
528     UnicodeSet other_neutral("[:bc=ON:]", errorCode);
529     assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt));
530     assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt));
531     // The following are true at least initially in Unicode 6.3.
532     UnicodeSet bpt_open("[:bpt=o:]", errorCode);
533     UnicodeSet bpt_close("[:bpt=c:]", errorCode);
534     UnicodeSet ps("[:Ps:]", errorCode);
535     UnicodeSet pe("[:Pe:]", errorCode);
536     assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open));
537     assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close));
538 }
539 
TestEmojiProperties()540 void UnicodeTest::TestEmojiProperties() {
541     assertFalse("space is not Emoji", u_hasBinaryProperty(0x20, UCHAR_EMOJI));
542     assertTrue("shooting star is Emoji", u_hasBinaryProperty(0x1F320, UCHAR_EMOJI));
543     IcuTestErrorCode errorCode(*this, "TestEmojiProperties()");
544     UnicodeSet emoji("[:Emoji:]", errorCode);
545     assertTrue("lots of Emoji", emoji.size() > 700);
546 
547     assertTrue("shooting star is Emoji_Presentation",
548                u_hasBinaryProperty(0x1F320, UCHAR_EMOJI_PRESENTATION));
549     assertTrue("Fitzpatrick 6 is Emoji_Modifier",
550                u_hasBinaryProperty(0x1F3FF, UCHAR_EMOJI_MODIFIER));
551     assertTrue("happy person is Emoji_Modifier_Base",
552                u_hasBinaryProperty(0x1F64B, UCHAR_EMOJI_MODIFIER_BASE));
553     assertTrue("asterisk is Emoji_Component",
554                u_hasBinaryProperty(0x2A, UCHAR_EMOJI_COMPONENT));
555     assertTrue("copyright is Extended_Pictographic",
556                u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC));
557 }
558 
559 namespace {
560 
hbp(const char16_t * s,int32_t length,UProperty which)561 UBool hbp(const char16_t *s, int32_t length, UProperty which) {
562     return u_stringHasBinaryProperty(s, length, which);
563 }
564 
hbp(const char16_t * s,UProperty which)565 UBool hbp(const char16_t *s, UProperty which) {
566     return u_stringHasBinaryProperty(s, -1, which);
567 }
568 
569 }  // namespace
570 
TestEmojiPropertiesOfStrings()571 void UnicodeTest::TestEmojiPropertiesOfStrings() {
572     // Property of code points, for coverage
573     assertFalse("null is not Ideographic", hbp(nullptr, 1, UCHAR_IDEOGRAPHIC));
574     assertFalse("null/0 is not Ideographic", hbp(nullptr, -1, UCHAR_IDEOGRAPHIC));
575     assertFalse("empty string is not Ideographic", hbp(u"", 0, UCHAR_IDEOGRAPHIC));
576     assertFalse("empty string/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
577     assertFalse("L is not Ideographic", hbp(u"L", 1, UCHAR_IDEOGRAPHIC));
578     assertFalse("L/0 is not Ideographic", hbp(u"L", -1, UCHAR_IDEOGRAPHIC));
579     assertTrue("U+4E02 is Ideographic", hbp(u"丂", 1, UCHAR_IDEOGRAPHIC));
580     assertTrue("U+4E02/0 is Ideographic", hbp(u"丂", -1, UCHAR_IDEOGRAPHIC));
581     assertFalse("2*U+4E02 is not Ideographic", hbp(u"丂丂", 2, UCHAR_IDEOGRAPHIC));
582     assertFalse("2*U+4E02/0 is not Ideographic", hbp(u"丂丂", -1, UCHAR_IDEOGRAPHIC));
583     assertFalse("bicycle is not Ideographic", hbp(u"��", 2, UCHAR_IDEOGRAPHIC));
584     assertFalse("bicycle/0 is not Ideographic", hbp(u"��", -1, UCHAR_IDEOGRAPHIC));
585     assertTrue("U+23456 is Ideographic", hbp(u"\U00023456", 2, UCHAR_IDEOGRAPHIC));
586     assertTrue("U+23456/0 is Ideographic", hbp(u"\U00023456", -1, UCHAR_IDEOGRAPHIC));
587 
588     // Property of (code points and) strings
589     assertFalse("null is not Basic_Emoji", hbp(nullptr, 1, UCHAR_BASIC_EMOJI));
590     assertFalse("null/0 is not Basic_Emoji", hbp(nullptr, -1, UCHAR_BASIC_EMOJI));
591     assertFalse("empty string is not Basic_Emoji", hbp(u"", 0, UCHAR_BASIC_EMOJI));
592     assertFalse("empty string/0 is not Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
593     assertFalse("L is not Basic_Emoji", hbp(u"L", 1, UCHAR_BASIC_EMOJI));
594     assertFalse("L/0 is not Basic_Emoji", hbp(u"L", -1, UCHAR_BASIC_EMOJI));
595     assertFalse("U+4E02 is not Basic_Emoji", hbp(u"丂", 1, UCHAR_BASIC_EMOJI));
596     assertFalse("U+4E02/0 is not Basic_Emoji", hbp(u"丂", -1, UCHAR_BASIC_EMOJI));
597     assertTrue("bicycle is Basic_Emoji", hbp(u"��", 2, UCHAR_BASIC_EMOJI));
598     assertTrue("bicycle/0 is Basic_Emoji", hbp(u"��", -1, UCHAR_BASIC_EMOJI));
599     assertFalse("2*bicycle is Basic_Emoji", hbp(u"����", 4, UCHAR_BASIC_EMOJI));
600     assertFalse("2*bicycle/0 is Basic_Emoji", hbp(u"����", -1, UCHAR_BASIC_EMOJI));
601     assertFalse("U+23456 is not Basic_Emoji", hbp(u"\U00023456", 2, UCHAR_BASIC_EMOJI));
602     assertFalse("U+23456/0 is not Basic_Emoji", hbp(u"\U00023456", -1, UCHAR_BASIC_EMOJI));
603 
604     assertFalse("stopwatch is not Basic_Emoji", hbp(u"⏱", 1, UCHAR_BASIC_EMOJI));
605     assertFalse("stopwatch/0 is not Basic_Emoji", hbp(u"⏱", -1, UCHAR_BASIC_EMOJI));
606     assertTrue("stopwatch+emoji is Basic_Emoji", hbp(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI));
607     assertTrue("stopwatch+emoji/0 is Basic_Emoji", hbp(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI));
608 
609     assertFalse("chipmunk is not Basic_Emoji", hbp(u"��", UCHAR_BASIC_EMOJI));
610     assertTrue("chipmunk+emoji is Basic_Emoji", hbp(u"��\uFE0F", UCHAR_BASIC_EMOJI));
611     assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp(u"��\uFE0F\uFE0F", UCHAR_BASIC_EMOJI));
612 
613     // Properties of strings (only)
614     assertFalse("4+emoji is not Emoji_Keycap_Sequence",
615                 hbp(u"4\uFE0F", UCHAR_EMOJI_KEYCAP_SEQUENCE));
616     assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence",
617                hbp(u"4\uFE0F\u20E3", UCHAR_EMOJI_KEYCAP_SEQUENCE));
618 
619     assertFalse("[B] is not RGI_Emoji_Flag_Sequence",
620                 hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
621     assertTrue("[BE] is RGI_Emoji_Flag_Sequence",
622                hbp(u"����", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
623 
624     assertFalse("[flag] is not RGI_Emoji_Tag_Sequence",
625                 hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
626     assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence",
627                hbp(u"��������������", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
628 
629     assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence",
630                 hbp(u"��", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
631     assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence",
632                hbp(u"��\U0001F3FD", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
633 
634     assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence",
635                 hbp(u"��\U0001F3FF\u200D", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
636     assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence",
637                hbp(u"��\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
638 
639     // RGI_Emoji = all of the above
640     assertFalse("stopwatch is not RGI_Emoji", hbp(u"⏱", UCHAR_RGI_EMOJI));
641     assertTrue("stopwatch+emoji is RGI_Emoji", hbp(u"⏱\uFE0F", UCHAR_RGI_EMOJI));
642 
643     assertFalse("chipmunk is not RGI_Emoji", hbp(u"��", UCHAR_RGI_EMOJI));
644     assertTrue("chipmunk+emoji is RGI_Emoji", hbp(u"��\uFE0F", UCHAR_RGI_EMOJI));
645 
646     assertFalse("4+emoji is not RGI_Emoji", hbp(u"4\uFE0F", UCHAR_RGI_EMOJI));
647     assertTrue("4+emoji+keycap is RGI_Emoji", hbp(u"4\uFE0F\u20E3", UCHAR_RGI_EMOJI));
648 
649     assertFalse("[B] is not RGI_Emoji", hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI));
650     assertTrue("[BE] is RGI_Emoji", hbp(u"����", UCHAR_RGI_EMOJI));
651 
652     assertTrue("[flag] is RGI_Emoji", hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI));
653     assertTrue("[Scotland] is RGI_Emoji", hbp(u"��������������", UCHAR_RGI_EMOJI));
654 
655     assertTrue("bicyclist is RGI_Emoji", hbp(u"��", UCHAR_RGI_EMOJI));
656     assertTrue("bicyclist+medium is RGI_Emoji", hbp(u"��\U0001F3FD", UCHAR_RGI_EMOJI));
657 
658     assertFalse("woman+dark+ZWJ is not RGI_Emoji", hbp(u"��\U0001F3FF\u200D", UCHAR_RGI_EMOJI));
659     assertTrue("woman pilot: dark skin tone is RGI_Emoji",
660                hbp(u"��\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI));
661 
662     // UnicodeSet with properties of strings
663     IcuTestErrorCode errorCode(*this, "TestEmojiPropertiesOfStrings()");
664     UnicodeSet basic("[:Basic_Emoji:]", errorCode);
665     UnicodeSet keycaps("[:Emoji_Keycap_Sequence:]", errorCode);
666     UnicodeSet modified("[:RGI_Emoji_Modifier_Sequence:]", errorCode);
667     UnicodeSet flags("[:RGI_Emoji_Flag_Sequence:]", errorCode);
668     UnicodeSet tags("[:RGI_Emoji_Tag_Sequence:]", errorCode);
669     UnicodeSet combos("[:RGI_Emoji_ZWJ_Sequence:]", errorCode);
670     UnicodeSet rgi("[:RGI_Emoji:]", errorCode);
671     if (errorCode.errDataIfFailureAndReset("UnicodeSets")) {
672         return;
673     }
674 
675     // union of all sets except for "rgi" -- should be the same as "rgi"
676     UnicodeSet all(basic);
677     all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos);
678 
679     UnicodeSet basicOnlyCp(basic);
680     basicOnlyCp.removeAllStrings();
681 
682     UnicodeSet rgiOnlyCp(rgi);
683     rgiOnlyCp.removeAllStrings();
684 
685     assertTrue("lots of Basic_Emoji", basic.size() > 1000);
686     assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size());
687     assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600);
688     assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250);
689     assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3);
690     assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300);
691     assertTrue("lots of RGI_Emoji", rgi.size() > 3000);
692 
693     assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000);
694     assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings());
695     assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount());
696     assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount());
697     assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount());
698     assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount());
699     assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount());
700 
701     assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000);
702     assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings());
703     assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()",
704                  rgiOnlyCp.size(), basicOnlyCp.size());
705     assertTrue("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp == basicOnlyCp);
706     assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size());
707     assertTrue("RGI_Emoji == union", rgi == all);
708 
709     assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains(u"⏱\uFE0F"));
710     assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains(u"��\uFE0F"));
711     assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)",
712                keycaps.contains(u"4\uFE0F\u20E3"));
713     assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains(u"����"));
714     assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains(u"��������������"));
715     assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)",
716                modified.contains(u"��\U0001F3FD"));
717     assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)",
718                combos.contains(u"��\U0001F3FF\u200D✈\uFE0F"));
719     assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains(u"⏱\uFE0F"));
720     assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains(u"��\uFE0F"));
721     assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains(u"4\uFE0F\u20E3"));
722     assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains(u"����"));
723     assertTrue("RGI_Emoji.contains([flag])", rgi.contains(u"\U0001F3F4"));
724     assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains(u"��������������"));
725     assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains(u"��"));
726     assertTrue("RGI_Emoji.contains(bicyclist+medium)", rgi.contains(u"��\U0001F3FD"));
727     assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)", rgi.contains(u"��\U0001F3FF\u200D✈\uFE0F"));
728 }
729 
TestIndicPositionalCategory()730 void UnicodeTest::TestIndicPositionalCategory() {
731     IcuTestErrorCode errorCode(*this, "TestIndicPositionalCategory()");
732     UnicodeSet na(u"[:InPC=NA:]", errorCode);
733     assertTrue("mostly NA", 1000000 <= na.size() && na.size() <= UCHAR_MAX_VALUE - 500);
734     UnicodeSet vol(u"[:InPC=Visual_Order_Left:]", errorCode);
735     assertTrue("some Visual_Order_Left", 19 <= vol.size() && vol.size() <= 100);
736     assertEquals("U+08FF: NA", U_INPC_NA,
737                  u_getIntPropertyValue(0x08FF, UCHAR_INDIC_POSITIONAL_CATEGORY));
738     assertEquals("U+0900: Top", U_INPC_TOP,
739                  u_getIntPropertyValue(0x0900, UCHAR_INDIC_POSITIONAL_CATEGORY));
740     assertEquals("U+10A06: Overstruck", U_INPC_OVERSTRUCK,
741                  u_getIntPropertyValue(0x10A06, UCHAR_INDIC_POSITIONAL_CATEGORY));
742 }
743 
TestIndicSyllabicCategory()744 void UnicodeTest::TestIndicSyllabicCategory() {
745     IcuTestErrorCode errorCode(*this, "TestIndicSyllabicCategory()");
746     UnicodeSet other(u"[:InSC=Other:]", errorCode);
747     assertTrue("mostly Other", 1000000 <= other.size() && other.size() <= UCHAR_MAX_VALUE - 500);
748     UnicodeSet ava(u"[:InSC=Avagraha:]", errorCode);
749     assertTrue("some Avagraha", 16 <= ava.size() && ava.size() <= 100);
750     assertEquals("U+08FF: Other", U_INSC_OTHER,
751                  u_getIntPropertyValue(0x08FF, UCHAR_INDIC_SYLLABIC_CATEGORY));
752     assertEquals("U+0900: Bindu", U_INSC_BINDU,
753                  u_getIntPropertyValue(0x0900, UCHAR_INDIC_SYLLABIC_CATEGORY));
754     assertEquals("U+11065: Brahmi_Joining_Number", U_INSC_BRAHMI_JOINING_NUMBER,
755                  u_getIntPropertyValue(0x11065, UCHAR_INDIC_SYLLABIC_CATEGORY));
756 }
757 
TestVerticalOrientation()758 void UnicodeTest::TestVerticalOrientation() {
759     IcuTestErrorCode errorCode(*this, "TestVerticalOrientation()");
760     UnicodeSet r(u"[:vo=R:]", errorCode);
761     assertTrue("mostly R", 0xc0000 <= r.size() && r.size() <= 0xd0000);
762     UnicodeSet u(u"[:vo=U:]", errorCode);
763     assertTrue("much U", 0x40000 <= u.size() && u.size() <= 0x50000);
764     UnicodeSet tu(u"[:vo=Tu:]", errorCode);
765     assertTrue("some Tu", 147 <= tu.size() && tu.size() <= 300);
766     assertEquals("U+0E01: Rotated", U_VO_ROTATED,
767                  u_getIntPropertyValue(0x0E01, UCHAR_VERTICAL_ORIENTATION));
768     assertEquals("U+3008: Transformed_Rotated", U_VO_TRANSFORMED_ROTATED,
769                  u_getIntPropertyValue(0x3008, UCHAR_VERTICAL_ORIENTATION));
770     assertEquals("U+33333: Upright", U_VO_UPRIGHT,
771                  u_getIntPropertyValue(0x33333, UCHAR_VERTICAL_ORIENTATION));
772 }
773 
TestDefaultScriptExtensions()774 void UnicodeTest::TestDefaultScriptExtensions() {
775     // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii
776     // but some of its characters revert to scx=<script> which is usually Common.
777     IcuTestErrorCode errorCode(*this, "TestDefaultScriptExtensions()");
778     UScriptCode scx[20];
779     scx[0] = USCRIPT_INVALID_CODE;
780     assertEquals("U+3000 num scx", 1,  // IDEOGRAPHIC SPACE
781                  uscript_getScriptExtensions(0x3000, scx, UPRV_LENGTHOF(scx), errorCode));
782     assertEquals("U+3000 num scx[0]", USCRIPT_COMMON, scx[0]);
783     scx[0] = USCRIPT_INVALID_CODE;
784     assertEquals("U+3012 num scx", 1,  // POSTAL MARK
785                  uscript_getScriptExtensions(0x3012, scx, UPRV_LENGTHOF(scx), errorCode));
786     assertEquals("U+3012 num scx[0]", USCRIPT_COMMON, scx[0]);
787 }
788 
TestInvalidCodePointFolding()789 void UnicodeTest::TestInvalidCodePointFolding() {
790     // Test behavior when an invalid code point is passed to u_foldCase
791     static const UChar32 invalidCodePoints[] = {
792             0xD800, // lead surrogate
793             0xDFFF, // trail surrogate
794             0xFDD0, // noncharacter
795             0xFFFF, // noncharacter
796             0x110000, // out of range
797             -1 // negative
798     };
799     for (int32_t i=0; i<UPRV_LENGTHOF(invalidCodePoints); ++i) {
800         UChar32 cp = invalidCodePoints[i];
801         assertEquals("Invalid code points should be echoed back",
802                 cp, u_foldCase(cp, U_FOLD_CASE_DEFAULT));
803         assertEquals("Invalid code points should be echoed back",
804                 cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I));
805     }
806 }
807 
TestBinaryCharacterProperties()808 void UnicodeTest::TestBinaryCharacterProperties() {
809 #if !UCONFIG_NO_NORMALIZATION
810     IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()");
811     // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
812     for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) {
813         const USet *uset = u_getBinaryPropertySet((UProperty)prop, errorCode);
814         if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", (int)prop)) {
815             continue;
816         }
817         const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
818         int32_t count = set.getRangeCount();
819         if (count == 0) {
820             assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
821                 u_hasBinaryProperty(0x20, (UProperty)prop));
822             assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
823                 u_hasBinaryProperty(0x61, (UProperty)prop));
824             assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
825                 u_hasBinaryProperty(0x4e00, (UProperty)prop));
826         } else {
827             UChar32 c = set.getRangeStart(0);
828             if (c > 0) {
829                 assertFalse(
830                     UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) +
831                         u", " + prop + u")",
832                     u_hasBinaryProperty(c - 1, (UProperty)prop));
833             }
834             assertTrue(
835                 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
836                     u", " + prop + u")",
837                 u_hasBinaryProperty(c, (UProperty)prop));
838             c = set.getRangeEnd(count - 1);
839             assertTrue(
840                 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
841                     u", " + prop + u")",
842                 u_hasBinaryProperty(c, (UProperty)prop));
843             if (c < 0x10ffff) {
844                 assertFalse(
845                     UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) +
846                         u", " + prop + u")",
847                     u_hasBinaryProperty(c + 1, (UProperty)prop));
848             }
849         }
850     }
851 #endif
852 }
853 
TestIntCharacterProperties()854 void UnicodeTest::TestIntCharacterProperties() {
855 #if !UCONFIG_NO_NORMALIZATION
856     IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()");
857     // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
858     for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) {
859         const UCPMap *map = u_getIntPropertyMap((UProperty)prop, errorCode);
860         if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", (int)prop)) {
861             continue;
862         }
863         uint32_t value;
864         UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
865         assertTrue("int property first range", end >= 0);
866         UChar32 c = end / 2;
867         assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c),
868             u_getIntPropertyValue(c, (UProperty)prop), value);
869         end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
870         assertTrue("int property later range", end >= 0);
871         assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end),
872             u_getIntPropertyValue(end, (UProperty)prop), value);
873         // ucpmap_get() API coverage
874         // TODO: move to cucdtst.c
875         assertEquals(
876             "int property upcmap_get(U+0061)",
877             u_getIntPropertyValue(0x61, (UProperty)prop), ucpmap_get(map, 0x61));
878     }
879 #endif
880 }
881 
882 namespace {
883 
getPropName(UProperty property,int32_t nameChoice)884 const char *getPropName(UProperty property, int32_t nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
885     const char *name = u_getPropertyName(property, (UPropertyNameChoice)nameChoice);
886     return name != nullptr ? name : "null";
887 }
888 
getValueName(UProperty property,int32_t value,int32_t nameChoice)889 const char *getValueName(UProperty property, int32_t value, int32_t nameChoice)
890         UPRV_NO_SANITIZE_UNDEFINED {
891     const char *name = u_getPropertyValueName(property, value, (UPropertyNameChoice)nameChoice);
892     return name != nullptr ? name : "null";
893 }
894 
895 }  // namespace
896 
TestPropertyNames()897 void UnicodeTest::TestPropertyNames() {
898     IcuTestErrorCode errorCode(*this, "TestPropertyNames()");
899     // Test names of certain properties & values.
900     // The UPropertyNameChoice is really an integer with only a couple of named constants.
901     UProperty prop = UCHAR_WHITE_SPACE;
902     constexpr int32_t SHORT = U_SHORT_PROPERTY_NAME;
903     constexpr int32_t LONG = U_LONG_PROPERTY_NAME;
904     assertEquals("White_Space: index -1", "null", getPropName(prop, -1));
905     assertEquals("White_Space: short", "WSpace", getPropName(prop, SHORT));
906     assertEquals("White_Space: long", "White_Space", getPropName(prop, LONG));
907     assertEquals("White_Space: index 2", "space", getPropName(prop, 2));
908     assertEquals("White_Space: index 3", "null", getPropName(prop, 3));
909 
910     prop = UCHAR_SIMPLE_CASE_FOLDING;
911     assertEquals("Simple_Case_Folding: index -1", "null", getPropName(prop, -1));
912     assertEquals("Simple_Case_Folding: short", "scf", getPropName(prop, SHORT));
913     assertEquals("Simple_Case_Folding: long", "Simple_Case_Folding", getPropName(prop, LONG));
914     assertEquals("Simple_Case_Folding: index 2", "sfc", getPropName(prop, 2));
915     assertEquals("Simple_Case_Folding: index 3", "null", getPropName(prop, 3));
916 
917     prop = UCHAR_CASED;
918     assertEquals("Cased=Y: index -1", "null", getValueName(prop, 1, -1));
919     assertEquals("Cased=Y: short", "Y", getValueName(prop, 1, SHORT));
920     assertEquals("Cased=Y: long", "Yes", getValueName(prop, 1, LONG));
921     assertEquals("Cased=Y: index 2", "T", getValueName(prop, 1, 2));
922     assertEquals("Cased=Y: index 3", "True", getValueName(prop, 1, 3));
923     assertEquals("Cased=Y: index 4", "null", getValueName(prop, 1, 4));
924 
925     prop = UCHAR_DECOMPOSITION_TYPE;
926     int32_t value = U_DT_NOBREAK;
927     assertEquals("dt=Nb: index -1", "null", getValueName(prop, value, -1));
928     assertEquals("dt=Nb: short", "Nb", getValueName(prop, value, SHORT));
929     assertEquals("dt=Nb: long", "Nobreak", getValueName(prop, value, LONG));
930     assertEquals("dt=Nb: index 2", "nb", getValueName(prop, value, 2));
931     assertEquals("dt=Nb: index 3", "null", getValueName(prop, value, 3));
932 
933     // Canonical_Combining_Class:
934     // The UCD inserts the numeric values in the second filed of its PropertyValueAliases.txt lines.
935     // In ICU, we don't treat these as names,
936     // they are just the numeric values returned by u_getCombiningClass().
937     // We return the real short and long names for the usual choice constants.
938     prop = UCHAR_CANONICAL_COMBINING_CLASS;
939     assertEquals("ccc=230: index -1", "null", getValueName(prop, 230, -1));
940     assertEquals("ccc=230: short", "A", getValueName(prop, 230, SHORT));
941     assertEquals("ccc=230: long", "Above", getValueName(prop, 230, LONG));
942     assertEquals("ccc=230: index 2", "null", getValueName(prop, 230, 2));
943 
944     prop = UCHAR_GENERAL_CATEGORY;
945     value = U_DECIMAL_DIGIT_NUMBER;
946     assertEquals("gc=Nd: index -1", "null", getValueName(prop, value, -1));
947     assertEquals("gc=Nd: short", "Nd", getValueName(prop, value, SHORT));
948     assertEquals("gc=Nd: long", "Decimal_Number", getValueName(prop, value, LONG));
949     assertEquals("gc=Nd: index 2", "digit", getValueName(prop, value, 2));
950     assertEquals("gc=Nd: index 3", "null", getValueName(prop, value, 3));
951 
952     prop = UCHAR_GENERAL_CATEGORY_MASK;
953     value = U_GC_P_MASK;
954     assertEquals("gc=P mask: index -1", "null", getValueName(prop, value, -1));
955     assertEquals("gc=P mask: short", "P", getValueName(prop, value, SHORT));
956     assertEquals("gc=P mask: long", "Punctuation", getValueName(prop, value, LONG));
957     assertEquals("gc=P mask: index 2", "punct", getValueName(prop, value, 2));
958     assertEquals("gc=P mask: index 3", "null", getValueName(prop, value, 3));
959 }
960 
TestIDSUnaryOperator()961 void UnicodeTest::TestIDSUnaryOperator() {
962     IcuTestErrorCode errorCode(*this, "TestIDSUnaryOperator()");
963     // New in Unicode 15.1 for just two characters.
964     assertFalse("U+2FFC IDSU", u_hasBinaryProperty(0x2ffc, UCHAR_IDS_UNARY_OPERATOR));
965     assertFalse("U+2FFD IDSU", u_hasBinaryProperty(0x2ffd, UCHAR_IDS_UNARY_OPERATOR));
966     assertTrue("U+2FFE IDSU", u_hasBinaryProperty(0x2ffe, UCHAR_IDS_UNARY_OPERATOR));
967     assertTrue("U+2FFF IDSU", u_hasBinaryProperty(0x2fff, UCHAR_IDS_UNARY_OPERATOR));
968     assertFalse("U+3000 IDSU", u_hasBinaryProperty(0x3000, UCHAR_IDS_UNARY_OPERATOR));
969     assertFalse("U+3001 IDSU", u_hasBinaryProperty(0x3001, UCHAR_IDS_UNARY_OPERATOR));
970 
971     // Property name works and gets the correct set.
972     UnicodeSet idsu(u"[:IDS_Unary_Operator:]", errorCode);
973     assertEquals("IDSU set number of characters", 2, idsu.size());
974     assertFalse("idsu.contains(U+2FFD)", idsu.contains(0x2ffd));
975     assertTrue("idsu.contains(U+2FFE)", idsu.contains(0x2ffe));
976     assertTrue("idsu.contains(U+2FFF)", idsu.contains(0x2fff));
977     assertFalse("idsu.contains(U+3000)", idsu.contains(0x3000));
978 }
979 
980 namespace {
981 
isMathStart(UChar32 c)982 bool isMathStart(UChar32 c) {
983     return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_START);
984 }
985 
isMathContinue(UChar32 c)986 bool isMathContinue(UChar32 c) {
987     return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_CONTINUE);
988 }
989 
990 }  // namespace
991 
TestIDCompatMath()992 void UnicodeTest::TestIDCompatMath() {
993     IcuTestErrorCode errorCode(*this, "TestIDCompatMath()");
994     assertFalse("U+00B1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb1));
995     assertTrue("U+00B2 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb2));
996     assertTrue("U+00B3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb3));
997     assertFalse("U+00B4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb4));
998     assertFalse("U+207F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x207f));
999     assertTrue("U+2080 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2080));
1000     assertTrue("U+208E UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208e));
1001     assertFalse("U+208F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208f));
1002     assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2201));
1003     assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2202));
1004     assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D6C1));
1005     assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C3));
1006     assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C4));
1007 
1008     assertFalse("U+00B2 UCHAR_ID_COMPAT_MATH_START", isMathStart(0xb2));
1009     assertFalse("U+2080 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2080));
1010     assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2201));
1011     assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2202));
1012     assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D6C1));
1013     assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C3));
1014     assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C4));
1015 
1016     // Property names work and get the correct sets.
1017     UnicodeSet idcmStart(u"[:ID_Compat_Math_Start:]", errorCode);
1018     UnicodeSet idcmContinue(u"[:ID_Compat_Math_Continue:]", errorCode);
1019     assertEquals("ID_Compat_Math_Start set number of characters", 13, idcmStart.size());
1020     assertEquals("ID_Compat_Math_Continue set number of characters", 43, idcmContinue.size());
1021     assertTrue("ID_Compat_Math_Start is a subset of ID_Compat_Math_Continue",
1022                idcmContinue.containsAll(idcmStart));
1023     assertFalse("idcmContinue.contains(U+207F)", idcmContinue.contains(0x207f));
1024     assertTrue("idcmContinue.contains(U+2080)", idcmContinue.contains(0x2080));
1025     assertTrue("idcmContinue.contains(U+208E)", idcmContinue.contains(0x208e));
1026     assertFalse("idcmContinue.contains(U+208F)", idcmContinue.contains(0x208f));
1027     assertFalse("idcmStart.contains(U+2201)", idcmStart.contains(0x2201));
1028     assertTrue("idcmStart.contains(U+2202)", idcmStart.contains(0x2202));
1029     assertTrue("idcmStart.contains(U+1D7C3)", idcmStart.contains(0x1D7C3));
1030     assertFalse("idcmStart.contains(U+1D7C4)", idcmStart.contains(0x1D7C4));
1031 }
1032 
1033 U_NAMESPACE_BEGIN
1034 
1035 class BuiltInPropertyNames : public PropertyNames {
1036 public:
~BuiltInPropertyNames()1037     ~BuiltInPropertyNames() override {}
1038 
getPropertyEnum(const char * name) const1039     int32_t getPropertyEnum(const char *name) const override {
1040         return u_getPropertyEnum(name);
1041     }
1042 
getPropertyValueEnum(int32_t property,const char * name) const1043     int32_t getPropertyValueEnum(int32_t property, const char *name) const override {
1044         return u_getPropertyValueEnum((UProperty) property, name);
1045     }
1046 };
1047 
1048 U_NAMESPACE_END
1049 
TestBinaryPropertyUsingPpucd()1050 void UnicodeTest::TestBinaryPropertyUsingPpucd() {
1051     IcuTestErrorCode errorCode(*this, "TestBinaryPropertyUsingPpucd()");
1052 
1053     // Initialize PPUCD parsing object using file in repo and using
1054     // property names present in built-in data in ICU
1055     char buffer[500];
1056     // get path to `source/data/unidata/` including trailing `/`
1057     char *unidataPath = getUnidataPath(buffer);
1058     if(unidataPath == nullptr) {
1059         errln("exiting early because unable to open ppucd.txt from ICU source tree");
1060         return;
1061     }
1062     CharString ppucdPath(unidataPath, errorCode);
1063     ppucdPath.appendPathPart("ppucd.txt", errorCode);
1064     PreparsedUCD ppucd(ppucdPath.data(), errorCode);
1065     if(errorCode.isFailure()) {
1066         errln("unable to open %s - %s\n",
1067             ppucdPath.data(), errorCode.errorName());
1068         return;
1069     }
1070     BuiltInPropertyNames builtInPropNames;
1071     ppucd.setPropertyNames(&builtInPropNames);
1072 
1073     // Define which binary properties we want to compare
1074     constexpr UProperty propsUnderTest[] = {
1075         UCHAR_IDS_UNARY_OPERATOR,
1076         UCHAR_ID_COMPAT_MATH_START,
1077         UCHAR_ID_COMPAT_MATH_CONTINUE,
1078     };
1079 
1080     // Allocate & initialize UnicodeSets per binary property from PPUCD data
1081     UnicodeSet ppucdPropSets[std::size(propsUnderTest)];
1082 
1083     // Iterate through PPUCD file, accumulating each line's data into each UnicodeSet per property
1084     PreparsedUCD::LineType lineType;
1085     UnicodeSet newValues;
1086     while((lineType=ppucd.readLine(errorCode))!=PreparsedUCD::NO_LINE && errorCode.isSuccess()) {
1087         if(ppucd.lineHasPropertyValues()) {
1088             const UniProps *lineProps=ppucd.getProps(newValues, errorCode);
1089 
1090             for(uint32_t i = 0; i < std::size(propsUnderTest); i++) {
1091                 UProperty prop = propsUnderTest[i];
1092                 if (!newValues.contains(prop)) {
1093                     continue;
1094                 }
1095                 if (lineProps->binProps[prop]) {
1096                     ppucdPropSets[i].add(lineProps->start, lineProps->end);
1097                 } else {
1098                     ppucdPropSets[i].remove(lineProps->start, lineProps->end);
1099                 }
1100             }
1101         }
1102     }
1103 
1104     if(errorCode.isFailure()) {
1105         errln("exiting early due to parsing error");
1106         return;
1107     }
1108 
1109     // Assert that the PPUCD data and the ICU data are equivalent for all properties
1110     for(uint32_t i = 0; i < std::size(propsUnderTest); i++) {
1111         UnicodeSet icuPropSet;
1112         UProperty prop = propsUnderTest[i];
1113         icuPropSet.applyIntPropertyValue(prop, 1, errorCode);
1114         std::string msg =
1115             std::string()
1116             + "ICU & PPUCD versions of property "
1117             + u_getPropertyName(prop, U_LONG_PROPERTY_NAME);
1118         assertTrue(msg.c_str(), ppucdPropSets[i] == icuPropSet);
1119     }
1120 }
1121 
1122 namespace {
1123 
getIDStatus(UChar32 c)1124 int32_t getIDStatus(UChar32 c) {
1125     return u_getIntPropertyValue(c, UCHAR_IDENTIFIER_STATUS);
1126 }
1127 
1128 }  // namespace
1129 
TestIDStatus()1130 void UnicodeTest::TestIDStatus() {
1131     IcuTestErrorCode errorCode(*this, "TestIDStatus()");
1132     assertEquals("ID_Status(slash)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x2F));
1133     assertEquals("ID_Status(digit 0)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x30));
1134     assertEquals("ID_Status(colon)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x3A));
1135     assertEquals("ID_Status(semicolon)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x3B));
1136     assertEquals("ID_Status(Greek small alpha)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x03B1));
1137     assertEquals("ID_Status(Greek small archaic koppa)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x03D9));
1138     assertEquals("ID_Status(Hangul syllable)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0xAC00));
1139     assertEquals("ID_Status(surrogate)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0xD800));
1140     assertEquals("ID_Status(Arabic tail fragment)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0xFE73));
1141     assertEquals("ID_Status(Hentaigana ko-3)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x1B03A));
1142     assertEquals("ID_Status(Katakana small ko)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x1B155));
1143     assertEquals("ID_Status(U+2EE5D)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x2EE5D));
1144     assertEquals("ID_Status(U+10FFFF)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x10FFFF));
1145 
1146     // Property names work and get the correct sets.
1147     UnicodeSet idStatus(u"[:Identifier_Status=Allowed:]", errorCode);
1148     // Unicode 15.1: 112778 Allowed characters; normally grows over time
1149     assertTrue("Allowed number of characters", idStatus.size() >= 112778);
1150     assertFalse("Allowed.contains(slash)", idStatus.contains(0x2F));
1151     assertTrue("Allowed.contains(digit 0)", idStatus.contains(0x30));
1152     assertTrue("Allowed.contains(colon)", idStatus.contains(0x3A));
1153     assertFalse("Allowed.contains(semicolon)", idStatus.contains(0x3B));
1154     assertTrue("Allowed.contains(Greek small alpha)", idStatus.contains(0x03B1));
1155     assertFalse("Allowed.contains(Greek small archaic koppa)", idStatus.contains(0x03D9));
1156     assertTrue("Allowed.contains(Hangul syllable)", idStatus.contains(0xAC00));
1157     assertFalse("Allowed.contains(surrogate)", idStatus.contains(0xD800));
1158     assertFalse("Allowed.contains(Arabic tail fragment)", idStatus.contains(0xFE73));
1159     assertFalse("Allowed.contains(Hentaigana ko-3)", idStatus.contains(0x1B03A));
1160     assertTrue("Allowed.contains(Katakana small ko)", idStatus.contains(0x1B155));
1161     assertTrue("Allowed.contains(U+2EE5D)", idStatus.contains(0x2EE5D));
1162     assertFalse("Allowed.contains(U+10FFFF)", idStatus.contains(0x10FFFF));
1163 }
1164 
1165 namespace {
1166 
getIDTypes(UChar32 c)1167 UnicodeString getIDTypes(UChar32 c) {
1168     UErrorCode errorCode = U_ZERO_ERROR;
1169     UIdentifierType types[10];
1170     int32_t length = u_getIDTypes(c, types, UPRV_LENGTHOF(types), &errorCode);
1171     if (U_FAILURE(errorCode)) {
1172         return UnicodeString(u_errorName(errorCode), -1, US_INV);
1173     }
1174     // The order of values is undefined, but for simplicity we assume the order
1175     // that the current implementation yields. Otherwise we would have to sort the values.
1176     uint32_t typeBits = 0;
1177     UnicodeString result;
1178     for (int32_t i = 0; i < length; ++i) {
1179         if (i != 0) {
1180             result.append(u' ');
1181         }
1182         auto t = types[i];
1183         typeBits |= 1UL << t;
1184         const char *s = u_getPropertyValueName(UCHAR_IDENTIFIER_TYPE, t, U_LONG_PROPERTY_NAME);
1185         if (s != nullptr) {
1186             result.append(UnicodeString(s, -1, US_INV));
1187         } else {
1188             result.append(u"???");
1189         }
1190     }
1191     // Check that u_hasIDType() agrees.
1192     // Includes undefined behavior with t > largest enum constant.
1193     for (int32_t i = 0; i < 16; ++i) {
1194         UIdentifierType t = (UIdentifierType)i;
1195         bool expected = (typeBits & (1UL << i)) != 0;
1196         bool actual = u_hasIDType(c, t);
1197         if (actual != expected) {
1198             result.append(u" != u_hasIDType() ");
1199             result = result + i;
1200             break;
1201         }
1202     }
1203     return result;
1204 }
1205 
1206 }  // namespace
1207 
TestIDType()1208 void UnicodeTest::TestIDType() {
1209     IcuTestErrorCode errorCode(*this, "TestIDType()");
1210     // Note: Types other than Recommended and Inclusion may well change over time.
1211     assertEquals("ID_Type(slash)", u"Not_XID", getIDTypes(0x2F));
1212     assertEquals("ID_Type(digit 0)", u"Recommended", getIDTypes(0x30));
1213     assertEquals("ID_Type(colon)", u"Inclusion", getIDTypes(0x3A));
1214     assertEquals("ID_Type(semicolon)", u"Not_XID", getIDTypes(0x3B));
1215     assertEquals("ID_Type(Greek small alpha)", u"Recommended", getIDTypes(0x03B1));
1216     assertEquals("ID_Type(Greek small archaic koppa)", u"Obsolete", getIDTypes(0x03D9));
1217     assertEquals("ID_Type(Hangul syllable)", u"Recommended", getIDTypes(0xAC00));
1218     assertEquals("ID_Type(surrogate)", u"Not_Character", getIDTypes(0xD800));
1219     assertEquals("ID_Type(Arabic tail fragment)", u"Technical", getIDTypes(0xFE73));
1220     assertEquals("ID_Type(Linear B syllable)", u"Exclusion", getIDTypes(0x10000));
1221     assertEquals("ID_Type(Hentaigana ko-3)", u"Obsolete", getIDTypes(0x1B03A));
1222     assertEquals("ID_Type(Katakana small ko)", u"Recommended", getIDTypes(0x1B155));
1223     assertEquals("ID_Type(U+2EE5D)", u"Recommended", getIDTypes(0x2EE5D));
1224     assertEquals("ID_Type(U+10FFFF)", u"Not_Character", getIDTypes(0x10FFFF));
1225 
1226     assertEquals("ID_Type(CYRILLIC THOUSANDS SIGN)", u"Not_XID Obsolete", getIDTypes(0x0482));
1227     assertEquals("ID_Type(SYRIAC FEMININE DOT)", u"Technical Limited_Use", getIDTypes(0x0740));
1228     assertEquals("ID_Type(NKO LETTER JONA JA)", u"Obsolete Limited_Use", getIDTypes(0x07E8));
1229     assertEquals("ID_Type(SYRIAC END OF PARAGRAPH)", u"Not_XID Limited_Use", getIDTypes(0x0700));
1230     assertEquals("ID_Type(LATIN SMALL LETTER EZH)=", u"Technical Uncommon_Use", getIDTypes(0x0292));
1231     assertEquals("ID_Type(MUSICAL SYMBOL KIEVAN C CLEF)", u"Not_XID Technical Uncommon_Use", getIDTypes(0x1D1DE));
1232     assertEquals("ID_Type(MRO LETTER TA)", u"Exclusion Uncommon_Use", getIDTypes(0x16A40));
1233     assertEquals("ID_Type(GREEK MUSICAL LEIMMA)", u"Not_XID Obsolete", getIDTypes(0x1D245));
1234 
1235     // error handling
1236     UIdentifierType types[2];
1237     UErrorCode failure = U_ZERO_ERROR;
1238     u_getIDTypes(0, types, -1, &failure);
1239     assertEquals("u_getIDTypes(capacity<0)", U_ILLEGAL_ARGUMENT_ERROR, failure);
1240 
1241     failure = U_ZERO_ERROR;
1242     u_getIDTypes(0, nullptr, 1, &failure);
1243     assertEquals("u_getIDTypes(nullptr)", U_ILLEGAL_ARGUMENT_ERROR, failure);
1244 
1245     failure = U_ZERO_ERROR;
1246     int32_t length = u_getIDTypes(0x30, types, 0, &failure);
1247     assertEquals("u_getIDTypes(digit 0, capacity 0) overflow", U_BUFFER_OVERFLOW_ERROR, failure);
1248     assertEquals("u_getIDTypes(digit 0, capacity 0) length", 1, length);
1249 
1250     failure = U_ZERO_ERROR;
1251     length = u_getIDTypes(0x1D1DE, types, 0, &failure);
1252     assertEquals("u_getIDTypes(Kievan C clef, capacity 2) overflow", U_BUFFER_OVERFLOW_ERROR, failure);
1253     assertEquals("u_getIDTypes(Kievan C clef, capacity 2) length", 3, length);
1254 
1255     // Property names work and get the correct sets.
1256     UnicodeSet rec(u"[:Identifier_Type=Recommended:]", errorCode);
1257     UnicodeSet incl(u"[:Identifier_Type=Inclusion:]", errorCode);
1258     UnicodeSet limited(u"[:Identifier_Type=Limited_Use:]", errorCode);
1259     UnicodeSet uncommon(u"[:Identifier_Type=Uncommon_Use:]", errorCode);
1260     UnicodeSet notChar(u"[:Identifier_Type=Not_Character:]", errorCode);
1261     // Unicode 15.1 set sizes; these normally grow over time, except that Not_Character shrinks
1262     assertTrue("Recommended number of characters", rec.size() >= 112761);
1263     assertTrue("Inclusion number of characters", incl.size() >= 17);
1264     assertTrue("Limited_Use number of characters", limited.size() >= 5268);
1265     assertTrue("Uncommon_Use number of characters", uncommon.size() >= 398);
1266     assertTrue("Not_Character number of characters",
1267                800000 <= notChar.size() && notChar.size() <= 964293);
1268     assertFalse("Recommended.contains(slash)", rec.contains(0x2F));
1269     assertTrue("Recommended.contains(digit 0)", rec.contains(0x30));
1270     assertTrue("Inclusion.contains(colon)", incl.contains(0x3A));
1271     assertTrue("Recommended.contains(U+2EE5D)", rec.contains(0x2EE5D));
1272     assertTrue("Limited_Use.contains(SYRIAC FEMININE DOT)", limited.contains(0x0740));
1273     assertTrue("Limited_Use.contains(NKO LETTER JONA JA)", limited.contains(0x7E8));
1274     assertTrue("Not_Character.contains(surrogate)", notChar.contains(0xd800));
1275     assertTrue("Not_Character.contains(U+10FFFF)", notChar.contains(0x10FFFF));
1276     assertTrue("Uncommon_Use.contains(LATIN SMALL LETTER EZH)", uncommon.contains(0x0292));
1277     assertTrue("Uncommon_Use.contains(MUSICAL SYMBOL KIEVAN C CLEF)", uncommon.contains(0x1D1DE));
1278 
1279     // More mutually exclusive types, including some otherwise combinable ones.
1280     UnicodeSet dep(u"[:Identifier_Type=Deprecated:]", errorCode);
1281     UnicodeSet di(u"[:Identifier_Type=Default_Ignorable:]", errorCode);
1282     UnicodeSet notNFKC(u"[:Identifier_Type=Not_NFKC:]", errorCode);
1283     UnicodeSet excl(u"[:Identifier_Type=Exclusion:]", errorCode);
1284     UnicodeSet allExclusive;
1285     allExclusive.addAll(rec).addAll(incl).addAll(limited).addAll(excl).
1286         addAll(notNFKC).addAll(di).addAll(dep).addAll(notChar);
1287     assertEquals("num chars in mutually exclusive types",
1288                 rec.size() + incl.size() + limited.size() + excl.size() +
1289                     notNFKC.size() + di.size() + dep.size() + notChar.size(),
1290                 allExclusive.size());
1291 }
1292