• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * Copyright (c) 1997-2016, International Business Machines Corporation and
5  * others. All Rights Reserved.
6  ********************************************************************/
7 
8 #include "unicode/ustring.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/uniset.h"
12 #include "unicode/putil.h"
13 #include "unicode/uscript.h"
14 #include "unicode/uset.h"
15 #include "charstr.h"
16 #include "cstring.h"
17 #include "hash.h"
18 #include "patternprops.h"
19 #include "ppucd.h"
20 #include "normalizer2impl.h"
21 #include "testutil.h"
22 #include "uparse.h"
23 #include "ucdtest.h"
24 #include "usettest.h"
25 
26 #include <iostream>
27 
// Property names that the UnicodeTest constructor pre-registers in
// unknownPropertyNames, so that derivedPropsLineFn() does not report them
// as unknown when they show up in the parsed data files.
// Both the strings and the array slots are immutable (same const-ness as
// derivedPropsNames[]).
static const char *const ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC",
    "NFC_QC",
    "NFKD_QC",
    "NFKC_QC",
    "Expands_On_NFD",
    "Expands_On_NFC",
    "Expands_On_NFKD",
    "Expands_On_NFKC",
    "InCB",
    "NFKC_CF",
    "NFKC_SCF"
};
42 
UnicodeTest()43 UnicodeTest::UnicodeTest()
44 {
45     UErrorCode errorCode=U_ZERO_ERROR;
46     unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
47     if(U_FAILURE(errorCode)) {
48         delete unknownPropertyNames;
49         unknownPropertyNames=nullptr;
50     }
51     // Ignore some property names altogether.
52     for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) {
53         unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
54     }
55 }
56 
~UnicodeTest()57 UnicodeTest::~UnicodeTest()
58 {
59     delete unknownPropertyNames;
60 }
61 
// Test dispatcher for this suite.
// When exec is set, logs the suite name; the TESTCASE_AUTO* macros then do
// the per-test work using index, exec and name.
// NOTE(review): the TESTCASE_AUTO* macros are defined outside this file;
// presumably they number the listed tests in order of appearance, so the
// order below (including the conditional block) should be kept stable.
// The last parameter (par) is unused.
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)62 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
63 {
64     if(exec) {
65         logln("TestSuite UnicodeTest: ");
66     }
67     TESTCASE_AUTO_BEGIN;
68     TESTCASE_AUTO(TestAdditionalProperties);
69     TESTCASE_AUTO(TestBinaryValues);
70     TESTCASE_AUTO(TestConsistency);
71     TESTCASE_AUTO(TestPatternProperties);
72     TESTCASE_AUTO(TestScriptMetadata);
73     TESTCASE_AUTO(TestBidiPairedBracketType);
74     TESTCASE_AUTO(TestEmojiProperties);
75     TESTCASE_AUTO(TestEmojiPropertiesOfStrings);
76     TESTCASE_AUTO(TestIndicPositionalCategory);
77     TESTCASE_AUTO(TestIndicSyllabicCategory);
78     TESTCASE_AUTO(TestVerticalOrientation);
79     TESTCASE_AUTO(TestDefaultScriptExtensions);
80     TESTCASE_AUTO(TestInvalidCodePointFolding);
// These two tests are only registered when normalization is compiled in.
81 #if !UCONFIG_NO_NORMALIZATION
82     TESTCASE_AUTO(TestBinaryCharacterProperties);
83     TESTCASE_AUTO(TestIntCharacterProperties);
84 #endif
85     TESTCASE_AUTO(TestPropertyNames);
86     TESTCASE_AUTO(TestIDSUnaryOperator);
87     TESTCASE_AUTO(TestIDCompatMath);
88     TESTCASE_AUTO(TestPropertiesUsingPpucd);
89     TESTCASE_AUTO(TestIDStatus);
90     TESTCASE_AUTO(TestIDType);
91     TESTCASE_AUTO_END;
92 }
93 
94 //====================================================
95 // private data used by the tests
96 //====================================================
97 
98 // test DerivedCoreProperties.txt -------------------------------------------
99 
100 // copied from genprops.c
101 static int32_t
getTokenIndex(const char * const tokens[],int32_t countTokens,const char * s)102 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
103     const char *t, *z;
104     int32_t i, j;
105 
106     s=u_skipWhitespace(s);
107     for(i=0; i<countTokens; ++i) {
108         t=tokens[i];
109         if(t!=nullptr) {
110             for(j=0;; ++j) {
111                 if(t[j]!=0) {
112                     if(s[j]!=t[j]) {
113                         break;
114                     }
115                 } else {
116                     z=u_skipWhitespace(s+j);
117                     if(*z==';' || *z==0) {
118                         return i;
119                     } else {
120                         break;
121                     }
122                 }
123             }
124         }
125     }
126     return -1;
127 }
128 
129 static const char *const
130 derivedPropsNames[]={
131     "Math",
132     "Alphabetic",
133     "Lowercase",
134     "Uppercase",
135     "ID_Start",
136     "ID_Continue",
137     "XID_Start",
138     "XID_Continue",
139     "Default_Ignorable_Code_Point",
140     "Full_Composition_Exclusion",
141     "Grapheme_Extend",
142     "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
143     "Grapheme_Base",
144     "Cased",
145     "Case_Ignorable",
146     "Changes_When_Lowercased",
147     "Changes_When_Uppercased",
148     "Changes_When_Titlecased",
149     "Changes_When_Casefolded",
150     "Changes_When_Casemapped",
151     "Changes_When_NFKC_Casefolded"
152 };
153 
154 static const UProperty
155 derivedPropsIndex[]={
156     UCHAR_MATH,
157     UCHAR_ALPHABETIC,
158     UCHAR_LOWERCASE,
159     UCHAR_UPPERCASE,
160     UCHAR_ID_START,
161     UCHAR_ID_CONTINUE,
162     UCHAR_XID_START,
163     UCHAR_XID_CONTINUE,
164     UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
165     UCHAR_FULL_COMPOSITION_EXCLUSION,
166     UCHAR_GRAPHEME_EXTEND,
167     UCHAR_GRAPHEME_LINK,
168     UCHAR_GRAPHEME_BASE,
169     UCHAR_CASED,
170     UCHAR_CASE_IGNORABLE,
171     UCHAR_CHANGES_WHEN_LOWERCASED,
172     UCHAR_CHANGES_WHEN_UPPERCASED,
173     UCHAR_CHANGES_WHEN_TITLECASED,
174     UCHAR_CHANGES_WHEN_CASEFOLDED,
175     UCHAR_CHANGES_WHEN_CASEMAPPED,
176     UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
177 };
178 
179 static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 };
180 
181 enum { MAX_ERRORS=50 };
182 
183 U_CFUNC void U_CALLCONV
derivedPropsLineFn(void * context,char * fields[][2],int32_t,UErrorCode * pErrorCode)184 derivedPropsLineFn(void *context,
185                    char *fields[][2], int32_t /* fieldCount */,
186                    UErrorCode *pErrorCode)
187 {
188     UnicodeTest *me=static_cast<UnicodeTest*>(context);
189     uint32_t start, end;
190     int32_t i;
191 
192     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
193     if(U_FAILURE(*pErrorCode)) {
194         me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
195         return;
196     }
197 
198     /* parse derived binary property name, ignore unknown names */
199     i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]);
200     if(i<0) {
201         UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
202         propName.trim();
203         if(me->unknownPropertyNames->find(propName)==nullptr) {
204             UErrorCode errorCode=U_ZERO_ERROR;
205             me->unknownPropertyNames->puti(propName, 1, errorCode);
206             me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
207         }
208         return;
209     }
210 
211     me->derivedProps[i].add(start, end);
212 }
213 
TestAdditionalProperties()214 void UnicodeTest::TestAdditionalProperties() {
215 #if !UCONFIG_NO_NORMALIZATION
216     // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
217     if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) {
218         errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
219               UPRV_LENGTHOF(derivedPropsNames));
220         return;
221     }
222     if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) {
223         errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n");
224         return;
225     }
226 
227     char path[500];
228     if(getUnidataPath(path) == nullptr) {
229         errln("unable to find path to source/data/unidata/");
230         return;
231     }
232     char *basename=strchr(path, 0);
233     strcpy(basename, "DerivedCoreProperties.txt");
234 
235     char *fields[2][2];
236     UErrorCode errorCode=U_ZERO_ERROR;
237     u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
238     if(U_FAILURE(errorCode)) {
239         errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
240         return;
241     }
242 
243     strcpy(basename, "DerivedNormalizationProps.txt");
244     u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
245     if(U_FAILURE(errorCode)) {
246         errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
247         return;
248     }
249 
250     // now we have all derived core properties in the UnicodeSets
251     // run them all through the API
252     int32_t rangeCount, range;
253     uint32_t i;
254     UChar32 start, end;
255 
256     // test all true properties
257     for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
258         rangeCount=derivedProps[i].getRangeCount();
259         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
260             start=derivedProps[i].getRangeStart(range);
261             end=derivedProps[i].getRangeEnd(range);
262             for(; start<=end; ++start) {
263                 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
264                     dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==false is wrong", start, derivedPropsNames[i]);
265                     if(++numErrors[i]>=MAX_ERRORS) {
266                       dataerrln("Too many errors, moving to the next test");
267                       break;
268                     }
269                 }
270             }
271         }
272     }
273 
274     // invert all properties
275     for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
276         derivedProps[i].complement();
277     }
278 
279     // test all false properties
280     for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
281         rangeCount=derivedProps[i].getRangeCount();
282         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
283             start=derivedProps[i].getRangeStart(range);
284             end=derivedProps[i].getRangeEnd(range);
285             for(; start<=end; ++start) {
286                 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
287                     errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==true is wrong\n", start, derivedPropsNames[i]);
288                     if(++numErrors[i]>=MAX_ERRORS) {
289                       errln("Too many errors, moving to the next test");
290                       break;
291                     }
292                 }
293             }
294         }
295     }
296 #endif /* !UCONFIG_NO_NORMALIZATION */
297 }
298 
TestBinaryValues()299 void UnicodeTest::TestBinaryValues() {
300     /*
301      * Unicode 5.1 explicitly defines binary property value aliases.
302      * Verify that they are all recognized.
303      */
304     UErrorCode errorCode=U_ZERO_ERROR;
305     UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
306     if(U_FAILURE(errorCode)) {
307         dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
308         return;
309     }
310 
311     static const char *const falseValues[]={ "N", "No", "F", "False" };
312     static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
313     int32_t i;
314     for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) {
315         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
316         pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
317         errorCode=U_ZERO_ERROR;
318         UnicodeSet set(pattern, errorCode);
319         if(U_FAILURE(errorCode)) {
320             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
321             continue;
322         }
323         set.complement();
324         if(set!=alpha) {
325             errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
326         }
327     }
328     for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) {
329         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
330         pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
331         errorCode=U_ZERO_ERROR;
332         UnicodeSet set(pattern, errorCode);
333         if(U_FAILURE(errorCode)) {
334             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
335             continue;
336         }
337         if(set!=alpha) {
338             errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
339         }
340     }
341 }
342 
TestConsistency()343 void UnicodeTest::TestConsistency() {
344 #if !UCONFIG_NO_NORMALIZATION
345     /*
346      * Test for an example that getCanonStartSet() delivers
347      * all characters that compose from the input one,
348      * even in multiple steps.
349      * For example, the set for "I" (0049) should contain both
350      * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
351      * In general, the set for the middle such character should be a subset
352      * of the set for the first.
353      */
354     IcuTestErrorCode errorCode(*this, "TestConsistency");
355     const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
356     const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
357     if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) {
358         dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
359                   errorCode.errorName());
360         errorCode.reset();
361         return;
362     }
363 
364     UnicodeSet set1, set2;
365     if (nfcImpl->getCanonStartSet(0x49, set1)) {
366         /* enumerate all characters that are plausible to be latin letters */
367         for(char16_t start=0xa0; start<0x2000; ++start) {
368             UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
369             if(decomp.length()>1 && decomp[0]==0x49) {
370                 set2.add(start);
371             }
372         }
373 
374         if (set1!=set2) {
375             errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
376         }
377         // This was available in cucdtst.c but the test had to move to intltest
378         // because the new internal normalization functions are in C++.
379         //compareUSets(set1, set2,
380         //             "[canon start set of 0049]", "[all c with canon decomp with 0049]",
381         //             true);
382     } else {
383         errln("NFC.getCanonStartSet() returned false");
384     }
385 #endif
386 }
387 
388 /**
389  * Test various implementations of Pattern_Syntax & Pattern_White_Space.
390  */
TestPatternProperties()391 void UnicodeTest::TestPatternProperties() {
392     IcuTestErrorCode errorCode(*this, "TestPatternProperties()");
393     UnicodeSet syn_pp;
394     UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode);
395     UnicodeSet syn_list(
396         "[!-/\\:-@\\[-\\^`\\{-~"
397         "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7"
398         "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775"
399         "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode);
400     UnicodeSet ws_pp;
401     UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode);
402     UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode);
403     UnicodeSet syn_ws_pp;
404     UnicodeSet syn_ws_prop(syn_prop);
405     syn_ws_prop.addAll(ws_prop);
406     for(UChar32 c=0; c<=0xffff; ++c) {
407         if(PatternProps::isSyntax(c)) {
408             syn_pp.add(c);
409         }
410         if(PatternProps::isWhiteSpace(c)) {
411             ws_pp.add(c);
412         }
413         if(PatternProps::isSyntaxOrWhiteSpace(c)) {
414             syn_ws_pp.add(c);
415         }
416     }
417     compareUSets(syn_pp, syn_prop,
418                  "PatternProps.isSyntax()", "[:Pattern_Syntax:]", true);
419     compareUSets(syn_pp, syn_list,
420                  "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", true);
421     compareUSets(ws_pp, ws_prop,
422                  "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", true);
423     compareUSets(ws_pp, ws_list,
424                  "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", true);
425     compareUSets(syn_ws_pp, syn_ws_prop,
426                  "PatternProps.isSyntaxOrWhiteSpace()",
427                  "[[:Pattern_Syntax:][:Pattern_White_Space:]]", true);
428 }
429 
430 // So far only minimal port of Java & cucdtst.c compareUSets().
431 UBool
compareUSets(const UnicodeSet & a,const UnicodeSet & b,const char * a_name,const char * b_name,UBool diffIsError)432 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b,
433                           const char *a_name, const char *b_name,
434                           UBool diffIsError) {
435     UBool same= a==b;
436     if(!same && diffIsError) {
437         errln("Sets are different: %s vs. %s\n", a_name, b_name);
438     }
439     return same;
440 }
441 
442 namespace {
443 
444 /**
445  * Maps a special script code to the most common script of its encoded characters.
446  */
getCharScript(UScriptCode script)447 UScriptCode getCharScript(UScriptCode script) {
448     switch(script) {
449     case USCRIPT_HAN_WITH_BOPOMOFO:
450     case USCRIPT_SIMPLIFIED_HAN:
451     case USCRIPT_TRADITIONAL_HAN:
452         return USCRIPT_HAN;
453     case USCRIPT_JAPANESE:
454         return USCRIPT_HIRAGANA;
455     case USCRIPT_JAMO:
456     case USCRIPT_KOREAN:
457         return USCRIPT_HANGUL;
458     case USCRIPT_SYMBOLS_EMOJI:
459         return USCRIPT_SYMBOLS;
460     default:
461         return script;
462     }
463 }
464 
465 }  // namespace
466 
// Checks the uscript_* metadata functions (usage, sample string, RTL, cased,
// breaks-between-letters) for every script code below USCRIPT_CODE_LIMIT
// against UnicodeSets built from character properties.
TestScriptMetadata()467 void UnicodeTest::TestScriptMetadata() {
468     IcuTestErrorCode errorCode(*this, "TestScriptMetadata()");
// rtl and cased start out with all candidate characters; the loop below
// removes the characters of every script that reports RTL / cased, and the
// assertions after the loop require both sets to end up empty.
469     UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode);
470     // So far, sample characters are uppercase.
471     // Georgian is special.
472     UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode);
473     for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) {
474         UScriptCode sc = static_cast<UScriptCode>(sci);
475         // Run the test with -v to see which script has failures:
476         // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 6 FAIL
477         logln(uscript_getShortName(sc));
478         UScriptUsage usage = uscript_getUsage(sc);
479         UnicodeString sample = uscript_getSampleUnicodeString(sc);
// scriptSet = all characters whose Script property value is sc.
480         UnicodeSet scriptSet;
481         scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode);
482         if(usage == USCRIPT_USAGE_NOT_ENCODED) {
483             assertTrue("not encoded, no sample", sample.isEmpty());
484             assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc));
485             assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc));
486             assertFalse("not encoded, not cased", uscript_isCased(sc));
487             assertTrue("not encoded, no characters", scriptSet.isEmpty());
488         } else {
489             assertFalse("encoded, has a sample character", sample.isEmpty());
490             UChar32 firstChar = sample.char32At(0);
// For the special script codes handled by getCharScript(), the sample
// character is expected to have the mapped script rather than sc itself.
491             UScriptCode charScript = getCharScript(sc);
492             assertEquals("script(sample(script))",
493                          static_cast<int32_t>(charScript), static_cast<int32_t>(uscript_getScript(firstChar, errorCode)));
494             assertEquals("RTL vs. set", rtl.contains(firstChar), uscript_isRightToLeft(sc));
495             assertEquals("cased vs. set", cased.contains(firstChar), uscript_isCased(sc));
// Only script codes that getCharScript() maps to themselves are expected to
// have characters carrying that exact Script value.
496             assertEquals("encoded, has characters", static_cast<UBool>(sc == charScript), static_cast<UBool>(!scriptSet.isEmpty()));
497             if(uscript_isRightToLeft(sc)) {
498                 rtl.removeAll(scriptSet);
499             }
500             if(uscript_isCased(sc)) {
501                 cased.removeAll(scriptSet);
502             }
503         }
504     }
// Anything left over belongs to a script that did not report RTL / cased.
505     UnicodeString pattern;
506     assertEquals("no remaining RTL characters",
507                  UnicodeString("[]"), rtl.toPattern(pattern));
508     assertEquals("no remaining cased characters",
509                  UnicodeString("[]"), cased.toPattern(pattern));
510 
511     assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN));
512     assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI));
513     assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN));
514 }
515 
TestBidiPairedBracketType()516 void UnicodeTest::TestBidiPairedBracketType() {
517     // BidiBrackets-6.3.0.txt says:
518     //
519     // The set of code points listed in this file was originally derived
520     // using the character properties General_Category (gc), Bidi_Class (bc),
521     // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows:
522     // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe,
523     // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket
524     // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type
525     // property values are Open and Close, respectively.
526     IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()");
527     UnicodeSet bpt("[:^bpt=n:]", errorCode);
528     assertTrue("bpt!=None is not empty", !bpt.isEmpty());
529     // The following should always be true.
530     UnicodeSet mirrored("[:Bidi_M:]", errorCode);
531     UnicodeSet other_neutral("[:bc=ON:]", errorCode);
532     assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt));
533     assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt));
534     // The following are true at least initially in Unicode 6.3.
535     UnicodeSet bpt_open("[:bpt=o:]", errorCode);
536     UnicodeSet bpt_close("[:bpt=c:]", errorCode);
537     UnicodeSet ps("[:Ps:]", errorCode);
538     UnicodeSet pe("[:Pe:]", errorCode);
539     assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open));
540     assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close));
541 }
542 
TestEmojiProperties()543 void UnicodeTest::TestEmojiProperties() {
544     assertFalse("space is not Emoji", u_hasBinaryProperty(0x20, UCHAR_EMOJI));
545     assertTrue("shooting star is Emoji", u_hasBinaryProperty(0x1F320, UCHAR_EMOJI));
546     IcuTestErrorCode errorCode(*this, "TestEmojiProperties()");
547     UnicodeSet emoji("[:Emoji:]", errorCode);
548     assertTrue("lots of Emoji", emoji.size() > 700);
549 
550     assertTrue("shooting star is Emoji_Presentation",
551                u_hasBinaryProperty(0x1F320, UCHAR_EMOJI_PRESENTATION));
552     assertTrue("Fitzpatrick 6 is Emoji_Modifier",
553                u_hasBinaryProperty(0x1F3FF, UCHAR_EMOJI_MODIFIER));
554     assertTrue("happy person is Emoji_Modifier_Base",
555                u_hasBinaryProperty(0x1F64B, UCHAR_EMOJI_MODIFIER_BASE));
556     assertTrue("asterisk is Emoji_Component",
557                u_hasBinaryProperty(0x2A, UCHAR_EMOJI_COMPONENT));
558     assertTrue("copyright is Extended_Pictographic",
559                u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC));
560 }
561 
562 namespace {
563 
hbp(const char16_t * s,int32_t length,UProperty which)564 UBool hbp(const char16_t *s, int32_t length, UProperty which) {
565     return u_stringHasBinaryProperty(s, length, which);
566 }
567 
hbp(const char16_t * s,UProperty which)568 UBool hbp(const char16_t *s, UProperty which) {
569     return u_stringHasBinaryProperty(s, -1, which);
570 }
571 
572 }  // namespace
573 
TestEmojiPropertiesOfStrings()574 void UnicodeTest::TestEmojiPropertiesOfStrings() {
575     // Property of code points, for coverage
576     assertFalse("null is not Ideographic", hbp(nullptr, 1, UCHAR_IDEOGRAPHIC));
577     assertFalse("null/0 is not Ideographic", hbp(nullptr, -1, UCHAR_IDEOGRAPHIC));
578     assertFalse("empty string is not Ideographic", hbp(u"", 0, UCHAR_IDEOGRAPHIC));
579     assertFalse("empty string/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
580     assertFalse("L is not Ideographic", hbp(u"L", 1, UCHAR_IDEOGRAPHIC));
581     assertFalse("L/0 is not Ideographic", hbp(u"L", -1, UCHAR_IDEOGRAPHIC));
582     assertTrue("U+4E02 is Ideographic", hbp(u"丂", 1, UCHAR_IDEOGRAPHIC));
583     assertTrue("U+4E02/0 is Ideographic", hbp(u"丂", -1, UCHAR_IDEOGRAPHIC));
584     assertFalse("2*U+4E02 is not Ideographic", hbp(u"丂丂", 2, UCHAR_IDEOGRAPHIC));
585     assertFalse("2*U+4E02/0 is not Ideographic", hbp(u"丂丂", -1, UCHAR_IDEOGRAPHIC));
586     assertFalse("bicycle is not Ideographic", hbp(u"��", 2, UCHAR_IDEOGRAPHIC));
587     assertFalse("bicycle/0 is not Ideographic", hbp(u"��", -1, UCHAR_IDEOGRAPHIC));
588     assertTrue("U+23456 is Ideographic", hbp(u"\U00023456", 2, UCHAR_IDEOGRAPHIC));
589     assertTrue("U+23456/0 is Ideographic", hbp(u"\U00023456", -1, UCHAR_IDEOGRAPHIC));
590 
591     // Property of (code points and) strings
592     assertFalse("null is not Basic_Emoji", hbp(nullptr, 1, UCHAR_BASIC_EMOJI));
593     assertFalse("null/0 is not Basic_Emoji", hbp(nullptr, -1, UCHAR_BASIC_EMOJI));
594     assertFalse("empty string is not Basic_Emoji", hbp(u"", 0, UCHAR_BASIC_EMOJI));
595     assertFalse("empty string/0 is not Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
596     assertFalse("L is not Basic_Emoji", hbp(u"L", 1, UCHAR_BASIC_EMOJI));
597     assertFalse("L/0 is not Basic_Emoji", hbp(u"L", -1, UCHAR_BASIC_EMOJI));
598     assertFalse("U+4E02 is not Basic_Emoji", hbp(u"丂", 1, UCHAR_BASIC_EMOJI));
599     assertFalse("U+4E02/0 is not Basic_Emoji", hbp(u"丂", -1, UCHAR_BASIC_EMOJI));
600     assertTrue("bicycle is Basic_Emoji", hbp(u"��", 2, UCHAR_BASIC_EMOJI));
601     assertTrue("bicycle/0 is Basic_Emoji", hbp(u"��", -1, UCHAR_BASIC_EMOJI));
602     assertFalse("2*bicycle is Basic_Emoji", hbp(u"����", 4, UCHAR_BASIC_EMOJI));
603     assertFalse("2*bicycle/0 is Basic_Emoji", hbp(u"����", -1, UCHAR_BASIC_EMOJI));
604     assertFalse("U+23456 is not Basic_Emoji", hbp(u"\U00023456", 2, UCHAR_BASIC_EMOJI));
605     assertFalse("U+23456/0 is not Basic_Emoji", hbp(u"\U00023456", -1, UCHAR_BASIC_EMOJI));
606 
607     assertFalse("stopwatch is not Basic_Emoji", hbp(u"⏱", 1, UCHAR_BASIC_EMOJI));
608     assertFalse("stopwatch/0 is not Basic_Emoji", hbp(u"⏱", -1, UCHAR_BASIC_EMOJI));
609     assertTrue("stopwatch+emoji is Basic_Emoji", hbp(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI));
610     assertTrue("stopwatch+emoji/0 is Basic_Emoji", hbp(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI));
611 
612     assertFalse("chipmunk is not Basic_Emoji", hbp(u"��", UCHAR_BASIC_EMOJI));
613     assertTrue("chipmunk+emoji is Basic_Emoji", hbp(u"��\uFE0F", UCHAR_BASIC_EMOJI));
614     assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp(u"��\uFE0F\uFE0F", UCHAR_BASIC_EMOJI));
615 
616     // Properties of strings (only)
617     assertFalse("4+emoji is not Emoji_Keycap_Sequence",
618                 hbp(u"4\uFE0F", UCHAR_EMOJI_KEYCAP_SEQUENCE));
619     assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence",
620                hbp(u"4\uFE0F\u20E3", UCHAR_EMOJI_KEYCAP_SEQUENCE));
621 
622     assertFalse("[B] is not RGI_Emoji_Flag_Sequence",
623                 hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
624     assertTrue("[BE] is RGI_Emoji_Flag_Sequence",
625                hbp(u"����", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
626 
627     assertFalse("[flag] is not RGI_Emoji_Tag_Sequence",
628                 hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
629     assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence",
630                hbp(u"��������������", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
631 
632     assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence",
633                 hbp(u"��", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
634     assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence",
635                hbp(u"��\U0001F3FD", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
636 
637     assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence",
638                 hbp(u"��\U0001F3FF\u200D", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
639     assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence",
640                hbp(u"��\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
641 
642     // RGI_Emoji = all of the above
643     assertFalse("stopwatch is not RGI_Emoji", hbp(u"⏱", UCHAR_RGI_EMOJI));
644     assertTrue("stopwatch+emoji is RGI_Emoji", hbp(u"⏱\uFE0F", UCHAR_RGI_EMOJI));
645 
646     assertFalse("chipmunk is not RGI_Emoji", hbp(u"��", UCHAR_RGI_EMOJI));
647     assertTrue("chipmunk+emoji is RGI_Emoji", hbp(u"��\uFE0F", UCHAR_RGI_EMOJI));
648 
649     assertFalse("4+emoji is not RGI_Emoji", hbp(u"4\uFE0F", UCHAR_RGI_EMOJI));
650     assertTrue("4+emoji+keycap is RGI_Emoji", hbp(u"4\uFE0F\u20E3", UCHAR_RGI_EMOJI));
651 
652     assertFalse("[B] is not RGI_Emoji", hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI));
653     assertTrue("[BE] is RGI_Emoji", hbp(u"����", UCHAR_RGI_EMOJI));
654 
655     assertTrue("[flag] is RGI_Emoji", hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI));
656     assertTrue("[Scotland] is RGI_Emoji", hbp(u"��������������", UCHAR_RGI_EMOJI));
657 
658     assertTrue("bicyclist is RGI_Emoji", hbp(u"��", UCHAR_RGI_EMOJI));
659     assertTrue("bicyclist+medium is RGI_Emoji", hbp(u"��\U0001F3FD", UCHAR_RGI_EMOJI));
660 
661     assertFalse("woman+dark+ZWJ is not RGI_Emoji", hbp(u"��\U0001F3FF\u200D", UCHAR_RGI_EMOJI));
662     assertTrue("woman pilot: dark skin tone is RGI_Emoji",
663                hbp(u"��\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI));
664 
665     // UnicodeSet with properties of strings
666     IcuTestErrorCode errorCode(*this, "TestEmojiPropertiesOfStrings()");
667     UnicodeSet basic("[:Basic_Emoji:]", errorCode);
668     UnicodeSet keycaps("[:Emoji_Keycap_Sequence:]", errorCode);
669     UnicodeSet modified("[:RGI_Emoji_Modifier_Sequence:]", errorCode);
670     UnicodeSet flags("[:RGI_Emoji_Flag_Sequence:]", errorCode);
671     UnicodeSet tags("[:RGI_Emoji_Tag_Sequence:]", errorCode);
672     UnicodeSet combos("[:RGI_Emoji_ZWJ_Sequence:]", errorCode);
673     UnicodeSet rgi("[:RGI_Emoji:]", errorCode);
674     if (errorCode.errDataIfFailureAndReset("UnicodeSets")) {
675         return;
676     }
677 
678     // union of all sets except for "rgi" -- should be the same as "rgi"
679     UnicodeSet all(basic);
680     all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos);
681 
682     UnicodeSet basicOnlyCp(basic);
683     basicOnlyCp.removeAllStrings();
684 
685     UnicodeSet rgiOnlyCp(rgi);
686     rgiOnlyCp.removeAllStrings();
687 
688     assertTrue("lots of Basic_Emoji", basic.size() > 1000);
689     assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size());
690     assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600);
691     assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250);
692     assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3);
693     assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300);
694     assertTrue("lots of RGI_Emoji", rgi.size() > 3000);
695 
696     assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000);
697     assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings());
698     assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount());
699     assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount());
700     assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount());
701     assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount());
702     assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount());
703 
704     assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000);
705     assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings());
706     assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()",
707                  rgiOnlyCp.size(), basicOnlyCp.size());
708     assertTrue("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp == basicOnlyCp);
709     assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size());
710     assertTrue("RGI_Emoji == union", rgi == all);
711 
712     assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains(u"⏱\uFE0F"));
713     assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains(u"��\uFE0F"));
714     assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)",
715                keycaps.contains(u"4\uFE0F\u20E3"));
716     assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains(u"����"));
717     assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains(u"��������������"));
718     assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)",
719                modified.contains(u"��\U0001F3FD"));
720     assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)",
721                combos.contains(u"��\U0001F3FF\u200D✈\uFE0F"));
722     assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains(u"⏱\uFE0F"));
723     assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains(u"��\uFE0F"));
724     assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains(u"4\uFE0F\u20E3"));
725     assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains(u"����"));
726     assertTrue("RGI_Emoji.contains([flag])", rgi.contains(u"\U0001F3F4"));
727     assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains(u"��������������"));
728     assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains(u"��"));
729     assertTrue("RGI_Emoji.contains(bicyclist+medium)", rgi.contains(u"��\U0001F3FD"));
730     assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)", rgi.contains(u"��\U0001F3FF\u200D✈\uFE0F"));
731 }
732 
TestIndicPositionalCategory()733 void UnicodeTest::TestIndicPositionalCategory() {
734     IcuTestErrorCode errorCode(*this, "TestIndicPositionalCategory()");
735     UnicodeSet na(u"[:InPC=NA:]", errorCode);
736     assertTrue("mostly NA", 1000000 <= na.size() && na.size() <= UCHAR_MAX_VALUE - 500);
737     UnicodeSet vol(u"[:InPC=Visual_Order_Left:]", errorCode);
738     assertTrue("some Visual_Order_Left", 19 <= vol.size() && vol.size() <= 100);
739     assertEquals("U+08FF: NA", U_INPC_NA,
740                  u_getIntPropertyValue(0x08FF, UCHAR_INDIC_POSITIONAL_CATEGORY));
741     assertEquals("U+0900: Top", U_INPC_TOP,
742                  u_getIntPropertyValue(0x0900, UCHAR_INDIC_POSITIONAL_CATEGORY));
743     assertEquals("U+10A06: Overstruck", U_INPC_OVERSTRUCK,
744                  u_getIntPropertyValue(0x10A06, UCHAR_INDIC_POSITIONAL_CATEGORY));
745 }
746 
TestIndicSyllabicCategory()747 void UnicodeTest::TestIndicSyllabicCategory() {
748     IcuTestErrorCode errorCode(*this, "TestIndicSyllabicCategory()");
749     UnicodeSet other(u"[:InSC=Other:]", errorCode);
750     assertTrue("mostly Other", 1000000 <= other.size() && other.size() <= UCHAR_MAX_VALUE - 500);
751     UnicodeSet ava(u"[:InSC=Avagraha:]", errorCode);
752     assertTrue("some Avagraha", 16 <= ava.size() && ava.size() <= 100);
753     assertEquals("U+08FF: Other", U_INSC_OTHER,
754                  u_getIntPropertyValue(0x08FF, UCHAR_INDIC_SYLLABIC_CATEGORY));
755     assertEquals("U+0900: Bindu", U_INSC_BINDU,
756                  u_getIntPropertyValue(0x0900, UCHAR_INDIC_SYLLABIC_CATEGORY));
757     assertEquals("U+11065: Brahmi_Joining_Number", U_INSC_BRAHMI_JOINING_NUMBER,
758                  u_getIntPropertyValue(0x11065, UCHAR_INDIC_SYLLABIC_CATEGORY));
759 }
760 
TestVerticalOrientation()761 void UnicodeTest::TestVerticalOrientation() {
762     IcuTestErrorCode errorCode(*this, "TestVerticalOrientation()");
763     UnicodeSet r(u"[:vo=R:]", errorCode);
764     assertTrue("mostly R", 0xb0000 <= r.size() && r.size() <= 0xd0000);
765     UnicodeSet u(u"[:vo=U:]", errorCode);
766     assertTrue("much U", 0x40000 <= u.size() && u.size() <= 0x60000);
767     UnicodeSet tu(u"[:vo=Tu:]", errorCode);
768     assertTrue("some Tu", 147 <= tu.size() && tu.size() <= 300);
769     assertEquals("U+0E01: Rotated", U_VO_ROTATED,
770                  u_getIntPropertyValue(0x0E01, UCHAR_VERTICAL_ORIENTATION));
771     assertEquals("U+3008: Transformed_Rotated", U_VO_TRANSFORMED_ROTATED,
772                  u_getIntPropertyValue(0x3008, UCHAR_VERTICAL_ORIENTATION));
773     assertEquals("U+33333: Upright", U_VO_UPRIGHT,
774                  u_getIntPropertyValue(0x33333, UCHAR_VERTICAL_ORIENTATION));
775 }
776 
TestDefaultScriptExtensions()777 void UnicodeTest::TestDefaultScriptExtensions() {
778     // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii
779     // but some of its characters revert to scx=<script> which is usually Common.
780     IcuTestErrorCode errorCode(*this, "TestDefaultScriptExtensions()");
781     UScriptCode scx[20];
782     scx[0] = USCRIPT_INVALID_CODE;
783     assertEquals("U+3000 num scx", 1,  // IDEOGRAPHIC SPACE
784                  uscript_getScriptExtensions(0x3000, scx, UPRV_LENGTHOF(scx), errorCode));
785     assertEquals("U+3000 num scx[0]", USCRIPT_COMMON, scx[0]);
786     scx[0] = USCRIPT_INVALID_CODE;
787     assertEquals("U+3012 num scx", 1,  // POSTAL MARK
788                  uscript_getScriptExtensions(0x3012, scx, UPRV_LENGTHOF(scx), errorCode));
789     assertEquals("U+3012 num scx[0]", USCRIPT_COMMON, scx[0]);
790 }
791 
TestInvalidCodePointFolding()792 void UnicodeTest::TestInvalidCodePointFolding() {
793     // Test behavior when an invalid code point is passed to u_foldCase
794     static const UChar32 invalidCodePoints[] = {
795             0xD800, // lead surrogate
796             0xDFFF, // trail surrogate
797             0xFDD0, // noncharacter
798             0xFFFF, // noncharacter
799             0x110000, // out of range
800             -1 // negative
801     };
802     for (int32_t i=0; i<UPRV_LENGTHOF(invalidCodePoints); ++i) {
803         UChar32 cp = invalidCodePoints[i];
804         assertEquals("Invalid code points should be echoed back",
805                 cp, u_foldCase(cp, U_FOLD_CASE_DEFAULT));
806         assertEquals("Invalid code points should be echoed back",
807                 cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I));
808     }
809 }
810 
TestBinaryCharacterProperties()811 void UnicodeTest::TestBinaryCharacterProperties() {
812 #if !UCONFIG_NO_NORMALIZATION
813     IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()");
814     // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
815     for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) {
816         const USet* uset = u_getBinaryPropertySet(static_cast<UProperty>(prop), errorCode);
817         if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", static_cast<int>(prop))) {
818             continue;
819         }
820         const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
821         int32_t count = set.getRangeCount();
822         if (count == 0) {
823             assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
824                 u_hasBinaryProperty(0x20, static_cast<UProperty>(prop)));
825             assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
826                 u_hasBinaryProperty(0x61, static_cast<UProperty>(prop)));
827             assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
828                 u_hasBinaryProperty(0x4e00, static_cast<UProperty>(prop)));
829         } else {
830             UChar32 c = set.getRangeStart(0);
831             if (c > 0) {
832                 assertFalse(
833                     UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) +
834                         u", " + prop + u")",
835                     u_hasBinaryProperty(c - 1, static_cast<UProperty>(prop)));
836             }
837             assertTrue(
838                 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
839                     u", " + prop + u")",
840                 u_hasBinaryProperty(c, static_cast<UProperty>(prop)));
841             c = set.getRangeEnd(count - 1);
842             assertTrue(
843                 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
844                     u", " + prop + u")",
845                 u_hasBinaryProperty(c, static_cast<UProperty>(prop)));
846             if (c < 0x10ffff) {
847                 assertFalse(
848                     UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) +
849                         u", " + prop + u")",
850                     u_hasBinaryProperty(c + 1, static_cast<UProperty>(prop)));
851             }
852         }
853     }
854 #endif
855 }
856 
TestIntCharacterProperties()857 void UnicodeTest::TestIntCharacterProperties() {
858 #if !UCONFIG_NO_NORMALIZATION
859     IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()");
860     // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
861     for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) {
862         const UCPMap* map = u_getIntPropertyMap(static_cast<UProperty>(prop), errorCode);
863         if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", static_cast<int>(prop))) {
864             continue;
865         }
866         uint32_t value;
867         UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
868         assertTrue("int property first range", end >= 0);
869         UChar32 c = end / 2;
870         assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c),
871             u_getIntPropertyValue(c, static_cast<UProperty>(prop)), value);
872         end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
873         assertTrue("int property later range", end >= 0);
874         assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end),
875             u_getIntPropertyValue(end, static_cast<UProperty>(prop)), value);
876         // ucpmap_get() API coverage
877         // TODO: move to cucdtst.c
878         assertEquals(
879             "int property upcmap_get(U+0061)",
880             u_getIntPropertyValue(0x61, static_cast<UProperty>(prop)), ucpmap_get(map, 0x61));
881     }
882 #endif
883 }
884 
885 namespace {
886 
getPropName(UProperty property,int32_t nameChoice)887 const char *getPropName(UProperty property, int32_t nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
888     const char* name = u_getPropertyName(property, static_cast<UPropertyNameChoice>(nameChoice));
889     return name != nullptr ? name : "null";
890 }
891 
getValueName(UProperty property,int32_t value,int32_t nameChoice)892 const char *getValueName(UProperty property, int32_t value, int32_t nameChoice)
893         UPRV_NO_SANITIZE_UNDEFINED {
894     const char* name = u_getPropertyValueName(property, value, static_cast<UPropertyNameChoice>(nameChoice));
895     return name != nullptr ? name : "null";
896 }
897 
898 }  // namespace
899 
TestPropertyNames()900 void UnicodeTest::TestPropertyNames() {
901     IcuTestErrorCode errorCode(*this, "TestPropertyNames()");
902     // Test names of certain properties & values.
903     // The UPropertyNameChoice is really an integer with only a couple of named constants.
904     UProperty prop = UCHAR_WHITE_SPACE;
905     constexpr int32_t SHORT = U_SHORT_PROPERTY_NAME;
906     constexpr int32_t LONG = U_LONG_PROPERTY_NAME;
907     assertEquals("White_Space: index -1", "null", getPropName(prop, -1));
908     assertEquals("White_Space: short", "WSpace", getPropName(prop, SHORT));
909     assertEquals("White_Space: long", "White_Space", getPropName(prop, LONG));
910     assertEquals("White_Space: index 2", "space", getPropName(prop, 2));
911     assertEquals("White_Space: index 3", "null", getPropName(prop, 3));
912 
913     prop = UCHAR_SIMPLE_CASE_FOLDING;
914     assertEquals("Simple_Case_Folding: index -1", "null", getPropName(prop, -1));
915     assertEquals("Simple_Case_Folding: short", "scf", getPropName(prop, SHORT));
916     assertEquals("Simple_Case_Folding: long", "Simple_Case_Folding", getPropName(prop, LONG));
917     assertEquals("Simple_Case_Folding: index 2", "sfc", getPropName(prop, 2));
918     assertEquals("Simple_Case_Folding: index 3", "null", getPropName(prop, 3));
919 
920     prop = UCHAR_CASED;
921     assertEquals("Cased=Y: index -1", "null", getValueName(prop, 1, -1));
922     assertEquals("Cased=Y: short", "Y", getValueName(prop, 1, SHORT));
923     assertEquals("Cased=Y: long", "Yes", getValueName(prop, 1, LONG));
924     assertEquals("Cased=Y: index 2", "T", getValueName(prop, 1, 2));
925     assertEquals("Cased=Y: index 3", "True", getValueName(prop, 1, 3));
926     assertEquals("Cased=Y: index 4", "null", getValueName(prop, 1, 4));
927 
928     prop = UCHAR_DECOMPOSITION_TYPE;
929     int32_t value = U_DT_NOBREAK;
930     assertEquals("dt=Nb: index -1", "null", getValueName(prop, value, -1));
931     assertEquals("dt=Nb: short", "Nb", getValueName(prop, value, SHORT));
932     assertEquals("dt=Nb: long", "Nobreak", getValueName(prop, value, LONG));
933     assertEquals("dt=Nb: index 2", "nb", getValueName(prop, value, 2));
934     assertEquals("dt=Nb: index 3", "null", getValueName(prop, value, 3));
935 
936     // Canonical_Combining_Class:
937     // The UCD inserts the numeric values in the second filed of its PropertyValueAliases.txt lines.
938     // In ICU, we don't treat these as names,
939     // they are just the numeric values returned by u_getCombiningClass().
940     // We return the real short and long names for the usual choice constants.
941     prop = UCHAR_CANONICAL_COMBINING_CLASS;
942     assertEquals("ccc=230: index -1", "null", getValueName(prop, 230, -1));
943     assertEquals("ccc=230: short", "A", getValueName(prop, 230, SHORT));
944     assertEquals("ccc=230: long", "Above", getValueName(prop, 230, LONG));
945     assertEquals("ccc=230: index 2", "null", getValueName(prop, 230, 2));
946 
947     prop = UCHAR_GENERAL_CATEGORY;
948     value = U_DECIMAL_DIGIT_NUMBER;
949     assertEquals("gc=Nd: index -1", "null", getValueName(prop, value, -1));
950     assertEquals("gc=Nd: short", "Nd", getValueName(prop, value, SHORT));
951     assertEquals("gc=Nd: long", "Decimal_Number", getValueName(prop, value, LONG));
952     assertEquals("gc=Nd: index 2", "digit", getValueName(prop, value, 2));
953     assertEquals("gc=Nd: index 3", "null", getValueName(prop, value, 3));
954 
955     prop = UCHAR_GENERAL_CATEGORY_MASK;
956     value = U_GC_P_MASK;
957     assertEquals("gc=P mask: index -1", "null", getValueName(prop, value, -1));
958     assertEquals("gc=P mask: short", "P", getValueName(prop, value, SHORT));
959     assertEquals("gc=P mask: long", "Punctuation", getValueName(prop, value, LONG));
960     assertEquals("gc=P mask: index 2", "punct", getValueName(prop, value, 2));
961     assertEquals("gc=P mask: index 3", "null", getValueName(prop, value, 3));
962 }
963 
TestIDSUnaryOperator()964 void UnicodeTest::TestIDSUnaryOperator() {
965     IcuTestErrorCode errorCode(*this, "TestIDSUnaryOperator()");
966     // New in Unicode 15.1 for just two characters.
967     assertFalse("U+2FFC IDSU", u_hasBinaryProperty(0x2ffc, UCHAR_IDS_UNARY_OPERATOR));
968     assertFalse("U+2FFD IDSU", u_hasBinaryProperty(0x2ffd, UCHAR_IDS_UNARY_OPERATOR));
969     assertTrue("U+2FFE IDSU", u_hasBinaryProperty(0x2ffe, UCHAR_IDS_UNARY_OPERATOR));
970     assertTrue("U+2FFF IDSU", u_hasBinaryProperty(0x2fff, UCHAR_IDS_UNARY_OPERATOR));
971     assertFalse("U+3000 IDSU", u_hasBinaryProperty(0x3000, UCHAR_IDS_UNARY_OPERATOR));
972     assertFalse("U+3001 IDSU", u_hasBinaryProperty(0x3001, UCHAR_IDS_UNARY_OPERATOR));
973 
974     // Property name works and gets the correct set.
975     UnicodeSet idsu(u"[:IDS_Unary_Operator:]", errorCode);
976     assertEquals("IDSU set number of characters", 2, idsu.size());
977     assertFalse("idsu.contains(U+2FFD)", idsu.contains(0x2ffd));
978     assertTrue("idsu.contains(U+2FFE)", idsu.contains(0x2ffe));
979     assertTrue("idsu.contains(U+2FFF)", idsu.contains(0x2fff));
980     assertFalse("idsu.contains(U+3000)", idsu.contains(0x3000));
981 }
982 
983 namespace {
984 
isMathStart(UChar32 c)985 bool isMathStart(UChar32 c) {
986     return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_START);
987 }
988 
isMathContinue(UChar32 c)989 bool isMathContinue(UChar32 c) {
990     return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_CONTINUE);
991 }
992 
993 }  // namespace
994 
TestIDCompatMath()995 void UnicodeTest::TestIDCompatMath() {
996     IcuTestErrorCode errorCode(*this, "TestIDCompatMath()");
997     assertFalse("U+00B1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb1));
998     assertTrue("U+00B2 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb2));
999     assertTrue("U+00B3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb3));
1000     assertFalse("U+00B4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb4));
1001     assertFalse("U+207F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x207f));
1002     assertTrue("U+2080 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2080));
1003     assertTrue("U+208E UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208e));
1004     assertFalse("U+208F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208f));
1005     assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2201));
1006     assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2202));
1007     assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D6C1));
1008     assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C3));
1009     assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C4));
1010 
1011     assertFalse("U+00B2 UCHAR_ID_COMPAT_MATH_START", isMathStart(0xb2));
1012     assertFalse("U+2080 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2080));
1013     assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2201));
1014     assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2202));
1015     assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D6C1));
1016     assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C3));
1017     assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C4));
1018 
1019     // Property names work and get the correct sets.
1020     UnicodeSet idcmStart(u"[:ID_Compat_Math_Start:]", errorCode);
1021     UnicodeSet idcmContinue(u"[:ID_Compat_Math_Continue:]", errorCode);
1022     assertEquals("ID_Compat_Math_Start set number of characters", 13, idcmStart.size());
1023     assertEquals("ID_Compat_Math_Continue set number of characters", 43, idcmContinue.size());
1024     assertTrue("ID_Compat_Math_Start is a subset of ID_Compat_Math_Continue",
1025                idcmContinue.containsAll(idcmStart));
1026     assertFalse("idcmContinue.contains(U+207F)", idcmContinue.contains(0x207f));
1027     assertTrue("idcmContinue.contains(U+2080)", idcmContinue.contains(0x2080));
1028     assertTrue("idcmContinue.contains(U+208E)", idcmContinue.contains(0x208e));
1029     assertFalse("idcmContinue.contains(U+208F)", idcmContinue.contains(0x208f));
1030     assertFalse("idcmStart.contains(U+2201)", idcmStart.contains(0x2201));
1031     assertTrue("idcmStart.contains(U+2202)", idcmStart.contains(0x2202));
1032     assertTrue("idcmStart.contains(U+1D7C3)", idcmStart.contains(0x1D7C3));
1033     assertFalse("idcmStart.contains(U+1D7C4)", idcmStart.contains(0x1D7C4));
1034 }
1035 
1036 U_NAMESPACE_BEGIN
1037 
1038 class BuiltInPropertyNames : public PropertyNames {
1039 public:
~BuiltInPropertyNames()1040     ~BuiltInPropertyNames() override {}
1041 
getPropertyEnum(const char * name) const1042     int32_t getPropertyEnum(const char *name) const override {
1043         return u_getPropertyEnum(name);
1044     }
1045 
getPropertyValueEnum(int32_t property,const char * name) const1046     int32_t getPropertyValueEnum(int32_t property, const char *name) const override {
1047         return u_getPropertyValueEnum(static_cast<UProperty>(property), name);
1048     }
1049 };
1050 
1051 U_NAMESPACE_END
1052 
TestPropertiesUsingPpucd()1053 void UnicodeTest::TestPropertiesUsingPpucd() {
1054     IcuTestErrorCode errorCode(*this, "TestPropertiesUsingPpucd()");
1055 
1056     // Initialize PPUCD parsing object using file in repo and using
1057     // property names present in built-in data in ICU
1058     char buffer[500];
1059     // get path to `source/data/unidata/` including trailing `/`
1060     char *unidataPath = getUnidataPath(buffer);
1061     if(unidataPath == nullptr) {
1062         errln("exiting early because unable to open ppucd.txt from ICU source tree");
1063         return;
1064     }
1065     CharString ppucdPath(unidataPath, errorCode);
1066     ppucdPath.appendPathPart("ppucd.txt", errorCode);
1067     PreparsedUCD ppucd(ppucdPath.data(), errorCode);
1068     if(errorCode.isFailure()) {
1069         errln("unable to open %s - %s\n",
1070             ppucdPath.data(), errorCode.errorName());
1071         return;
1072     }
1073     BuiltInPropertyNames builtInPropNames;
1074     ppucd.setPropertyNames(&builtInPropNames);
1075 
1076     // Define which properties we want to compare
1077     struct TestProp {
1078         const UProperty prop;
1079         const int32_t value = 1;  // binary "Yes"
1080         UnicodeSet set;
1081 
1082         TestProp(UProperty binaryProp) : prop(binaryProp) {}
1083         TestProp(UProperty intProp, int32_t v) : prop(intProp), value(v) {}
1084         bool isBinary() const { return prop < UCHAR_BINARY_LIMIT; }
1085     };
1086     TestProp propsUnderTest[] = {
1087         { UCHAR_IDS_UNARY_OPERATOR },
1088         { UCHAR_ID_COMPAT_MATH_START },
1089         { UCHAR_ID_COMPAT_MATH_CONTINUE },
1090 #if !UCONFIG_NO_NORMALIZATION
1091         { UCHAR_NFD_QUICK_CHECK, UNORM_NO },
1092         { UCHAR_NFKD_QUICK_CHECK, UNORM_NO },
1093         { UCHAR_NFC_QUICK_CHECK, UNORM_NO },
1094         { UCHAR_NFKC_QUICK_CHECK, UNORM_NO },
1095         { UCHAR_NFC_QUICK_CHECK, UNORM_MAYBE },
1096         { UCHAR_NFKC_QUICK_CHECK, UNORM_MAYBE },
1097 #endif  // !UCONFIG_NO_NORMALIZATION
1098         { UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
1099         { UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
1100         { UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
1101         { UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
1102         { UCHAR_MODIFIER_COMBINING_MARK },
1103     };
1104 
1105     // Iterate through PPUCD file, accumulating each line's data into each UnicodeSet per property
1106     PreparsedUCD::LineType lineType;
1107     UnicodeSet newValues;
1108     while((lineType=ppucd.readLine(errorCode))!=PreparsedUCD::NO_LINE && errorCode.isSuccess()) {
1109         if(ppucd.lineHasPropertyValues()) {
1110             const UniProps *lineProps=ppucd.getProps(newValues, errorCode);
1111 
1112             for (auto &tp : propsUnderTest) {
1113                 if (!newValues.contains(tp.prop)) {
1114                     continue;
1115                 }
1116                 bool match = tp.isBinary() ?
1117                     lineProps->binProps[tp.prop] :
1118                     lineProps->getIntProp(tp.prop) == tp.value;
1119                 if (match) {
1120                     tp.set.add(lineProps->start, lineProps->end);
1121                 } else {
1122                     tp.set.remove(lineProps->start, lineProps->end);
1123                 }
1124             }
1125         }
1126     }
1127 
1128     if(errorCode.isFailure()) {
1129         errln("exiting early due to parsing error");
1130         return;
1131     }
1132 
1133     // Assert that the PPUCD data and the ICU data are equivalent for all properties
1134     for (auto &tp : propsUnderTest) {
1135         UnicodeSet icuPropSet;
1136         icuPropSet.applyIntPropertyValue(tp.prop, tp.value, errorCode);
1137         std::string msg =
1138             std::string()
1139             + "ICU & PPUCD versions of "
1140             + u_getPropertyName(tp.prop, U_LONG_PROPERTY_NAME);
1141         if (!tp.isBinary()) {
1142             msg = msg + "=" + u_getPropertyValueName(tp.prop, tp.value, U_LONG_PROPERTY_NAME);
1143         }
1144         UnicodeSetTest::checkEqual(*this, tp.set, icuPropSet, msg.c_str());
1145     }
1146 }
1147 
1148 namespace {
1149 
getIDStatus(UChar32 c)1150 int32_t getIDStatus(UChar32 c) {
1151     return u_getIntPropertyValue(c, UCHAR_IDENTIFIER_STATUS);
1152 }
1153 
1154 }  // namespace
1155 
TestIDStatus()1156 void UnicodeTest::TestIDStatus() {
1157     IcuTestErrorCode errorCode(*this, "TestIDStatus()");
1158     assertEquals("ID_Status(slash)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x2F));
1159     assertEquals("ID_Status(digit 0)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x30));
1160     assertEquals("ID_Status(colon)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x3A));
1161     assertEquals("ID_Status(semicolon)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x3B));
1162     assertEquals("ID_Status(Greek small alpha)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x03B1));
1163     assertEquals("ID_Status(Greek small archaic koppa)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x03D9));
1164     assertEquals("ID_Status(Hangul syllable)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0xAC00));
1165     assertEquals("ID_Status(surrogate)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0xD800));
1166     assertEquals("ID_Status(Arabic tail fragment)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0xFE73));
1167     assertEquals("ID_Status(Hentaigana ko-3)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x1B03A));
1168     assertEquals("ID_Status(Katakana small ko)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x1B155));
1169     assertEquals("ID_Status(U+2EE5D)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x2EE5D));
1170     assertEquals("ID_Status(U+10FFFF)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x10FFFF));
1171 
1172     // Property names work and get the correct sets.
1173     UnicodeSet idStatus(u"[:Identifier_Status=Allowed:]", errorCode);
1174     // Unicode 15.1: 112778 Allowed characters; normally grows over time
1175     assertTrue("Allowed number of characters", idStatus.size() >= 112778);
1176     assertFalse("Allowed.contains(slash)", idStatus.contains(0x2F));
1177     assertTrue("Allowed.contains(digit 0)", idStatus.contains(0x30));
1178     assertTrue("Allowed.contains(colon)", idStatus.contains(0x3A));
1179     assertFalse("Allowed.contains(semicolon)", idStatus.contains(0x3B));
1180     assertTrue("Allowed.contains(Greek small alpha)", idStatus.contains(0x03B1));
1181     assertFalse("Allowed.contains(Greek small archaic koppa)", idStatus.contains(0x03D9));
1182     assertTrue("Allowed.contains(Hangul syllable)", idStatus.contains(0xAC00));
1183     assertFalse("Allowed.contains(surrogate)", idStatus.contains(0xD800));
1184     assertFalse("Allowed.contains(Arabic tail fragment)", idStatus.contains(0xFE73));
1185     assertFalse("Allowed.contains(Hentaigana ko-3)", idStatus.contains(0x1B03A));
1186     assertTrue("Allowed.contains(Katakana small ko)", idStatus.contains(0x1B155));
1187     assertTrue("Allowed.contains(U+2EE5D)", idStatus.contains(0x2EE5D));
1188     assertFalse("Allowed.contains(U+10FFFF)", idStatus.contains(0x10FFFF));
1189 }
1190 
1191 namespace {
1192 
getIDTypes(UChar32 c)1193 UnicodeString getIDTypes(UChar32 c) {
1194     UErrorCode errorCode = U_ZERO_ERROR;
1195     UIdentifierType types[10];
1196     int32_t length = u_getIDTypes(c, types, UPRV_LENGTHOF(types), &errorCode);
1197     if (U_FAILURE(errorCode)) {
1198         return UnicodeString(u_errorName(errorCode), -1, US_INV);
1199     }
1200     // The order of values is undefined, but for simplicity we assume the order
1201     // that the current implementation yields. Otherwise we would have to sort the values.
1202     uint32_t typeBits = 0;
1203     UnicodeString result;
1204     for (int32_t i = 0; i < length; ++i) {
1205         if (i != 0) {
1206             result.append(u' ');
1207         }
1208         auto t = types[i];
1209         typeBits |= 1UL << t;
1210         const char *s = u_getPropertyValueName(UCHAR_IDENTIFIER_TYPE, t, U_LONG_PROPERTY_NAME);
1211         if (s != nullptr) {
1212             result.append(UnicodeString(s, -1, US_INV));
1213         } else {
1214             result.append(u"???");
1215         }
1216     }
1217     // Check that u_hasIDType() agrees.
1218     // Includes undefined behavior with t > largest enum constant.
1219     for (int32_t i = 0; i < 16; ++i) {
1220         UIdentifierType t = static_cast<UIdentifierType>(i);
1221         bool expected = (typeBits & (1UL << i)) != 0;
1222         bool actual = u_hasIDType(c, t);
1223         if (actual != expected) {
1224             result.append(u" != u_hasIDType() ");
1225             result = result + i;
1226             break;
1227         }
1228     }
1229     return result;
1230 }
1231 
1232 }  // namespace
1233 
TestIDType()1234 void UnicodeTest::TestIDType() {
1235     IcuTestErrorCode errorCode(*this, "TestIDType()");
1236     // Note: Types other than Recommended and Inclusion may well change over time.
1237     assertEquals("ID_Type(slash)", u"Not_XID", getIDTypes(0x2F));
1238     assertEquals("ID_Type(digit 0)", u"Recommended", getIDTypes(0x30));
1239     assertEquals("ID_Type(colon)", u"Inclusion", getIDTypes(0x3A));
1240     assertEquals("ID_Type(semicolon)", u"Not_XID", getIDTypes(0x3B));
1241     assertEquals("ID_Type(Greek small alpha)", u"Recommended", getIDTypes(0x03B1));
1242     assertEquals("ID_Type(Greek small archaic koppa)", u"Obsolete", getIDTypes(0x03D9));
1243     assertEquals("ID_Type(Hangul syllable)", u"Recommended", getIDTypes(0xAC00));
1244     assertEquals("ID_Type(surrogate)", u"Not_Character", getIDTypes(0xD800));
1245     assertEquals("ID_Type(Arabic tail fragment)", u"Technical", getIDTypes(0xFE73));
1246     assertEquals("ID_Type(Linear B syllable)", u"Exclusion", getIDTypes(0x10000));
1247     assertEquals("ID_Type(Hentaigana ko-3)", u"Obsolete", getIDTypes(0x1B03A));
1248     assertEquals("ID_Type(Katakana small ko)", u"Recommended", getIDTypes(0x1B155));
1249     assertEquals("ID_Type(U+2EE5D)", u"Recommended", getIDTypes(0x2EE5D));
1250     assertEquals("ID_Type(U+10FFFF)", u"Not_Character", getIDTypes(0x10FFFF));
1251 
1252     assertEquals("ID_Type(CYRILLIC THOUSANDS SIGN)", u"Not_XID Obsolete", getIDTypes(0x0482));
1253     assertEquals("ID_Type(SYRIAC FEMININE DOT)", u"Technical Limited_Use", getIDTypes(0x0740));
1254     assertEquals("ID_Type(NKO LETTER JONA JA)", u"Obsolete Limited_Use", getIDTypes(0x07E8));
1255     assertEquals("ID_Type(SYRIAC END OF PARAGRAPH)", u"Not_XID Limited_Use", getIDTypes(0x0700));
1256     assertEquals("ID_Type(LATIN SMALL LETTER EZH)=", u"Technical Uncommon_Use", getIDTypes(0x0292));
1257     assertEquals("ID_Type(MUSICAL SYMBOL KIEVAN C CLEF)", u"Not_XID Technical Uncommon_Use", getIDTypes(0x1D1DE));
1258     assertEquals("ID_Type(MRO LETTER TA)", u"Exclusion Uncommon_Use", getIDTypes(0x16A40));
1259     assertEquals("ID_Type(GREEK MUSICAL LEIMMA)", u"Not_XID Obsolete", getIDTypes(0x1D245));
1260 
1261     // error handling
1262     UIdentifierType types[2];
1263     UErrorCode failure = U_ZERO_ERROR;
1264     u_getIDTypes(0, types, -1, &failure);
1265     assertEquals("u_getIDTypes(capacity<0)", U_ILLEGAL_ARGUMENT_ERROR, failure);
1266 
1267     failure = U_ZERO_ERROR;
1268     u_getIDTypes(0, nullptr, 1, &failure);
1269     assertEquals("u_getIDTypes(nullptr)", U_ILLEGAL_ARGUMENT_ERROR, failure);
1270 
1271     failure = U_ZERO_ERROR;
1272     int32_t length = u_getIDTypes(0x30, types, 0, &failure);
1273     assertEquals("u_getIDTypes(digit 0, capacity 0) overflow", U_BUFFER_OVERFLOW_ERROR, failure);
1274     assertEquals("u_getIDTypes(digit 0, capacity 0) length", 1, length);
1275 
1276     failure = U_ZERO_ERROR;
1277     length = u_getIDTypes(0x1D1DE, types, 0, &failure);
1278     assertEquals("u_getIDTypes(Kievan C clef, capacity 2) overflow", U_BUFFER_OVERFLOW_ERROR, failure);
1279     assertEquals("u_getIDTypes(Kievan C clef, capacity 2) length", 3, length);
1280 
1281     // Property names work and get the correct sets.
1282     UnicodeSet rec(u"[:Identifier_Type=Recommended:]", errorCode);
1283     UnicodeSet incl(u"[:Identifier_Type=Inclusion:]", errorCode);
1284     UnicodeSet limited(u"[:Identifier_Type=Limited_Use:]", errorCode);
1285     UnicodeSet uncommon(u"[:Identifier_Type=Uncommon_Use:]", errorCode);
1286     UnicodeSet notChar(u"[:Identifier_Type=Not_Character:]", errorCode);
1287     // Unicode 15.1 set sizes; normally grows over time except Not_Character shrinks
1288     assertTrue("Recommended number of characters", rec.size() >= 112761);
1289     assertTrue("Inclusion number of characters", incl.size() >= 17);
1290     assertTrue("Limited_Use number of characters", limited.size() >= 5268);
1291     assertTrue("Uncommon_Use number of characters", uncommon.size() >= 398);
1292     assertTrue("Not_Character number of characters",
1293                800000 <= notChar.size() && notChar.size() <= 964293);
1294     assertFalse("Recommended.contains(slash)", rec.contains(0x2F));
1295     assertTrue("Recommended.contains(digit 0)", rec.contains(0x30));
1296     assertTrue("Inclusion.contains(colon)", incl.contains(0x3A));
1297     assertTrue("Recommended.contains(U+2EE5D)", rec.contains(0x2EE5D));
1298     assertTrue("Limited_Use.contains(SYRIAC FEMININE DOT)", limited.contains(0x0740));
1299     assertTrue("Limited_Use.contains(NKO LETTER JONA JA)", limited.contains(0x7E8));
1300     assertTrue("Not_Character.contains(surrogate)", notChar.contains(0xd800));
1301     assertTrue("Not_Character.contains(U+10FFFF)", notChar.contains(0x10FFFF));
1302     assertTrue("Uncommon_Use.contains(LATIN SMALL LETTER EZH)", uncommon.contains(0x0292));
1303     assertTrue("Uncommon_Use.contains(MUSICAL SYMBOL KIEVAN C CLEF)", uncommon.contains(0x1D1DE));
1304 
1305     // More mutually exclusive types, including some otherwise combinable ones.
1306     UnicodeSet dep(u"[:Identifier_Type=Deprecated:]", errorCode);
1307     UnicodeSet di(u"[:Identifier_Type=Default_Ignorable:]", errorCode);
1308     UnicodeSet notNFKC(u"[:Identifier_Type=Not_NFKC:]", errorCode);
1309     UnicodeSet excl(u"[:Identifier_Type=Exclusion:]", errorCode);
1310     UnicodeSet allExclusive;
1311     allExclusive.addAll(rec).addAll(incl).addAll(limited).addAll(excl).
1312         addAll(notNFKC).addAll(di).addAll(dep).addAll(notChar);
1313     assertEquals("num chars in mutually exclusive types",
1314                 rec.size() + incl.size() + limited.size() + excl.size() +
1315                     notNFKC.size() + di.size() + dep.size() + notChar.size(),
1316                 allExclusive.size());
1317 }
1318