1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 1997-2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
7
8 #include "unicode/ustring.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/uniset.h"
12 #include "unicode/putil.h"
13 #include "unicode/uscript.h"
14 #include "unicode/uset.h"
15 #include "charstr.h"
16 #include "cstring.h"
17 #include "hash.h"
18 #include "patternprops.h"
19 #include "ppucd.h"
20 #include "normalizer2impl.h"
21 #include "testutil.h"
22 #include "uparse.h"
23 #include "ucdtest.h"
24 #include "usettest.h"
25
26 #include <iostream>
27
// Property names that appear in the derived-properties data files but are
// deliberately not checked by this test. The constructor seeds
// unknownPropertyNames with them so derivedPropsLineFn() does not report them.
// Both the strings and the pointers are immutable, like derivedPropsNames[].
static const char *const ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC",
    "NFC_QC",
    "NFKD_QC",
    "NFKC_QC",
    "Expands_On_NFD",
    "Expands_On_NFC",
    "Expands_On_NFKD",
    "Expands_On_NFKC",
    "InCB",
    "NFKC_CF",
    "NFKC_SCF"
};
42
UnicodeTest()43 UnicodeTest::UnicodeTest()
44 {
45 UErrorCode errorCode=U_ZERO_ERROR;
46 unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
47 if(U_FAILURE(errorCode)) {
48 delete unknownPropertyNames;
49 unknownPropertyNames=nullptr;
50 }
51 // Ignore some property names altogether.
52 for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) {
53 unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
54 }
55 }
56
~UnicodeTest()57 UnicodeTest::~UnicodeTest()
58 {
59 delete unknownPropertyNames;
60 }
61
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)62 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
63 {
64 if(exec) {
65 logln("TestSuite UnicodeTest: ");
66 }
67 TESTCASE_AUTO_BEGIN;
68 TESTCASE_AUTO(TestAdditionalProperties);
69 TESTCASE_AUTO(TestBinaryValues);
70 TESTCASE_AUTO(TestConsistency);
71 TESTCASE_AUTO(TestPatternProperties);
72 TESTCASE_AUTO(TestScriptMetadata);
73 TESTCASE_AUTO(TestBidiPairedBracketType);
74 TESTCASE_AUTO(TestEmojiProperties);
75 TESTCASE_AUTO(TestEmojiPropertiesOfStrings);
76 TESTCASE_AUTO(TestIndicPositionalCategory);
77 TESTCASE_AUTO(TestIndicSyllabicCategory);
78 TESTCASE_AUTO(TestVerticalOrientation);
79 TESTCASE_AUTO(TestDefaultScriptExtensions);
80 TESTCASE_AUTO(TestInvalidCodePointFolding);
81 #if !UCONFIG_NO_NORMALIZATION
82 TESTCASE_AUTO(TestBinaryCharacterProperties);
83 TESTCASE_AUTO(TestIntCharacterProperties);
84 #endif
85 TESTCASE_AUTO(TestPropertyNames);
86 TESTCASE_AUTO(TestIDSUnaryOperator);
87 TESTCASE_AUTO(TestIDCompatMath);
88 TESTCASE_AUTO(TestPropertiesUsingPpucd);
89 TESTCASE_AUTO(TestIDStatus);
90 TESTCASE_AUTO(TestIDType);
91 TESTCASE_AUTO_END;
92 }
93
94 //====================================================
95 // private data used by the tests
96 //====================================================
97
98 // test DerivedCoreProperties.txt -------------------------------------------
99
100 // copied from genprops.c
101 static int32_t
getTokenIndex(const char * const tokens[],int32_t countTokens,const char * s)102 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
103 const char *t, *z;
104 int32_t i, j;
105
106 s=u_skipWhitespace(s);
107 for(i=0; i<countTokens; ++i) {
108 t=tokens[i];
109 if(t!=nullptr) {
110 for(j=0;; ++j) {
111 if(t[j]!=0) {
112 if(s[j]!=t[j]) {
113 break;
114 }
115 } else {
116 z=u_skipWhitespace(s+j);
117 if(*z==';' || *z==0) {
118 return i;
119 } else {
120 break;
121 }
122 }
123 }
124 }
125 }
126 return -1;
127 }
128
// Names of the derived binary properties that TestAdditionalProperties()
// verifies, as they are spelled in the data files.
// Entry n here corresponds to entry n of derivedPropsIndex[].
static const char *const derivedPropsNames[]={
    "Math",
    "Alphabetic",
    "Lowercase",
    "Uppercase",

    "ID_Start",
    "ID_Continue",
    "XID_Start",
    "XID_Continue",

    "Default_Ignorable_Code_Point",
    "Full_Composition_Exclusion",

    "Grapheme_Extend",
    "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
    "Grapheme_Base",

    "Cased",
    "Case_Ignorable",
    "Changes_When_Lowercased",
    "Changes_When_Uppercased",
    "Changes_When_Titlecased",
    "Changes_When_Casefolded",
    "Changes_When_Casemapped",
    "Changes_When_NFKC_Casefolded"
};
153
154 static const UProperty
155 derivedPropsIndex[]={
156 UCHAR_MATH,
157 UCHAR_ALPHABETIC,
158 UCHAR_LOWERCASE,
159 UCHAR_UPPERCASE,
160 UCHAR_ID_START,
161 UCHAR_ID_CONTINUE,
162 UCHAR_XID_START,
163 UCHAR_XID_CONTINUE,
164 UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
165 UCHAR_FULL_COMPOSITION_EXCLUSION,
166 UCHAR_GRAPHEME_EXTEND,
167 UCHAR_GRAPHEME_LINK,
168 UCHAR_GRAPHEME_BASE,
169 UCHAR_CASED,
170 UCHAR_CASE_IGNORABLE,
171 UCHAR_CHANGES_WHEN_LOWERCASED,
172 UCHAR_CHANGES_WHEN_UPPERCASED,
173 UCHAR_CHANGES_WHEN_TITLECASED,
174 UCHAR_CHANGES_WHEN_CASEFOLDED,
175 UCHAR_CHANGES_WHEN_CASEMAPPED,
176 UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
177 };
178
179 static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 };
180
181 enum { MAX_ERRORS=50 };
182
183 U_CFUNC void U_CALLCONV
derivedPropsLineFn(void * context,char * fields[][2],int32_t,UErrorCode * pErrorCode)184 derivedPropsLineFn(void *context,
185 char *fields[][2], int32_t /* fieldCount */,
186 UErrorCode *pErrorCode)
187 {
188 UnicodeTest *me=static_cast<UnicodeTest*>(context);
189 uint32_t start, end;
190 int32_t i;
191
192 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
193 if(U_FAILURE(*pErrorCode)) {
194 me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
195 return;
196 }
197
198 /* parse derived binary property name, ignore unknown names */
199 i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]);
200 if(i<0) {
201 UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
202 propName.trim();
203 if(me->unknownPropertyNames->find(propName)==nullptr) {
204 UErrorCode errorCode=U_ZERO_ERROR;
205 me->unknownPropertyNames->puti(propName, 1, errorCode);
206 me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
207 }
208 return;
209 }
210
211 me->derivedProps[i].add(start, end);
212 }
213
TestAdditionalProperties()214 void UnicodeTest::TestAdditionalProperties() {
215 #if !UCONFIG_NO_NORMALIZATION
216 // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
217 if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) {
218 errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
219 UPRV_LENGTHOF(derivedPropsNames));
220 return;
221 }
222 if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) {
223 errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n");
224 return;
225 }
226
227 char path[500];
228 if(getUnidataPath(path) == nullptr) {
229 errln("unable to find path to source/data/unidata/");
230 return;
231 }
232 char *basename=strchr(path, 0);
233 strcpy(basename, "DerivedCoreProperties.txt");
234
235 char *fields[2][2];
236 UErrorCode errorCode=U_ZERO_ERROR;
237 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
238 if(U_FAILURE(errorCode)) {
239 errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
240 return;
241 }
242
243 strcpy(basename, "DerivedNormalizationProps.txt");
244 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
245 if(U_FAILURE(errorCode)) {
246 errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
247 return;
248 }
249
250 // now we have all derived core properties in the UnicodeSets
251 // run them all through the API
252 int32_t rangeCount, range;
253 uint32_t i;
254 UChar32 start, end;
255
256 // test all true properties
257 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
258 rangeCount=derivedProps[i].getRangeCount();
259 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
260 start=derivedProps[i].getRangeStart(range);
261 end=derivedProps[i].getRangeEnd(range);
262 for(; start<=end; ++start) {
263 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
264 dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==false is wrong", start, derivedPropsNames[i]);
265 if(++numErrors[i]>=MAX_ERRORS) {
266 dataerrln("Too many errors, moving to the next test");
267 break;
268 }
269 }
270 }
271 }
272 }
273
274 // invert all properties
275 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
276 derivedProps[i].complement();
277 }
278
279 // test all false properties
280 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
281 rangeCount=derivedProps[i].getRangeCount();
282 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
283 start=derivedProps[i].getRangeStart(range);
284 end=derivedProps[i].getRangeEnd(range);
285 for(; start<=end; ++start) {
286 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
287 errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==true is wrong\n", start, derivedPropsNames[i]);
288 if(++numErrors[i]>=MAX_ERRORS) {
289 errln("Too many errors, moving to the next test");
290 break;
291 }
292 }
293 }
294 }
295 }
296 #endif /* !UCONFIG_NO_NORMALIZATION */
297 }
298
TestBinaryValues()299 void UnicodeTest::TestBinaryValues() {
300 /*
301 * Unicode 5.1 explicitly defines binary property value aliases.
302 * Verify that they are all recognized.
303 */
304 UErrorCode errorCode=U_ZERO_ERROR;
305 UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
306 if(U_FAILURE(errorCode)) {
307 dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
308 return;
309 }
310
311 static const char *const falseValues[]={ "N", "No", "F", "False" };
312 static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
313 int32_t i;
314 for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) {
315 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
316 pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
317 errorCode=U_ZERO_ERROR;
318 UnicodeSet set(pattern, errorCode);
319 if(U_FAILURE(errorCode)) {
320 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
321 continue;
322 }
323 set.complement();
324 if(set!=alpha) {
325 errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
326 }
327 }
328 for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) {
329 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
330 pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
331 errorCode=U_ZERO_ERROR;
332 UnicodeSet set(pattern, errorCode);
333 if(U_FAILURE(errorCode)) {
334 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
335 continue;
336 }
337 if(set!=alpha) {
338 errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
339 }
340 }
341 }
342
TestConsistency()343 void UnicodeTest::TestConsistency() {
344 #if !UCONFIG_NO_NORMALIZATION
345 /*
346 * Test for an example that getCanonStartSet() delivers
347 * all characters that compose from the input one,
348 * even in multiple steps.
349 * For example, the set for "I" (0049) should contain both
350 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
351 * In general, the set for the middle such character should be a subset
352 * of the set for the first.
353 */
354 IcuTestErrorCode errorCode(*this, "TestConsistency");
355 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
356 const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
357 if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) {
358 dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
359 errorCode.errorName());
360 errorCode.reset();
361 return;
362 }
363
364 UnicodeSet set1, set2;
365 if (nfcImpl->getCanonStartSet(0x49, set1)) {
366 /* enumerate all characters that are plausible to be latin letters */
367 for(char16_t start=0xa0; start<0x2000; ++start) {
368 UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
369 if(decomp.length()>1 && decomp[0]==0x49) {
370 set2.add(start);
371 }
372 }
373
374 if (set1!=set2) {
375 errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
376 }
377 // This was available in cucdtst.c but the test had to move to intltest
378 // because the new internal normalization functions are in C++.
379 //compareUSets(set1, set2,
380 // "[canon start set of 0049]", "[all c with canon decomp with 0049]",
381 // true);
382 } else {
383 errln("NFC.getCanonStartSet() returned false");
384 }
385 #endif
386 }
387
388 /**
389 * Test various implementations of Pattern_Syntax & Pattern_White_Space.
390 */
TestPatternProperties()391 void UnicodeTest::TestPatternProperties() {
392 IcuTestErrorCode errorCode(*this, "TestPatternProperties()");
393 UnicodeSet syn_pp;
394 UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode);
395 UnicodeSet syn_list(
396 "[!-/\\:-@\\[-\\^`\\{-~"
397 "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7"
398 "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775"
399 "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode);
400 UnicodeSet ws_pp;
401 UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode);
402 UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode);
403 UnicodeSet syn_ws_pp;
404 UnicodeSet syn_ws_prop(syn_prop);
405 syn_ws_prop.addAll(ws_prop);
406 for(UChar32 c=0; c<=0xffff; ++c) {
407 if(PatternProps::isSyntax(c)) {
408 syn_pp.add(c);
409 }
410 if(PatternProps::isWhiteSpace(c)) {
411 ws_pp.add(c);
412 }
413 if(PatternProps::isSyntaxOrWhiteSpace(c)) {
414 syn_ws_pp.add(c);
415 }
416 }
417 compareUSets(syn_pp, syn_prop,
418 "PatternProps.isSyntax()", "[:Pattern_Syntax:]", true);
419 compareUSets(syn_pp, syn_list,
420 "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", true);
421 compareUSets(ws_pp, ws_prop,
422 "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", true);
423 compareUSets(ws_pp, ws_list,
424 "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", true);
425 compareUSets(syn_ws_pp, syn_ws_prop,
426 "PatternProps.isSyntaxOrWhiteSpace()",
427 "[[:Pattern_Syntax:][:Pattern_White_Space:]]", true);
428 }
429
430 // So far only minimal port of Java & cucdtst.c compareUSets().
431 UBool
compareUSets(const UnicodeSet & a,const UnicodeSet & b,const char * a_name,const char * b_name,UBool diffIsError)432 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b,
433 const char *a_name, const char *b_name,
434 UBool diffIsError) {
435 UBool same= a==b;
436 if(!same && diffIsError) {
437 errln("Sets are different: %s vs. %s\n", a_name, b_name);
438 }
439 return same;
440 }
441
442 namespace {
443
444 /**
445 * Maps a special script code to the most common script of its encoded characters.
446 */
getCharScript(UScriptCode script)447 UScriptCode getCharScript(UScriptCode script) {
448 switch(script) {
449 case USCRIPT_HAN_WITH_BOPOMOFO:
450 case USCRIPT_SIMPLIFIED_HAN:
451 case USCRIPT_TRADITIONAL_HAN:
452 return USCRIPT_HAN;
453 case USCRIPT_JAPANESE:
454 return USCRIPT_HIRAGANA;
455 case USCRIPT_JAMO:
456 case USCRIPT_KOREAN:
457 return USCRIPT_HANGUL;
458 case USCRIPT_SYMBOLS_EMOJI:
459 return USCRIPT_SYMBOLS;
460 default:
461 return script;
462 }
463 }
464
465 } // namespace
466
TestScriptMetadata()467 void UnicodeTest::TestScriptMetadata() {
468 IcuTestErrorCode errorCode(*this, "TestScriptMetadata()");
469 UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode);
470 // So far, sample characters are uppercase.
471 // Georgian is special.
472 UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode);
473 for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) {
474 UScriptCode sc = static_cast<UScriptCode>(sci);
475 // Run the test with -v to see which script has failures:
476 // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 6 FAIL
477 logln(uscript_getShortName(sc));
478 UScriptUsage usage = uscript_getUsage(sc);
479 UnicodeString sample = uscript_getSampleUnicodeString(sc);
480 UnicodeSet scriptSet;
481 scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode);
482 if(usage == USCRIPT_USAGE_NOT_ENCODED) {
483 assertTrue("not encoded, no sample", sample.isEmpty());
484 assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc));
485 assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc));
486 assertFalse("not encoded, not cased", uscript_isCased(sc));
487 assertTrue("not encoded, no characters", scriptSet.isEmpty());
488 } else {
489 assertFalse("encoded, has a sample character", sample.isEmpty());
490 UChar32 firstChar = sample.char32At(0);
491 UScriptCode charScript = getCharScript(sc);
492 assertEquals("script(sample(script))",
493 static_cast<int32_t>(charScript), static_cast<int32_t>(uscript_getScript(firstChar, errorCode)));
494 assertEquals("RTL vs. set", rtl.contains(firstChar), uscript_isRightToLeft(sc));
495 assertEquals("cased vs. set", cased.contains(firstChar), uscript_isCased(sc));
496 assertEquals("encoded, has characters", static_cast<UBool>(sc == charScript), static_cast<UBool>(!scriptSet.isEmpty()));
497 if(uscript_isRightToLeft(sc)) {
498 rtl.removeAll(scriptSet);
499 }
500 if(uscript_isCased(sc)) {
501 cased.removeAll(scriptSet);
502 }
503 }
504 }
505 UnicodeString pattern;
506 assertEquals("no remaining RTL characters",
507 UnicodeString("[]"), rtl.toPattern(pattern));
508 assertEquals("no remaining cased characters",
509 UnicodeString("[]"), cased.toPattern(pattern));
510
511 assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN));
512 assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI));
513 assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN));
514 }
515
TestBidiPairedBracketType()516 void UnicodeTest::TestBidiPairedBracketType() {
517 // BidiBrackets-6.3.0.txt says:
518 //
519 // The set of code points listed in this file was originally derived
520 // using the character properties General_Category (gc), Bidi_Class (bc),
521 // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows:
522 // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe,
523 // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket
524 // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type
525 // property values are Open and Close, respectively.
526 IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()");
527 UnicodeSet bpt("[:^bpt=n:]", errorCode);
528 assertTrue("bpt!=None is not empty", !bpt.isEmpty());
529 // The following should always be true.
530 UnicodeSet mirrored("[:Bidi_M:]", errorCode);
531 UnicodeSet other_neutral("[:bc=ON:]", errorCode);
532 assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt));
533 assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt));
534 // The following are true at least initially in Unicode 6.3.
535 UnicodeSet bpt_open("[:bpt=o:]", errorCode);
536 UnicodeSet bpt_close("[:bpt=c:]", errorCode);
537 UnicodeSet ps("[:Ps:]", errorCode);
538 UnicodeSet pe("[:Pe:]", errorCode);
539 assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open));
540 assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close));
541 }
542
TestEmojiProperties()543 void UnicodeTest::TestEmojiProperties() {
544 assertFalse("space is not Emoji", u_hasBinaryProperty(0x20, UCHAR_EMOJI));
545 assertTrue("shooting star is Emoji", u_hasBinaryProperty(0x1F320, UCHAR_EMOJI));
546 IcuTestErrorCode errorCode(*this, "TestEmojiProperties()");
547 UnicodeSet emoji("[:Emoji:]", errorCode);
548 assertTrue("lots of Emoji", emoji.size() > 700);
549
550 assertTrue("shooting star is Emoji_Presentation",
551 u_hasBinaryProperty(0x1F320, UCHAR_EMOJI_PRESENTATION));
552 assertTrue("Fitzpatrick 6 is Emoji_Modifier",
553 u_hasBinaryProperty(0x1F3FF, UCHAR_EMOJI_MODIFIER));
554 assertTrue("happy person is Emoji_Modifier_Base",
555 u_hasBinaryProperty(0x1F64B, UCHAR_EMOJI_MODIFIER_BASE));
556 assertTrue("asterisk is Emoji_Component",
557 u_hasBinaryProperty(0x2A, UCHAR_EMOJI_COMPONENT));
558 assertTrue("copyright is Extended_Pictographic",
559 u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC));
560 }
561
562 namespace {
563
hbp(const char16_t * s,int32_t length,UProperty which)564 UBool hbp(const char16_t *s, int32_t length, UProperty which) {
565 return u_stringHasBinaryProperty(s, length, which);
566 }
567
hbp(const char16_t * s,UProperty which)568 UBool hbp(const char16_t *s, UProperty which) {
569 return u_stringHasBinaryProperty(s, -1, which);
570 }
571
572 } // namespace
573
TestEmojiPropertiesOfStrings()574 void UnicodeTest::TestEmojiPropertiesOfStrings() {
575 // Property of code points, for coverage
576 assertFalse("null is not Ideographic", hbp(nullptr, 1, UCHAR_IDEOGRAPHIC));
577 assertFalse("null/0 is not Ideographic", hbp(nullptr, -1, UCHAR_IDEOGRAPHIC));
578 assertFalse("empty string is not Ideographic", hbp(u"", 0, UCHAR_IDEOGRAPHIC));
579 assertFalse("empty string/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
580 assertFalse("L is not Ideographic", hbp(u"L", 1, UCHAR_IDEOGRAPHIC));
581 assertFalse("L/0 is not Ideographic", hbp(u"L", -1, UCHAR_IDEOGRAPHIC));
582 assertTrue("U+4E02 is Ideographic", hbp(u"丂", 1, UCHAR_IDEOGRAPHIC));
583 assertTrue("U+4E02/0 is Ideographic", hbp(u"丂", -1, UCHAR_IDEOGRAPHIC));
584 assertFalse("2*U+4E02 is not Ideographic", hbp(u"丂丂", 2, UCHAR_IDEOGRAPHIC));
585 assertFalse("2*U+4E02/0 is not Ideographic", hbp(u"丂丂", -1, UCHAR_IDEOGRAPHIC));
586 assertFalse("bicycle is not Ideographic", hbp(u"", 2, UCHAR_IDEOGRAPHIC));
587 assertFalse("bicycle/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
588 assertTrue("U+23456 is Ideographic", hbp(u"\U00023456", 2, UCHAR_IDEOGRAPHIC));
589 assertTrue("U+23456/0 is Ideographic", hbp(u"\U00023456", -1, UCHAR_IDEOGRAPHIC));
590
591 // Property of (code points and) strings
592 assertFalse("null is not Basic_Emoji", hbp(nullptr, 1, UCHAR_BASIC_EMOJI));
593 assertFalse("null/0 is not Basic_Emoji", hbp(nullptr, -1, UCHAR_BASIC_EMOJI));
594 assertFalse("empty string is not Basic_Emoji", hbp(u"", 0, UCHAR_BASIC_EMOJI));
595 assertFalse("empty string/0 is not Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
596 assertFalse("L is not Basic_Emoji", hbp(u"L", 1, UCHAR_BASIC_EMOJI));
597 assertFalse("L/0 is not Basic_Emoji", hbp(u"L", -1, UCHAR_BASIC_EMOJI));
598 assertFalse("U+4E02 is not Basic_Emoji", hbp(u"丂", 1, UCHAR_BASIC_EMOJI));
599 assertFalse("U+4E02/0 is not Basic_Emoji", hbp(u"丂", -1, UCHAR_BASIC_EMOJI));
600 assertTrue("bicycle is Basic_Emoji", hbp(u"", 2, UCHAR_BASIC_EMOJI));
601 assertTrue("bicycle/0 is Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
602 assertFalse("2*bicycle is Basic_Emoji", hbp(u"", 4, UCHAR_BASIC_EMOJI));
603 assertFalse("2*bicycle/0 is Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
604 assertFalse("U+23456 is not Basic_Emoji", hbp(u"\U00023456", 2, UCHAR_BASIC_EMOJI));
605 assertFalse("U+23456/0 is not Basic_Emoji", hbp(u"\U00023456", -1, UCHAR_BASIC_EMOJI));
606
607 assertFalse("stopwatch is not Basic_Emoji", hbp(u"⏱", 1, UCHAR_BASIC_EMOJI));
608 assertFalse("stopwatch/0 is not Basic_Emoji", hbp(u"⏱", -1, UCHAR_BASIC_EMOJI));
609 assertTrue("stopwatch+emoji is Basic_Emoji", hbp(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI));
610 assertTrue("stopwatch+emoji/0 is Basic_Emoji", hbp(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI));
611
612 assertFalse("chipmunk is not Basic_Emoji", hbp(u"", UCHAR_BASIC_EMOJI));
613 assertTrue("chipmunk+emoji is Basic_Emoji", hbp(u"\uFE0F", UCHAR_BASIC_EMOJI));
614 assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp(u"\uFE0F\uFE0F", UCHAR_BASIC_EMOJI));
615
616 // Properties of strings (only)
617 assertFalse("4+emoji is not Emoji_Keycap_Sequence",
618 hbp(u"4\uFE0F", UCHAR_EMOJI_KEYCAP_SEQUENCE));
619 assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence",
620 hbp(u"4\uFE0F\u20E3", UCHAR_EMOJI_KEYCAP_SEQUENCE));
621
622 assertFalse("[B] is not RGI_Emoji_Flag_Sequence",
623 hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
624 assertTrue("[BE] is RGI_Emoji_Flag_Sequence",
625 hbp(u"", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
626
627 assertFalse("[flag] is not RGI_Emoji_Tag_Sequence",
628 hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
629 assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence",
630 hbp(u"", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
631
632 assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence",
633 hbp(u"", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
634 assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence",
635 hbp(u"\U0001F3FD", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
636
637 assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence",
638 hbp(u"\U0001F3FF\u200D", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
639 assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence",
640 hbp(u"\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
641
642 // RGI_Emoji = all of the above
643 assertFalse("stopwatch is not RGI_Emoji", hbp(u"⏱", UCHAR_RGI_EMOJI));
644 assertTrue("stopwatch+emoji is RGI_Emoji", hbp(u"⏱\uFE0F", UCHAR_RGI_EMOJI));
645
646 assertFalse("chipmunk is not RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
647 assertTrue("chipmunk+emoji is RGI_Emoji", hbp(u"\uFE0F", UCHAR_RGI_EMOJI));
648
649 assertFalse("4+emoji is not RGI_Emoji", hbp(u"4\uFE0F", UCHAR_RGI_EMOJI));
650 assertTrue("4+emoji+keycap is RGI_Emoji", hbp(u"4\uFE0F\u20E3", UCHAR_RGI_EMOJI));
651
652 assertFalse("[B] is not RGI_Emoji", hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI));
653 assertTrue("[BE] is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
654
655 assertTrue("[flag] is RGI_Emoji", hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI));
656 assertTrue("[Scotland] is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
657
658 assertTrue("bicyclist is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
659 assertTrue("bicyclist+medium is RGI_Emoji", hbp(u"\U0001F3FD", UCHAR_RGI_EMOJI));
660
661 assertFalse("woman+dark+ZWJ is not RGI_Emoji", hbp(u"\U0001F3FF\u200D", UCHAR_RGI_EMOJI));
662 assertTrue("woman pilot: dark skin tone is RGI_Emoji",
663 hbp(u"\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI));
664
665 // UnicodeSet with properties of strings
666 IcuTestErrorCode errorCode(*this, "TestEmojiPropertiesOfStrings()");
667 UnicodeSet basic("[:Basic_Emoji:]", errorCode);
668 UnicodeSet keycaps("[:Emoji_Keycap_Sequence:]", errorCode);
669 UnicodeSet modified("[:RGI_Emoji_Modifier_Sequence:]", errorCode);
670 UnicodeSet flags("[:RGI_Emoji_Flag_Sequence:]", errorCode);
671 UnicodeSet tags("[:RGI_Emoji_Tag_Sequence:]", errorCode);
672 UnicodeSet combos("[:RGI_Emoji_ZWJ_Sequence:]", errorCode);
673 UnicodeSet rgi("[:RGI_Emoji:]", errorCode);
674 if (errorCode.errDataIfFailureAndReset("UnicodeSets")) {
675 return;
676 }
677
678 // union of all sets except for "rgi" -- should be the same as "rgi"
679 UnicodeSet all(basic);
680 all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos);
681
682 UnicodeSet basicOnlyCp(basic);
683 basicOnlyCp.removeAllStrings();
684
685 UnicodeSet rgiOnlyCp(rgi);
686 rgiOnlyCp.removeAllStrings();
687
688 assertTrue("lots of Basic_Emoji", basic.size() > 1000);
689 assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size());
690 assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600);
691 assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250);
692 assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3);
693 assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300);
694 assertTrue("lots of RGI_Emoji", rgi.size() > 3000);
695
696 assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000);
697 assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings());
698 assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount());
699 assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount());
700 assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount());
701 assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount());
702 assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount());
703
704 assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000);
705 assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings());
706 assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()",
707 rgiOnlyCp.size(), basicOnlyCp.size());
708 assertTrue("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp == basicOnlyCp);
709 assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size());
710 assertTrue("RGI_Emoji == union", rgi == all);
711
712 assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains(u"⏱\uFE0F"));
713 assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains(u"\uFE0F"));
714 assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)",
715 keycaps.contains(u"4\uFE0F\u20E3"));
716 assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains(u""));
717 assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains(u""));
718 assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)",
719 modified.contains(u"\U0001F3FD"));
720 assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)",
721 combos.contains(u"\U0001F3FF\u200D✈\uFE0F"));
722 assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains(u"⏱\uFE0F"));
723 assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains(u"\uFE0F"));
724 assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains(u"4\uFE0F\u20E3"));
725 assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains(u""));
726 assertTrue("RGI_Emoji.contains([flag])", rgi.contains(u"\U0001F3F4"));
727 assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains(u""));
728 assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains(u""));
729 assertTrue("RGI_Emoji.contains(bicyclist+medium)", rgi.contains(u"\U0001F3FD"));
730 assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)", rgi.contains(u"\U0001F3FF\u200D✈\uFE0F"));
731 }
732
TestIndicPositionalCategory()733 void UnicodeTest::TestIndicPositionalCategory() {
734 IcuTestErrorCode errorCode(*this, "TestIndicPositionalCategory()");
735 UnicodeSet na(u"[:InPC=NA:]", errorCode);
736 assertTrue("mostly NA", 1000000 <= na.size() && na.size() <= UCHAR_MAX_VALUE - 500);
737 UnicodeSet vol(u"[:InPC=Visual_Order_Left:]", errorCode);
738 assertTrue("some Visual_Order_Left", 19 <= vol.size() && vol.size() <= 100);
739 assertEquals("U+08FF: NA", U_INPC_NA,
740 u_getIntPropertyValue(0x08FF, UCHAR_INDIC_POSITIONAL_CATEGORY));
741 assertEquals("U+0900: Top", U_INPC_TOP,
742 u_getIntPropertyValue(0x0900, UCHAR_INDIC_POSITIONAL_CATEGORY));
743 assertEquals("U+10A06: Overstruck", U_INPC_OVERSTRUCK,
744 u_getIntPropertyValue(0x10A06, UCHAR_INDIC_POSITIONAL_CATEGORY));
745 }
746
TestIndicSyllabicCategory()747 void UnicodeTest::TestIndicSyllabicCategory() {
748 IcuTestErrorCode errorCode(*this, "TestIndicSyllabicCategory()");
749 UnicodeSet other(u"[:InSC=Other:]", errorCode);
750 assertTrue("mostly Other", 1000000 <= other.size() && other.size() <= UCHAR_MAX_VALUE - 500);
751 UnicodeSet ava(u"[:InSC=Avagraha:]", errorCode);
752 assertTrue("some Avagraha", 16 <= ava.size() && ava.size() <= 100);
753 assertEquals("U+08FF: Other", U_INSC_OTHER,
754 u_getIntPropertyValue(0x08FF, UCHAR_INDIC_SYLLABIC_CATEGORY));
755 assertEquals("U+0900: Bindu", U_INSC_BINDU,
756 u_getIntPropertyValue(0x0900, UCHAR_INDIC_SYLLABIC_CATEGORY));
757 assertEquals("U+11065: Brahmi_Joining_Number", U_INSC_BRAHMI_JOINING_NUMBER,
758 u_getIntPropertyValue(0x11065, UCHAR_INDIC_SYLLABIC_CATEGORY));
759 }
760
TestVerticalOrientation()761 void UnicodeTest::TestVerticalOrientation() {
762 IcuTestErrorCode errorCode(*this, "TestVerticalOrientation()");
763 UnicodeSet r(u"[:vo=R:]", errorCode);
764 assertTrue("mostly R", 0xb0000 <= r.size() && r.size() <= 0xd0000);
765 UnicodeSet u(u"[:vo=U:]", errorCode);
766 assertTrue("much U", 0x40000 <= u.size() && u.size() <= 0x60000);
767 UnicodeSet tu(u"[:vo=Tu:]", errorCode);
768 assertTrue("some Tu", 147 <= tu.size() && tu.size() <= 300);
769 assertEquals("U+0E01: Rotated", U_VO_ROTATED,
770 u_getIntPropertyValue(0x0E01, UCHAR_VERTICAL_ORIENTATION));
771 assertEquals("U+3008: Transformed_Rotated", U_VO_TRANSFORMED_ROTATED,
772 u_getIntPropertyValue(0x3008, UCHAR_VERTICAL_ORIENTATION));
773 assertEquals("U+33333: Upright", U_VO_UPRIGHT,
774 u_getIntPropertyValue(0x33333, UCHAR_VERTICAL_ORIENTATION));
775 }
776
TestDefaultScriptExtensions()777 void UnicodeTest::TestDefaultScriptExtensions() {
778 // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii
779 // but some of its characters revert to scx=<script> which is usually Common.
780 IcuTestErrorCode errorCode(*this, "TestDefaultScriptExtensions()");
781 UScriptCode scx[20];
782 scx[0] = USCRIPT_INVALID_CODE;
783 assertEquals("U+3000 num scx", 1, // IDEOGRAPHIC SPACE
784 uscript_getScriptExtensions(0x3000, scx, UPRV_LENGTHOF(scx), errorCode));
785 assertEquals("U+3000 num scx[0]", USCRIPT_COMMON, scx[0]);
786 scx[0] = USCRIPT_INVALID_CODE;
787 assertEquals("U+3012 num scx", 1, // POSTAL MARK
788 uscript_getScriptExtensions(0x3012, scx, UPRV_LENGTHOF(scx), errorCode));
789 assertEquals("U+3012 num scx[0]", USCRIPT_COMMON, scx[0]);
790 }
791
TestInvalidCodePointFolding()792 void UnicodeTest::TestInvalidCodePointFolding() {
793 // Test behavior when an invalid code point is passed to u_foldCase
794 static const UChar32 invalidCodePoints[] = {
795 0xD800, // lead surrogate
796 0xDFFF, // trail surrogate
797 0xFDD0, // noncharacter
798 0xFFFF, // noncharacter
799 0x110000, // out of range
800 -1 // negative
801 };
802 for (int32_t i=0; i<UPRV_LENGTHOF(invalidCodePoints); ++i) {
803 UChar32 cp = invalidCodePoints[i];
804 assertEquals("Invalid code points should be echoed back",
805 cp, u_foldCase(cp, U_FOLD_CASE_DEFAULT));
806 assertEquals("Invalid code points should be echoed back",
807 cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I));
808 }
809 }
810
TestBinaryCharacterProperties()811 void UnicodeTest::TestBinaryCharacterProperties() {
812 #if !UCONFIG_NO_NORMALIZATION
813 IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()");
814 // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
815 for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) {
816 const USet* uset = u_getBinaryPropertySet(static_cast<UProperty>(prop), errorCode);
817 if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", static_cast<int>(prop))) {
818 continue;
819 }
820 const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
821 int32_t count = set.getRangeCount();
822 if (count == 0) {
823 assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
824 u_hasBinaryProperty(0x20, static_cast<UProperty>(prop)));
825 assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
826 u_hasBinaryProperty(0x61, static_cast<UProperty>(prop)));
827 assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
828 u_hasBinaryProperty(0x4e00, static_cast<UProperty>(prop)));
829 } else {
830 UChar32 c = set.getRangeStart(0);
831 if (c > 0) {
832 assertFalse(
833 UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) +
834 u", " + prop + u")",
835 u_hasBinaryProperty(c - 1, static_cast<UProperty>(prop)));
836 }
837 assertTrue(
838 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
839 u", " + prop + u")",
840 u_hasBinaryProperty(c, static_cast<UProperty>(prop)));
841 c = set.getRangeEnd(count - 1);
842 assertTrue(
843 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
844 u", " + prop + u")",
845 u_hasBinaryProperty(c, static_cast<UProperty>(prop)));
846 if (c < 0x10ffff) {
847 assertFalse(
848 UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) +
849 u", " + prop + u")",
850 u_hasBinaryProperty(c + 1, static_cast<UProperty>(prop)));
851 }
852 }
853 }
854 #endif
855 }
856
TestIntCharacterProperties()857 void UnicodeTest::TestIntCharacterProperties() {
858 #if !UCONFIG_NO_NORMALIZATION
859 IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()");
860 // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
861 for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) {
862 const UCPMap* map = u_getIntPropertyMap(static_cast<UProperty>(prop), errorCode);
863 if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", static_cast<int>(prop))) {
864 continue;
865 }
866 uint32_t value;
867 UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
868 assertTrue("int property first range", end >= 0);
869 UChar32 c = end / 2;
870 assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c),
871 u_getIntPropertyValue(c, static_cast<UProperty>(prop)), value);
872 end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
873 assertTrue("int property later range", end >= 0);
874 assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end),
875 u_getIntPropertyValue(end, static_cast<UProperty>(prop)), value);
876 // ucpmap_get() API coverage
877 // TODO: move to cucdtst.c
878 assertEquals(
879 "int property upcmap_get(U+0061)",
880 u_getIntPropertyValue(0x61, static_cast<UProperty>(prop)), ucpmap_get(map, 0x61));
881 }
882 #endif
883 }
884
885 namespace {
886
getPropName(UProperty property,int32_t nameChoice)887 const char *getPropName(UProperty property, int32_t nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
888 const char* name = u_getPropertyName(property, static_cast<UPropertyNameChoice>(nameChoice));
889 return name != nullptr ? name : "null";
890 }
891
getValueName(UProperty property,int32_t value,int32_t nameChoice)892 const char *getValueName(UProperty property, int32_t value, int32_t nameChoice)
893 UPRV_NO_SANITIZE_UNDEFINED {
894 const char* name = u_getPropertyValueName(property, value, static_cast<UPropertyNameChoice>(nameChoice));
895 return name != nullptr ? name : "null";
896 }
897
898 } // namespace
899
TestPropertyNames()900 void UnicodeTest::TestPropertyNames() {
901 IcuTestErrorCode errorCode(*this, "TestPropertyNames()");
902 // Test names of certain properties & values.
903 // The UPropertyNameChoice is really an integer with only a couple of named constants.
904 UProperty prop = UCHAR_WHITE_SPACE;
905 constexpr int32_t SHORT = U_SHORT_PROPERTY_NAME;
906 constexpr int32_t LONG = U_LONG_PROPERTY_NAME;
907 assertEquals("White_Space: index -1", "null", getPropName(prop, -1));
908 assertEquals("White_Space: short", "WSpace", getPropName(prop, SHORT));
909 assertEquals("White_Space: long", "White_Space", getPropName(prop, LONG));
910 assertEquals("White_Space: index 2", "space", getPropName(prop, 2));
911 assertEquals("White_Space: index 3", "null", getPropName(prop, 3));
912
913 prop = UCHAR_SIMPLE_CASE_FOLDING;
914 assertEquals("Simple_Case_Folding: index -1", "null", getPropName(prop, -1));
915 assertEquals("Simple_Case_Folding: short", "scf", getPropName(prop, SHORT));
916 assertEquals("Simple_Case_Folding: long", "Simple_Case_Folding", getPropName(prop, LONG));
917 assertEquals("Simple_Case_Folding: index 2", "sfc", getPropName(prop, 2));
918 assertEquals("Simple_Case_Folding: index 3", "null", getPropName(prop, 3));
919
920 prop = UCHAR_CASED;
921 assertEquals("Cased=Y: index -1", "null", getValueName(prop, 1, -1));
922 assertEquals("Cased=Y: short", "Y", getValueName(prop, 1, SHORT));
923 assertEquals("Cased=Y: long", "Yes", getValueName(prop, 1, LONG));
924 assertEquals("Cased=Y: index 2", "T", getValueName(prop, 1, 2));
925 assertEquals("Cased=Y: index 3", "True", getValueName(prop, 1, 3));
926 assertEquals("Cased=Y: index 4", "null", getValueName(prop, 1, 4));
927
928 prop = UCHAR_DECOMPOSITION_TYPE;
929 int32_t value = U_DT_NOBREAK;
930 assertEquals("dt=Nb: index -1", "null", getValueName(prop, value, -1));
931 assertEquals("dt=Nb: short", "Nb", getValueName(prop, value, SHORT));
932 assertEquals("dt=Nb: long", "Nobreak", getValueName(prop, value, LONG));
933 assertEquals("dt=Nb: index 2", "nb", getValueName(prop, value, 2));
934 assertEquals("dt=Nb: index 3", "null", getValueName(prop, value, 3));
935
936 // Canonical_Combining_Class:
937 // The UCD inserts the numeric values in the second filed of its PropertyValueAliases.txt lines.
938 // In ICU, we don't treat these as names,
939 // they are just the numeric values returned by u_getCombiningClass().
940 // We return the real short and long names for the usual choice constants.
941 prop = UCHAR_CANONICAL_COMBINING_CLASS;
942 assertEquals("ccc=230: index -1", "null", getValueName(prop, 230, -1));
943 assertEquals("ccc=230: short", "A", getValueName(prop, 230, SHORT));
944 assertEquals("ccc=230: long", "Above", getValueName(prop, 230, LONG));
945 assertEquals("ccc=230: index 2", "null", getValueName(prop, 230, 2));
946
947 prop = UCHAR_GENERAL_CATEGORY;
948 value = U_DECIMAL_DIGIT_NUMBER;
949 assertEquals("gc=Nd: index -1", "null", getValueName(prop, value, -1));
950 assertEquals("gc=Nd: short", "Nd", getValueName(prop, value, SHORT));
951 assertEquals("gc=Nd: long", "Decimal_Number", getValueName(prop, value, LONG));
952 assertEquals("gc=Nd: index 2", "digit", getValueName(prop, value, 2));
953 assertEquals("gc=Nd: index 3", "null", getValueName(prop, value, 3));
954
955 prop = UCHAR_GENERAL_CATEGORY_MASK;
956 value = U_GC_P_MASK;
957 assertEquals("gc=P mask: index -1", "null", getValueName(prop, value, -1));
958 assertEquals("gc=P mask: short", "P", getValueName(prop, value, SHORT));
959 assertEquals("gc=P mask: long", "Punctuation", getValueName(prop, value, LONG));
960 assertEquals("gc=P mask: index 2", "punct", getValueName(prop, value, 2));
961 assertEquals("gc=P mask: index 3", "null", getValueName(prop, value, 3));
962 }
963
TestIDSUnaryOperator()964 void UnicodeTest::TestIDSUnaryOperator() {
965 IcuTestErrorCode errorCode(*this, "TestIDSUnaryOperator()");
966 // New in Unicode 15.1 for just two characters.
967 assertFalse("U+2FFC IDSU", u_hasBinaryProperty(0x2ffc, UCHAR_IDS_UNARY_OPERATOR));
968 assertFalse("U+2FFD IDSU", u_hasBinaryProperty(0x2ffd, UCHAR_IDS_UNARY_OPERATOR));
969 assertTrue("U+2FFE IDSU", u_hasBinaryProperty(0x2ffe, UCHAR_IDS_UNARY_OPERATOR));
970 assertTrue("U+2FFF IDSU", u_hasBinaryProperty(0x2fff, UCHAR_IDS_UNARY_OPERATOR));
971 assertFalse("U+3000 IDSU", u_hasBinaryProperty(0x3000, UCHAR_IDS_UNARY_OPERATOR));
972 assertFalse("U+3001 IDSU", u_hasBinaryProperty(0x3001, UCHAR_IDS_UNARY_OPERATOR));
973
974 // Property name works and gets the correct set.
975 UnicodeSet idsu(u"[:IDS_Unary_Operator:]", errorCode);
976 assertEquals("IDSU set number of characters", 2, idsu.size());
977 assertFalse("idsu.contains(U+2FFD)", idsu.contains(0x2ffd));
978 assertTrue("idsu.contains(U+2FFE)", idsu.contains(0x2ffe));
979 assertTrue("idsu.contains(U+2FFF)", idsu.contains(0x2fff));
980 assertFalse("idsu.contains(U+3000)", idsu.contains(0x3000));
981 }
982
983 namespace {
984
isMathStart(UChar32 c)985 bool isMathStart(UChar32 c) {
986 return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_START);
987 }
988
isMathContinue(UChar32 c)989 bool isMathContinue(UChar32 c) {
990 return u_hasBinaryProperty(c, UCHAR_ID_COMPAT_MATH_CONTINUE);
991 }
992
993 } // namespace
994
TestIDCompatMath()995 void UnicodeTest::TestIDCompatMath() {
996 IcuTestErrorCode errorCode(*this, "TestIDCompatMath()");
997 assertFalse("U+00B1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb1));
998 assertTrue("U+00B2 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb2));
999 assertTrue("U+00B3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb3));
1000 assertFalse("U+00B4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0xb4));
1001 assertFalse("U+207F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x207f));
1002 assertTrue("U+2080 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2080));
1003 assertTrue("U+208E UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208e));
1004 assertFalse("U+208F UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x208f));
1005 assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2201));
1006 assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x2202));
1007 assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D6C1));
1008 assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C3));
1009 assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_CONTINUE", isMathContinue(0x1D7C4));
1010
1011 assertFalse("U+00B2 UCHAR_ID_COMPAT_MATH_START", isMathStart(0xb2));
1012 assertFalse("U+2080 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2080));
1013 assertFalse("U+2201 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2201));
1014 assertTrue("U+2202 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x2202));
1015 assertTrue("U+1D6C1 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D6C1));
1016 assertTrue("U+1D7C3 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C3));
1017 assertFalse("U+1D7C4 UCHAR_ID_COMPAT_MATH_START", isMathStart(0x1D7C4));
1018
1019 // Property names work and get the correct sets.
1020 UnicodeSet idcmStart(u"[:ID_Compat_Math_Start:]", errorCode);
1021 UnicodeSet idcmContinue(u"[:ID_Compat_Math_Continue:]", errorCode);
1022 assertEquals("ID_Compat_Math_Start set number of characters", 13, idcmStart.size());
1023 assertEquals("ID_Compat_Math_Continue set number of characters", 43, idcmContinue.size());
1024 assertTrue("ID_Compat_Math_Start is a subset of ID_Compat_Math_Continue",
1025 idcmContinue.containsAll(idcmStart));
1026 assertFalse("idcmContinue.contains(U+207F)", idcmContinue.contains(0x207f));
1027 assertTrue("idcmContinue.contains(U+2080)", idcmContinue.contains(0x2080));
1028 assertTrue("idcmContinue.contains(U+208E)", idcmContinue.contains(0x208e));
1029 assertFalse("idcmContinue.contains(U+208F)", idcmContinue.contains(0x208f));
1030 assertFalse("idcmStart.contains(U+2201)", idcmStart.contains(0x2201));
1031 assertTrue("idcmStart.contains(U+2202)", idcmStart.contains(0x2202));
1032 assertTrue("idcmStart.contains(U+1D7C3)", idcmStart.contains(0x1D7C3));
1033 assertFalse("idcmStart.contains(U+1D7C4)", idcmStart.contains(0x1D7C4));
1034 }
1035
1036 U_NAMESPACE_BEGIN
1037
1038 class BuiltInPropertyNames : public PropertyNames {
1039 public:
~BuiltInPropertyNames()1040 ~BuiltInPropertyNames() override {}
1041
getPropertyEnum(const char * name) const1042 int32_t getPropertyEnum(const char *name) const override {
1043 return u_getPropertyEnum(name);
1044 }
1045
getPropertyValueEnum(int32_t property,const char * name) const1046 int32_t getPropertyValueEnum(int32_t property, const char *name) const override {
1047 return u_getPropertyValueEnum(static_cast<UProperty>(property), name);
1048 }
1049 };
1050
1051 U_NAMESPACE_END
1052
TestPropertiesUsingPpucd()1053 void UnicodeTest::TestPropertiesUsingPpucd() {
1054 IcuTestErrorCode errorCode(*this, "TestPropertiesUsingPpucd()");
1055
1056 // Initialize PPUCD parsing object using file in repo and using
1057 // property names present in built-in data in ICU
1058 char buffer[500];
1059 // get path to `source/data/unidata/` including trailing `/`
1060 char *unidataPath = getUnidataPath(buffer);
1061 if(unidataPath == nullptr) {
1062 errln("exiting early because unable to open ppucd.txt from ICU source tree");
1063 return;
1064 }
1065 CharString ppucdPath(unidataPath, errorCode);
1066 ppucdPath.appendPathPart("ppucd.txt", errorCode);
1067 PreparsedUCD ppucd(ppucdPath.data(), errorCode);
1068 if(errorCode.isFailure()) {
1069 errln("unable to open %s - %s\n",
1070 ppucdPath.data(), errorCode.errorName());
1071 return;
1072 }
1073 BuiltInPropertyNames builtInPropNames;
1074 ppucd.setPropertyNames(&builtInPropNames);
1075
1076 // Define which properties we want to compare
1077 struct TestProp {
1078 const UProperty prop;
1079 const int32_t value = 1; // binary "Yes"
1080 UnicodeSet set;
1081
1082 TestProp(UProperty binaryProp) : prop(binaryProp) {}
1083 TestProp(UProperty intProp, int32_t v) : prop(intProp), value(v) {}
1084 bool isBinary() const { return prop < UCHAR_BINARY_LIMIT; }
1085 };
1086 TestProp propsUnderTest[] = {
1087 { UCHAR_IDS_UNARY_OPERATOR },
1088 { UCHAR_ID_COMPAT_MATH_START },
1089 { UCHAR_ID_COMPAT_MATH_CONTINUE },
1090 #if !UCONFIG_NO_NORMALIZATION
1091 { UCHAR_NFD_QUICK_CHECK, UNORM_NO },
1092 { UCHAR_NFKD_QUICK_CHECK, UNORM_NO },
1093 { UCHAR_NFC_QUICK_CHECK, UNORM_NO },
1094 { UCHAR_NFKC_QUICK_CHECK, UNORM_NO },
1095 { UCHAR_NFC_QUICK_CHECK, UNORM_MAYBE },
1096 { UCHAR_NFKC_QUICK_CHECK, UNORM_MAYBE },
1097 #endif // !UCONFIG_NO_NORMALIZATION
1098 { UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
1099 { UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
1100 { UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
1101 { UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
1102 { UCHAR_MODIFIER_COMBINING_MARK },
1103 };
1104
1105 // Iterate through PPUCD file, accumulating each line's data into each UnicodeSet per property
1106 PreparsedUCD::LineType lineType;
1107 UnicodeSet newValues;
1108 while((lineType=ppucd.readLine(errorCode))!=PreparsedUCD::NO_LINE && errorCode.isSuccess()) {
1109 if(ppucd.lineHasPropertyValues()) {
1110 const UniProps *lineProps=ppucd.getProps(newValues, errorCode);
1111
1112 for (auto &tp : propsUnderTest) {
1113 if (!newValues.contains(tp.prop)) {
1114 continue;
1115 }
1116 bool match = tp.isBinary() ?
1117 lineProps->binProps[tp.prop] :
1118 lineProps->getIntProp(tp.prop) == tp.value;
1119 if (match) {
1120 tp.set.add(lineProps->start, lineProps->end);
1121 } else {
1122 tp.set.remove(lineProps->start, lineProps->end);
1123 }
1124 }
1125 }
1126 }
1127
1128 if(errorCode.isFailure()) {
1129 errln("exiting early due to parsing error");
1130 return;
1131 }
1132
1133 // Assert that the PPUCD data and the ICU data are equivalent for all properties
1134 for (auto &tp : propsUnderTest) {
1135 UnicodeSet icuPropSet;
1136 icuPropSet.applyIntPropertyValue(tp.prop, tp.value, errorCode);
1137 std::string msg =
1138 std::string()
1139 + "ICU & PPUCD versions of "
1140 + u_getPropertyName(tp.prop, U_LONG_PROPERTY_NAME);
1141 if (!tp.isBinary()) {
1142 msg = msg + "=" + u_getPropertyValueName(tp.prop, tp.value, U_LONG_PROPERTY_NAME);
1143 }
1144 UnicodeSetTest::checkEqual(*this, tp.set, icuPropSet, msg.c_str());
1145 }
1146 }
1147
1148 namespace {
1149
getIDStatus(UChar32 c)1150 int32_t getIDStatus(UChar32 c) {
1151 return u_getIntPropertyValue(c, UCHAR_IDENTIFIER_STATUS);
1152 }
1153
1154 } // namespace
1155
TestIDStatus()1156 void UnicodeTest::TestIDStatus() {
1157 IcuTestErrorCode errorCode(*this, "TestIDStatus()");
1158 assertEquals("ID_Status(slash)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x2F));
1159 assertEquals("ID_Status(digit 0)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x30));
1160 assertEquals("ID_Status(colon)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x3A));
1161 assertEquals("ID_Status(semicolon)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x3B));
1162 assertEquals("ID_Status(Greek small alpha)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x03B1));
1163 assertEquals("ID_Status(Greek small archaic koppa)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x03D9));
1164 assertEquals("ID_Status(Hangul syllable)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0xAC00));
1165 assertEquals("ID_Status(surrogate)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0xD800));
1166 assertEquals("ID_Status(Arabic tail fragment)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0xFE73));
1167 assertEquals("ID_Status(Hentaigana ko-3)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x1B03A));
1168 assertEquals("ID_Status(Katakana small ko)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x1B155));
1169 assertEquals("ID_Status(U+2EE5D)=Allowed", U_ID_STATUS_ALLOWED, getIDStatus(0x2EE5D));
1170 assertEquals("ID_Status(U+10FFFF)=Restricted", U_ID_STATUS_RESTRICTED, getIDStatus(0x10FFFF));
1171
1172 // Property names work and get the correct sets.
1173 UnicodeSet idStatus(u"[:Identifier_Status=Allowed:]", errorCode);
1174 // Unicode 15.1: 112778 Allowed characters; normally grows over time
1175 assertTrue("Allowed number of characters", idStatus.size() >= 112778);
1176 assertFalse("Allowed.contains(slash)", idStatus.contains(0x2F));
1177 assertTrue("Allowed.contains(digit 0)", idStatus.contains(0x30));
1178 assertTrue("Allowed.contains(colon)", idStatus.contains(0x3A));
1179 assertFalse("Allowed.contains(semicolon)", idStatus.contains(0x3B));
1180 assertTrue("Allowed.contains(Greek small alpha)", idStatus.contains(0x03B1));
1181 assertFalse("Allowed.contains(Greek small archaic koppa)", idStatus.contains(0x03D9));
1182 assertTrue("Allowed.contains(Hangul syllable)", idStatus.contains(0xAC00));
1183 assertFalse("Allowed.contains(surrogate)", idStatus.contains(0xD800));
1184 assertFalse("Allowed.contains(Arabic tail fragment)", idStatus.contains(0xFE73));
1185 assertFalse("Allowed.contains(Hentaigana ko-3)", idStatus.contains(0x1B03A));
1186 assertTrue("Allowed.contains(Katakana small ko)", idStatus.contains(0x1B155));
1187 assertTrue("Allowed.contains(U+2EE5D)", idStatus.contains(0x2EE5D));
1188 assertFalse("Allowed.contains(U+10FFFF)", idStatus.contains(0x10FFFF));
1189 }
1190
1191 namespace {
1192
getIDTypes(UChar32 c)1193 UnicodeString getIDTypes(UChar32 c) {
1194 UErrorCode errorCode = U_ZERO_ERROR;
1195 UIdentifierType types[10];
1196 int32_t length = u_getIDTypes(c, types, UPRV_LENGTHOF(types), &errorCode);
1197 if (U_FAILURE(errorCode)) {
1198 return UnicodeString(u_errorName(errorCode), -1, US_INV);
1199 }
1200 // The order of values is undefined, but for simplicity we assume the order
1201 // that the current implementation yields. Otherwise we would have to sort the values.
1202 uint32_t typeBits = 0;
1203 UnicodeString result;
1204 for (int32_t i = 0; i < length; ++i) {
1205 if (i != 0) {
1206 result.append(u' ');
1207 }
1208 auto t = types[i];
1209 typeBits |= 1UL << t;
1210 const char *s = u_getPropertyValueName(UCHAR_IDENTIFIER_TYPE, t, U_LONG_PROPERTY_NAME);
1211 if (s != nullptr) {
1212 result.append(UnicodeString(s, -1, US_INV));
1213 } else {
1214 result.append(u"???");
1215 }
1216 }
1217 // Check that u_hasIDType() agrees.
1218 // Includes undefined behavior with t > largest enum constant.
1219 for (int32_t i = 0; i < 16; ++i) {
1220 UIdentifierType t = static_cast<UIdentifierType>(i);
1221 bool expected = (typeBits & (1UL << i)) != 0;
1222 bool actual = u_hasIDType(c, t);
1223 if (actual != expected) {
1224 result.append(u" != u_hasIDType() ");
1225 result = result + i;
1226 break;
1227 }
1228 }
1229 return result;
1230 }
1231
1232 } // namespace
1233
TestIDType()1234 void UnicodeTest::TestIDType() {
1235 IcuTestErrorCode errorCode(*this, "TestIDType()");
1236 // Note: Types other than Recommended and Inclusion may well change over time.
1237 assertEquals("ID_Type(slash)", u"Not_XID", getIDTypes(0x2F));
1238 assertEquals("ID_Type(digit 0)", u"Recommended", getIDTypes(0x30));
1239 assertEquals("ID_Type(colon)", u"Inclusion", getIDTypes(0x3A));
1240 assertEquals("ID_Type(semicolon)", u"Not_XID", getIDTypes(0x3B));
1241 assertEquals("ID_Type(Greek small alpha)", u"Recommended", getIDTypes(0x03B1));
1242 assertEquals("ID_Type(Greek small archaic koppa)", u"Obsolete", getIDTypes(0x03D9));
1243 assertEquals("ID_Type(Hangul syllable)", u"Recommended", getIDTypes(0xAC00));
1244 assertEquals("ID_Type(surrogate)", u"Not_Character", getIDTypes(0xD800));
1245 assertEquals("ID_Type(Arabic tail fragment)", u"Technical", getIDTypes(0xFE73));
1246 assertEquals("ID_Type(Linear B syllable)", u"Exclusion", getIDTypes(0x10000));
1247 assertEquals("ID_Type(Hentaigana ko-3)", u"Obsolete", getIDTypes(0x1B03A));
1248 assertEquals("ID_Type(Katakana small ko)", u"Recommended", getIDTypes(0x1B155));
1249 assertEquals("ID_Type(U+2EE5D)", u"Recommended", getIDTypes(0x2EE5D));
1250 assertEquals("ID_Type(U+10FFFF)", u"Not_Character", getIDTypes(0x10FFFF));
1251
1252 assertEquals("ID_Type(CYRILLIC THOUSANDS SIGN)", u"Not_XID Obsolete", getIDTypes(0x0482));
1253 assertEquals("ID_Type(SYRIAC FEMININE DOT)", u"Technical Limited_Use", getIDTypes(0x0740));
1254 assertEquals("ID_Type(NKO LETTER JONA JA)", u"Obsolete Limited_Use", getIDTypes(0x07E8));
1255 assertEquals("ID_Type(SYRIAC END OF PARAGRAPH)", u"Not_XID Limited_Use", getIDTypes(0x0700));
1256 assertEquals("ID_Type(LATIN SMALL LETTER EZH)=", u"Technical Uncommon_Use", getIDTypes(0x0292));
1257 assertEquals("ID_Type(MUSICAL SYMBOL KIEVAN C CLEF)", u"Not_XID Technical Uncommon_Use", getIDTypes(0x1D1DE));
1258 assertEquals("ID_Type(MRO LETTER TA)", u"Exclusion Uncommon_Use", getIDTypes(0x16A40));
1259 assertEquals("ID_Type(GREEK MUSICAL LEIMMA)", u"Not_XID Obsolete", getIDTypes(0x1D245));
1260
1261 // error handling
1262 UIdentifierType types[2];
1263 UErrorCode failure = U_ZERO_ERROR;
1264 u_getIDTypes(0, types, -1, &failure);
1265 assertEquals("u_getIDTypes(capacity<0)", U_ILLEGAL_ARGUMENT_ERROR, failure);
1266
1267 failure = U_ZERO_ERROR;
1268 u_getIDTypes(0, nullptr, 1, &failure);
1269 assertEquals("u_getIDTypes(nullptr)", U_ILLEGAL_ARGUMENT_ERROR, failure);
1270
1271 failure = U_ZERO_ERROR;
1272 int32_t length = u_getIDTypes(0x30, types, 0, &failure);
1273 assertEquals("u_getIDTypes(digit 0, capacity 0) overflow", U_BUFFER_OVERFLOW_ERROR, failure);
1274 assertEquals("u_getIDTypes(digit 0, capacity 0) length", 1, length);
1275
1276 failure = U_ZERO_ERROR;
1277 length = u_getIDTypes(0x1D1DE, types, 0, &failure);
1278 assertEquals("u_getIDTypes(Kievan C clef, capacity 2) overflow", U_BUFFER_OVERFLOW_ERROR, failure);
1279 assertEquals("u_getIDTypes(Kievan C clef, capacity 2) length", 3, length);
1280
1281 // Property names work and get the correct sets.
1282 UnicodeSet rec(u"[:Identifier_Type=Recommended:]", errorCode);
1283 UnicodeSet incl(u"[:Identifier_Type=Inclusion:]", errorCode);
1284 UnicodeSet limited(u"[:Identifier_Type=Limited_Use:]", errorCode);
1285 UnicodeSet uncommon(u"[:Identifier_Type=Uncommon_Use:]", errorCode);
1286 UnicodeSet notChar(u"[:Identifier_Type=Not_Character:]", errorCode);
1287 // Unicode 15.1 set sizes; normally grows over time except Not_Character shrinks
1288 assertTrue("Recommended number of characters", rec.size() >= 112761);
1289 assertTrue("Inclusion number of characters", incl.size() >= 17);
1290 assertTrue("Limited_Use number of characters", limited.size() >= 5268);
1291 assertTrue("Uncommon_Use number of characters", uncommon.size() >= 398);
1292 assertTrue("Not_Character number of characters",
1293 800000 <= notChar.size() && notChar.size() <= 964293);
1294 assertFalse("Recommended.contains(slash)", rec.contains(0x2F));
1295 assertTrue("Recommended.contains(digit 0)", rec.contains(0x30));
1296 assertTrue("Inclusion.contains(colon)", incl.contains(0x3A));
1297 assertTrue("Recommended.contains(U+2EE5D)", rec.contains(0x2EE5D));
1298 assertTrue("Limited_Use.contains(SYRIAC FEMININE DOT)", limited.contains(0x0740));
1299 assertTrue("Limited_Use.contains(NKO LETTER JONA JA)", limited.contains(0x7E8));
1300 assertTrue("Not_Character.contains(surrogate)", notChar.contains(0xd800));
1301 assertTrue("Not_Character.contains(U+10FFFF)", notChar.contains(0x10FFFF));
1302 assertTrue("Uncommon_Use.contains(LATIN SMALL LETTER EZH)", uncommon.contains(0x0292));
1303 assertTrue("Uncommon_Use.contains(MUSICAL SYMBOL KIEVAN C CLEF)", uncommon.contains(0x1D1DE));
1304
1305 // More mutually exclusive types, including some otherwise combinable ones.
1306 UnicodeSet dep(u"[:Identifier_Type=Deprecated:]", errorCode);
1307 UnicodeSet di(u"[:Identifier_Type=Default_Ignorable:]", errorCode);
1308 UnicodeSet notNFKC(u"[:Identifier_Type=Not_NFKC:]", errorCode);
1309 UnicodeSet excl(u"[:Identifier_Type=Exclusion:]", errorCode);
1310 UnicodeSet allExclusive;
1311 allExclusive.addAll(rec).addAll(incl).addAll(limited).addAll(excl).
1312 addAll(notNFKC).addAll(di).addAll(dep).addAll(notChar);
1313 assertEquals("num chars in mutually exclusive types",
1314 rec.size() + incl.size() + limited.size() + excl.size() +
1315 notNFKC.size() + di.size() + dep.size() + notChar.size(),
1316 allExclusive.size());
1317 }
1318