1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 1997-2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
7
8 #include "unicode/ustring.h"
9 #include "unicode/uchar.h"
10 #include "unicode/ucpmap.h"
11 #include "unicode/uniset.h"
12 #include "unicode/putil.h"
13 #include "unicode/uscript.h"
14 #include "unicode/uset.h"
15 #include "cstring.h"
16 #include "hash.h"
17 #include "patternprops.h"
18 #include "normalizer2impl.h"
19 #include "testutil.h"
20 #include "uparse.h"
21 #include "ucdtest.h"
22
// Property names that are deliberately not checked by this test.
// The UnicodeTest constructor pre-registers every entry in unknownPropertyNames,
// so derivedPropsLineFn() finds them there and does not report them as unknown.
static const char *ignorePropNames[]={
    "FC_NFKC",
    "NFD_QC",
    "NFC_QC",
    "NFKD_QC",
    "NFKC_QC",
    "Expands_On_NFD",
    "Expands_On_NFC",
    "Expands_On_NFKD",
    "Expands_On_NFKC",
    "NFKC_CF"
};
35
UnicodeTest()36 UnicodeTest::UnicodeTest()
37 {
38 UErrorCode errorCode=U_ZERO_ERROR;
39 unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
40 if(U_FAILURE(errorCode)) {
41 delete unknownPropertyNames;
42 unknownPropertyNames=nullptr;
43 }
44 // Ignore some property names altogether.
45 for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) {
46 unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
47 }
48 }
49
// Releases the name table allocated by the constructor.
// unknownPropertyNames may be nullptr if construction failed; delete on nullptr is a no-op.
UnicodeTest::~UnicodeTest()
{
    delete unknownPropertyNames;
}
54
/**
 * Lists the test cases of this suite for the test framework.
 *
 * The TESTCASE_AUTO* macros are defined outside this file; presumably they use
 * index, exec and name to select, run and name a test case -- confirm against
 * the framework header. The fourth parameter is unused here.
 * When exec is set, a suite banner is logged first.
 */
void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if(exec) {
        logln("TestSuite UnicodeTest: ");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestAdditionalProperties);
    TESTCASE_AUTO(TestBinaryValues);
    TESTCASE_AUTO(TestConsistency);
    TESTCASE_AUTO(TestPatternProperties);
    TESTCASE_AUTO(TestScriptMetadata);
    TESTCASE_AUTO(TestBidiPairedBracketType);
    TESTCASE_AUTO(TestEmojiProperties);
    TESTCASE_AUTO(TestEmojiPropertiesOfStrings);
    TESTCASE_AUTO(TestIndicPositionalCategory);
    TESTCASE_AUTO(TestIndicSyllabicCategory);
    TESTCASE_AUTO(TestVerticalOrientation);
    TESTCASE_AUTO(TestDefaultScriptExtensions);
    TESTCASE_AUTO(TestInvalidCodePointFolding);
    // These two test cases are only listed when normalization is compiled in.
#if !UCONFIG_NO_NORMALIZATION
    TESTCASE_AUTO(TestBinaryCharacterProperties);
    TESTCASE_AUTO(TestIntCharacterProperties);
#endif
    TESTCASE_AUTO(TestPropertyNames);
    TESTCASE_AUTO_END;
}
81
82 //====================================================
83 // private data used by the tests
84 //====================================================
85
86 // test DerivedCoreProperties.txt -------------------------------------------
87
88 // copied from genprops.c
89 static int32_t
getTokenIndex(const char * const tokens[],int32_t countTokens,const char * s)90 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
91 const char *t, *z;
92 int32_t i, j;
93
94 s=u_skipWhitespace(s);
95 for(i=0; i<countTokens; ++i) {
96 t=tokens[i];
97 if(t!=nullptr) {
98 for(j=0;; ++j) {
99 if(t[j]!=0) {
100 if(s[j]!=t[j]) {
101 break;
102 }
103 } else {
104 z=u_skipWhitespace(s+j);
105 if(*z==';' || *z==0) {
106 return i;
107 } else {
108 break;
109 }
110 }
111 }
112 }
113 }
114 return -1;
115 }
116
// Names of the binary properties that derivedPropsLineFn() recognizes in the
// parsed data files. The position of a name in this array is the index used for
// derivedProps[], derivedPropsIndex[] and numErrors[], so the order must stay in
// sync with derivedPropsIndex[].
static const char *const
derivedPropsNames[]={
    "Math",
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "ID_Start",
    "ID_Continue",
    "XID_Start",
    "XID_Continue",
    "Default_Ignorable_Code_Point",
    "Full_Composition_Exclusion",
    "Grapheme_Extend",
    "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
    "Grapheme_Base",
    "Cased",
    "Case_Ignorable",
    "Changes_When_Lowercased",
    "Changes_When_Uppercased",
    "Changes_When_Titlecased",
    "Changes_When_Casefolded",
    "Changes_When_Casemapped",
    "Changes_When_NFKC_Casefolded"
};
141
// UProperty constants passed to u_hasBinaryProperty() by TestAdditionalProperties().
// Entry i corresponds to derivedPropsNames[i]; the test checks that both arrays
// have the same length.
static const UProperty
derivedPropsIndex[]={
    UCHAR_MATH,
    UCHAR_ALPHABETIC,
    UCHAR_LOWERCASE,
    UCHAR_UPPERCASE,
    UCHAR_ID_START,
    UCHAR_ID_CONTINUE,
    UCHAR_XID_START,
    UCHAR_XID_CONTINUE,
    UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
    UCHAR_FULL_COMPOSITION_EXCLUSION,
    UCHAR_GRAPHEME_EXTEND,
    UCHAR_GRAPHEME_LINK,
    UCHAR_GRAPHEME_BASE,
    UCHAR_CASED,
    UCHAR_CASE_IGNORABLE,
    UCHAR_CHANGES_WHEN_LOWERCASED,
    UCHAR_CHANGES_WHEN_UPPERCASED,
    UCHAR_CHANGES_WHEN_TITLECASED,
    UCHAR_CHANGES_WHEN_CASEFOLDED,
    UCHAR_CHANGES_WHEN_CASEMAPPED,
    UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
};
166
// Number of mismatches reported so far for each property by
// TestAdditionalProperties(); indexed like derivedPropsIndex[].
static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 };

// Once numErrors[i] reaches this limit, TestAdditionalProperties() stops
// checking property i.
enum { MAX_ERRORS=50 };
170
171 U_CFUNC void U_CALLCONV
derivedPropsLineFn(void * context,char * fields[][2],int32_t,UErrorCode * pErrorCode)172 derivedPropsLineFn(void *context,
173 char *fields[][2], int32_t /* fieldCount */,
174 UErrorCode *pErrorCode)
175 {
176 UnicodeTest *me=static_cast<UnicodeTest*>(context);
177 uint32_t start, end;
178 int32_t i;
179
180 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
181 if(U_FAILURE(*pErrorCode)) {
182 me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
183 return;
184 }
185
186 /* parse derived binary property name, ignore unknown names */
187 i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]);
188 if(i<0) {
189 UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
190 propName.trim();
191 if(me->unknownPropertyNames->find(propName)==nullptr) {
192 UErrorCode errorCode=U_ZERO_ERROR;
193 me->unknownPropertyNames->puti(propName, 1, errorCode);
194 me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
195 }
196 return;
197 }
198
199 me->derivedProps[i].add(start, end);
200 }
201
TestAdditionalProperties()202 void UnicodeTest::TestAdditionalProperties() {
203 #if !UCONFIG_NO_NORMALIZATION
204 // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
205 if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) {
206 errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
207 UPRV_LENGTHOF(derivedPropsNames));
208 return;
209 }
210 if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) {
211 errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n");
212 return;
213 }
214
215 char path[500];
216 if(getUnidataPath(path) == nullptr) {
217 errln("unable to find path to source/data/unidata/");
218 return;
219 }
220 char *basename=strchr(path, 0);
221 strcpy(basename, "DerivedCoreProperties.txt");
222
223 char *fields[2][2];
224 UErrorCode errorCode=U_ZERO_ERROR;
225 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
226 if(U_FAILURE(errorCode)) {
227 errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
228 return;
229 }
230
231 strcpy(basename, "DerivedNormalizationProps.txt");
232 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
233 if(U_FAILURE(errorCode)) {
234 errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
235 return;
236 }
237
238 // now we have all derived core properties in the UnicodeSets
239 // run them all through the API
240 int32_t rangeCount, range;
241 uint32_t i;
242 UChar32 start, end;
243
244 // test all true properties
245 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
246 rangeCount=derivedProps[i].getRangeCount();
247 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
248 start=derivedProps[i].getRangeStart(range);
249 end=derivedProps[i].getRangeEnd(range);
250 for(; start<=end; ++start) {
251 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
252 dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==false is wrong", start, derivedPropsNames[i]);
253 if(++numErrors[i]>=MAX_ERRORS) {
254 dataerrln("Too many errors, moving to the next test");
255 break;
256 }
257 }
258 }
259 }
260 }
261
262 // invert all properties
263 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
264 derivedProps[i].complement();
265 }
266
267 // test all false properties
268 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) {
269 rangeCount=derivedProps[i].getRangeCount();
270 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
271 start=derivedProps[i].getRangeStart(range);
272 end=derivedProps[i].getRangeEnd(range);
273 for(; start<=end; ++start) {
274 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
275 errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==true is wrong\n", start, derivedPropsNames[i]);
276 if(++numErrors[i]>=MAX_ERRORS) {
277 errln("Too many errors, moving to the next test");
278 break;
279 }
280 }
281 }
282 }
283 }
284 #endif /* !UCONFIG_NO_NORMALIZATION */
285 }
286
TestBinaryValues()287 void UnicodeTest::TestBinaryValues() {
288 /*
289 * Unicode 5.1 explicitly defines binary property value aliases.
290 * Verify that they are all recognized.
291 */
292 UErrorCode errorCode=U_ZERO_ERROR;
293 UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
294 if(U_FAILURE(errorCode)) {
295 dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
296 return;
297 }
298
299 static const char *const falseValues[]={ "N", "No", "F", "False" };
300 static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
301 int32_t i;
302 for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) {
303 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
304 pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
305 errorCode=U_ZERO_ERROR;
306 UnicodeSet set(pattern, errorCode);
307 if(U_FAILURE(errorCode)) {
308 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
309 continue;
310 }
311 set.complement();
312 if(set!=alpha) {
313 errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
314 }
315 }
316 for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) {
317 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
318 pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
319 errorCode=U_ZERO_ERROR;
320 UnicodeSet set(pattern, errorCode);
321 if(U_FAILURE(errorCode)) {
322 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
323 continue;
324 }
325 if(set!=alpha) {
326 errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
327 }
328 }
329 }
330
TestConsistency()331 void UnicodeTest::TestConsistency() {
332 #if !UCONFIG_NO_NORMALIZATION
333 /*
334 * Test for an example that getCanonStartSet() delivers
335 * all characters that compose from the input one,
336 * even in multiple steps.
337 * For example, the set for "I" (0049) should contain both
338 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
339 * In general, the set for the middle such character should be a subset
340 * of the set for the first.
341 */
342 IcuTestErrorCode errorCode(*this, "TestConsistency");
343 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
344 const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
345 if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) {
346 dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
347 errorCode.errorName());
348 errorCode.reset();
349 return;
350 }
351
352 UnicodeSet set1, set2;
353 if (nfcImpl->getCanonStartSet(0x49, set1)) {
354 /* enumerate all characters that are plausible to be latin letters */
355 for(char16_t start=0xa0; start<0x2000; ++start) {
356 UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
357 if(decomp.length()>1 && decomp[0]==0x49) {
358 set2.add(start);
359 }
360 }
361
362 if (set1!=set2) {
363 errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
364 }
365 // This was available in cucdtst.c but the test had to move to intltest
366 // because the new internal normalization functions are in C++.
367 //compareUSets(set1, set2,
368 // "[canon start set of 0049]", "[all c with canon decomp with 0049]",
369 // true);
370 } else {
371 errln("NFC.getCanonStartSet() returned false");
372 }
373 #endif
374 }
375
376 /**
377 * Test various implementations of Pattern_Syntax & Pattern_White_Space.
378 */
TestPatternProperties()379 void UnicodeTest::TestPatternProperties() {
380 IcuTestErrorCode errorCode(*this, "TestPatternProperties()");
381 UnicodeSet syn_pp;
382 UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode);
383 UnicodeSet syn_list(
384 "[!-/\\:-@\\[-\\^`\\{-~"
385 "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7"
386 "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775"
387 "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode);
388 UnicodeSet ws_pp;
389 UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode);
390 UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode);
391 UnicodeSet syn_ws_pp;
392 UnicodeSet syn_ws_prop(syn_prop);
393 syn_ws_prop.addAll(ws_prop);
394 for(UChar32 c=0; c<=0xffff; ++c) {
395 if(PatternProps::isSyntax(c)) {
396 syn_pp.add(c);
397 }
398 if(PatternProps::isWhiteSpace(c)) {
399 ws_pp.add(c);
400 }
401 if(PatternProps::isSyntaxOrWhiteSpace(c)) {
402 syn_ws_pp.add(c);
403 }
404 }
405 compareUSets(syn_pp, syn_prop,
406 "PatternProps.isSyntax()", "[:Pattern_Syntax:]", true);
407 compareUSets(syn_pp, syn_list,
408 "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", true);
409 compareUSets(ws_pp, ws_prop,
410 "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", true);
411 compareUSets(ws_pp, ws_list,
412 "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", true);
413 compareUSets(syn_ws_pp, syn_ws_prop,
414 "PatternProps.isSyntaxOrWhiteSpace()",
415 "[[:Pattern_Syntax:][:Pattern_White_Space:]]", true);
416 }
417
418 // So far only minimal port of Java & cucdtst.c compareUSets().
419 UBool
compareUSets(const UnicodeSet & a,const UnicodeSet & b,const char * a_name,const char * b_name,UBool diffIsError)420 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b,
421 const char *a_name, const char *b_name,
422 UBool diffIsError) {
423 UBool same= a==b;
424 if(!same && diffIsError) {
425 errln("Sets are different: %s vs. %s\n", a_name, b_name);
426 }
427 return same;
428 }
429
430 namespace {
431
432 /**
433 * Maps a special script code to the most common script of its encoded characters.
434 */
getCharScript(UScriptCode script)435 UScriptCode getCharScript(UScriptCode script) {
436 switch(script) {
437 case USCRIPT_HAN_WITH_BOPOMOFO:
438 case USCRIPT_SIMPLIFIED_HAN:
439 case USCRIPT_TRADITIONAL_HAN:
440 return USCRIPT_HAN;
441 case USCRIPT_JAPANESE:
442 return USCRIPT_HIRAGANA;
443 case USCRIPT_JAMO:
444 case USCRIPT_KOREAN:
445 return USCRIPT_HANGUL;
446 case USCRIPT_SYMBOLS_EMOJI:
447 return USCRIPT_SYMBOLS;
448 default:
449 return script;
450 }
451 }
452
453 } // namespace
454
TestScriptMetadata()455 void UnicodeTest::TestScriptMetadata() {
456 IcuTestErrorCode errorCode(*this, "TestScriptMetadata()");
457 UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode);
458 // So far, sample characters are uppercase.
459 // Georgian is special.
460 UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode);
461 for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) {
462 UScriptCode sc = (UScriptCode)sci;
463 // Run the test with -v to see which script has failures:
464 // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 6 FAIL
465 logln(uscript_getShortName(sc));
466 UScriptUsage usage = uscript_getUsage(sc);
467 UnicodeString sample = uscript_getSampleUnicodeString(sc);
468 UnicodeSet scriptSet;
469 scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode);
470 if(usage == USCRIPT_USAGE_NOT_ENCODED) {
471 assertTrue("not encoded, no sample", sample.isEmpty());
472 assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc));
473 assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc));
474 assertFalse("not encoded, not cased", uscript_isCased(sc));
475 assertTrue("not encoded, no characters", scriptSet.isEmpty());
476 } else {
477 assertFalse("encoded, has a sample character", sample.isEmpty());
478 UChar32 firstChar = sample.char32At(0);
479 UScriptCode charScript = getCharScript(sc);
480 assertEquals("script(sample(script))",
481 (int32_t)charScript, (int32_t)uscript_getScript(firstChar, errorCode));
482 assertEquals("RTL vs. set", (UBool)rtl.contains(firstChar), (UBool)uscript_isRightToLeft(sc));
483 assertEquals("cased vs. set", (UBool)cased.contains(firstChar), (UBool)uscript_isCased(sc));
484 assertEquals("encoded, has characters", (UBool)(sc == charScript), (UBool)(!scriptSet.isEmpty()));
485 if(uscript_isRightToLeft(sc)) {
486 rtl.removeAll(scriptSet);
487 }
488 if(uscript_isCased(sc)) {
489 cased.removeAll(scriptSet);
490 }
491 }
492 }
493 UnicodeString pattern;
494 assertEquals("no remaining RTL characters",
495 UnicodeString("[]"), rtl.toPattern(pattern));
496 assertEquals("no remaining cased characters",
497 UnicodeString("[]"), cased.toPattern(pattern));
498
499 assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN));
500 assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI));
501 assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN));
502 }
503
TestBidiPairedBracketType()504 void UnicodeTest::TestBidiPairedBracketType() {
505 // BidiBrackets-6.3.0.txt says:
506 //
507 // The set of code points listed in this file was originally derived
508 // using the character properties General_Category (gc), Bidi_Class (bc),
509 // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows:
510 // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe,
511 // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket
512 // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type
513 // property values are Open and Close, respectively.
514 IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()");
515 UnicodeSet bpt("[:^bpt=n:]", errorCode);
516 assertTrue("bpt!=None is not empty", !bpt.isEmpty());
517 // The following should always be true.
518 UnicodeSet mirrored("[:Bidi_M:]", errorCode);
519 UnicodeSet other_neutral("[:bc=ON:]", errorCode);
520 assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt));
521 assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt));
522 // The following are true at least initially in Unicode 6.3.
523 UnicodeSet bpt_open("[:bpt=o:]", errorCode);
524 UnicodeSet bpt_close("[:bpt=c:]", errorCode);
525 UnicodeSet ps("[:Ps:]", errorCode);
526 UnicodeSet pe("[:Pe:]", errorCode);
527 assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open));
528 assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close));
529 }
530
TestEmojiProperties()531 void UnicodeTest::TestEmojiProperties() {
532 assertFalse("space is not Emoji", u_hasBinaryProperty(0x20, UCHAR_EMOJI));
533 assertTrue("shooting star is Emoji", u_hasBinaryProperty(0x1F320, UCHAR_EMOJI));
534 IcuTestErrorCode errorCode(*this, "TestEmojiProperties()");
535 UnicodeSet emoji("[:Emoji:]", errorCode);
536 assertTrue("lots of Emoji", emoji.size() > 700);
537
538 assertTrue("shooting star is Emoji_Presentation",
539 u_hasBinaryProperty(0x1F320, UCHAR_EMOJI_PRESENTATION));
540 assertTrue("Fitzpatrick 6 is Emoji_Modifier",
541 u_hasBinaryProperty(0x1F3FF, UCHAR_EMOJI_MODIFIER));
542 assertTrue("happy person is Emoji_Modifier_Base",
543 u_hasBinaryProperty(0x1F64B, UCHAR_EMOJI_MODIFIER_BASE));
544 assertTrue("asterisk is Emoji_Component",
545 u_hasBinaryProperty(0x2A, UCHAR_EMOJI_COMPONENT));
546 assertTrue("copyright is Extended_Pictographic",
547 u_hasBinaryProperty(0xA9, UCHAR_EXTENDED_PICTOGRAPHIC));
548 }
549
550 namespace {
551
hbp(const char16_t * s,int32_t length,UProperty which)552 UBool hbp(const char16_t *s, int32_t length, UProperty which) {
553 return u_stringHasBinaryProperty(s, length, which);
554 }
555
hbp(const char16_t * s,UProperty which)556 UBool hbp(const char16_t *s, UProperty which) {
557 return u_stringHasBinaryProperty(s, -1, which);
558 }
559
560 } // namespace
561
TestEmojiPropertiesOfStrings()562 void UnicodeTest::TestEmojiPropertiesOfStrings() {
563 // Property of code points, for coverage
564 assertFalse("null is not Ideographic", hbp(nullptr, 1, UCHAR_IDEOGRAPHIC));
565 assertFalse("null/0 is not Ideographic", hbp(nullptr, -1, UCHAR_IDEOGRAPHIC));
566 assertFalse("empty string is not Ideographic", hbp(u"", 0, UCHAR_IDEOGRAPHIC));
567 assertFalse("empty string/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
568 assertFalse("L is not Ideographic", hbp(u"L", 1, UCHAR_IDEOGRAPHIC));
569 assertFalse("L/0 is not Ideographic", hbp(u"L", -1, UCHAR_IDEOGRAPHIC));
570 assertTrue("U+4E02 is Ideographic", hbp(u"丂", 1, UCHAR_IDEOGRAPHIC));
571 assertTrue("U+4E02/0 is Ideographic", hbp(u"丂", -1, UCHAR_IDEOGRAPHIC));
572 assertFalse("2*U+4E02 is not Ideographic", hbp(u"丂丂", 2, UCHAR_IDEOGRAPHIC));
573 assertFalse("2*U+4E02/0 is not Ideographic", hbp(u"丂丂", -1, UCHAR_IDEOGRAPHIC));
574 assertFalse("bicycle is not Ideographic", hbp(u"", 2, UCHAR_IDEOGRAPHIC));
575 assertFalse("bicycle/0 is not Ideographic", hbp(u"", -1, UCHAR_IDEOGRAPHIC));
576 assertTrue("U+23456 is Ideographic", hbp(u"\U00023456", 2, UCHAR_IDEOGRAPHIC));
577 assertTrue("U+23456/0 is Ideographic", hbp(u"\U00023456", -1, UCHAR_IDEOGRAPHIC));
578
579 // Property of (code points and) strings
580 assertFalse("null is not Basic_Emoji", hbp(nullptr, 1, UCHAR_BASIC_EMOJI));
581 assertFalse("null/0 is not Basic_Emoji", hbp(nullptr, -1, UCHAR_BASIC_EMOJI));
582 assertFalse("empty string is not Basic_Emoji", hbp(u"", 0, UCHAR_BASIC_EMOJI));
583 assertFalse("empty string/0 is not Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
584 assertFalse("L is not Basic_Emoji", hbp(u"L", 1, UCHAR_BASIC_EMOJI));
585 assertFalse("L/0 is not Basic_Emoji", hbp(u"L", -1, UCHAR_BASIC_EMOJI));
586 assertFalse("U+4E02 is not Basic_Emoji", hbp(u"丂", 1, UCHAR_BASIC_EMOJI));
587 assertFalse("U+4E02/0 is not Basic_Emoji", hbp(u"丂", -1, UCHAR_BASIC_EMOJI));
588 assertTrue("bicycle is Basic_Emoji", hbp(u"", 2, UCHAR_BASIC_EMOJI));
589 assertTrue("bicycle/0 is Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
590 assertFalse("2*bicycle is Basic_Emoji", hbp(u"", 4, UCHAR_BASIC_EMOJI));
591 assertFalse("2*bicycle/0 is Basic_Emoji", hbp(u"", -1, UCHAR_BASIC_EMOJI));
592 assertFalse("U+23456 is not Basic_Emoji", hbp(u"\U00023456", 2, UCHAR_BASIC_EMOJI));
593 assertFalse("U+23456/0 is not Basic_Emoji", hbp(u"\U00023456", -1, UCHAR_BASIC_EMOJI));
594
595 assertFalse("stopwatch is not Basic_Emoji", hbp(u"⏱", 1, UCHAR_BASIC_EMOJI));
596 assertFalse("stopwatch/0 is not Basic_Emoji", hbp(u"⏱", -1, UCHAR_BASIC_EMOJI));
597 assertTrue("stopwatch+emoji is Basic_Emoji", hbp(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI));
598 assertTrue("stopwatch+emoji/0 is Basic_Emoji", hbp(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI));
599
600 assertFalse("chipmunk is not Basic_Emoji", hbp(u"", UCHAR_BASIC_EMOJI));
601 assertTrue("chipmunk+emoji is Basic_Emoji", hbp(u"\uFE0F", UCHAR_BASIC_EMOJI));
602 assertFalse("chipmunk+2*emoji is not Basic_Emoji", hbp(u"\uFE0F\uFE0F", UCHAR_BASIC_EMOJI));
603
604 // Properties of strings (only)
605 assertFalse("4+emoji is not Emoji_Keycap_Sequence",
606 hbp(u"4\uFE0F", UCHAR_EMOJI_KEYCAP_SEQUENCE));
607 assertTrue("4+emoji+keycap is Emoji_Keycap_Sequence",
608 hbp(u"4\uFE0F\u20E3", UCHAR_EMOJI_KEYCAP_SEQUENCE));
609
610 assertFalse("[B] is not RGI_Emoji_Flag_Sequence",
611 hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
612 assertTrue("[BE] is RGI_Emoji_Flag_Sequence",
613 hbp(u"", UCHAR_RGI_EMOJI_FLAG_SEQUENCE));
614
615 assertFalse("[flag] is not RGI_Emoji_Tag_Sequence",
616 hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
617 assertTrue("[Scotland] is RGI_Emoji_Tag_Sequence",
618 hbp(u"", UCHAR_RGI_EMOJI_TAG_SEQUENCE));
619
620 assertFalse("bicyclist is not RGI_Emoji_Modifier_Sequence",
621 hbp(u"", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
622 assertTrue("bicyclist+medium is RGI_Emoji_Modifier_Sequence",
623 hbp(u"\U0001F3FD", UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE));
624
625 assertFalse("woman+dark+ZWJ is not RGI_Emoji_ZWJ_Sequence",
626 hbp(u"\U0001F3FF\u200D", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
627 assertTrue("woman pilot: dark skin tone is RGI_Emoji_ZWJ_Sequence",
628 hbp(u"\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI_ZWJ_SEQUENCE));
629
630 // RGI_Emoji = all of the above
631 assertFalse("stopwatch is not RGI_Emoji", hbp(u"⏱", UCHAR_RGI_EMOJI));
632 assertTrue("stopwatch+emoji is RGI_Emoji", hbp(u"⏱\uFE0F", UCHAR_RGI_EMOJI));
633
634 assertFalse("chipmunk is not RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
635 assertTrue("chipmunk+emoji is RGI_Emoji", hbp(u"\uFE0F", UCHAR_RGI_EMOJI));
636
637 assertFalse("4+emoji is not RGI_Emoji", hbp(u"4\uFE0F", UCHAR_RGI_EMOJI));
638 assertTrue("4+emoji+keycap is RGI_Emoji", hbp(u"4\uFE0F\u20E3", UCHAR_RGI_EMOJI));
639
640 assertFalse("[B] is not RGI_Emoji", hbp(u"\U0001F1E7", UCHAR_RGI_EMOJI));
641 assertTrue("[BE] is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
642
643 assertTrue("[flag] is RGI_Emoji", hbp(u"\U0001F3F4", UCHAR_RGI_EMOJI));
644 assertTrue("[Scotland] is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
645
646 assertTrue("bicyclist is RGI_Emoji", hbp(u"", UCHAR_RGI_EMOJI));
647 assertTrue("bicyclist+medium is RGI_Emoji", hbp(u"\U0001F3FD", UCHAR_RGI_EMOJI));
648
649 assertFalse("woman+dark+ZWJ is not RGI_Emoji", hbp(u"\U0001F3FF\u200D", UCHAR_RGI_EMOJI));
650 assertTrue("woman pilot: dark skin tone is RGI_Emoji",
651 hbp(u"\U0001F3FF\u200D✈\uFE0F", UCHAR_RGI_EMOJI));
652
653 // UnicodeSet with properties of strings
654 IcuTestErrorCode errorCode(*this, "TestEmojiPropertiesOfStrings()");
655 UnicodeSet basic("[:Basic_Emoji:]", errorCode);
656 UnicodeSet keycaps("[:Emoji_Keycap_Sequence:]", errorCode);
657 UnicodeSet modified("[:RGI_Emoji_Modifier_Sequence:]", errorCode);
658 UnicodeSet flags("[:RGI_Emoji_Flag_Sequence:]", errorCode);
659 UnicodeSet tags("[:RGI_Emoji_Tag_Sequence:]", errorCode);
660 UnicodeSet combos("[:RGI_Emoji_ZWJ_Sequence:]", errorCode);
661 UnicodeSet rgi("[:RGI_Emoji:]", errorCode);
662 if (errorCode.errDataIfFailureAndReset("UnicodeSets")) {
663 return;
664 }
665
666 // union of all sets except for "rgi" -- should be the same as "rgi"
667 UnicodeSet all(basic);
668 all.addAll(keycaps).addAll(modified).addAll(flags).addAll(tags).addAll(combos);
669
670 UnicodeSet basicOnlyCp(basic);
671 basicOnlyCp.removeAllStrings();
672
673 UnicodeSet rgiOnlyCp(rgi);
674 rgiOnlyCp.removeAllStrings();
675
676 assertTrue("lots of Basic_Emoji", basic.size() > 1000);
677 assertEquals("12 Emoji_Keycap_Sequence", 12, keycaps.size());
678 assertTrue("lots of RGI_Emoji_Modifier_Sequence", modified.size() > 600);
679 assertTrue("lots of RGI_Emoji_Flag_Sequence", flags.size() > 250);
680 assertTrue("some RGI_Emoji_Tag_Sequence", tags.size() >= 3);
681 assertTrue("lots of RGI_Emoji_ZWJ_Sequence", combos.size() > 1300);
682 assertTrue("lots of RGI_Emoji", rgi.size() > 3000);
683
684 assertTrue("lots of Basic_Emoji code points", basicOnlyCp.size() > 1000);
685 assertTrue("Basic_Emoji.hasStrings()", basic.hasStrings());
686 assertEquals("no Emoji_Keycap_Sequence code points", 0, keycaps.getRangeCount());
687 assertEquals("lots of RGI_Emoji_Modifier_Sequence", 0, modified.getRangeCount());
688 assertEquals("lots of RGI_Emoji_Flag_Sequence", 0, flags.getRangeCount());
689 assertEquals("some RGI_Emoji_Tag_Sequence", 0, tags.getRangeCount());
690 assertEquals("lots of RGI_Emoji_ZWJ_Sequence", 0, combos.getRangeCount());
691
692 assertTrue("lots of RGI_Emoji code points", rgiOnlyCp.size() > 1000);
693 assertTrue("RGI_Emoji.hasStrings()", rgi.hasStrings());
694 assertEquals("RGI_Emoji/only-cp.size() == Basic_Emoji/only-cp.size()",
695 rgiOnlyCp.size(), basicOnlyCp.size());
696 assertTrue("RGI_Emoji/only-cp == Basic_Emoji/only-cp", rgiOnlyCp == basicOnlyCp);
697 assertEquals("RGI_Emoji.size() == union.size()", rgi.size(), all.size());
698 assertTrue("RGI_Emoji == union", rgi == all);
699
700 assertTrue("Basic_Emoji.contains(stopwatch+emoji)", basic.contains(u"⏱\uFE0F"));
701 assertTrue("Basic_Emoji.contains(chipmunk+emoji)", basic.contains(u"\uFE0F"));
702 assertTrue("Emoji_Keycap_Sequence.contains(4+emoji+keycap)",
703 keycaps.contains(u"4\uFE0F\u20E3"));
704 assertTrue("RGI_Emoji_Flag_Sequence.contains([BE])", flags.contains(u""));
705 assertTrue("RGI_Emoji_Tag_Sequence.contains([Scotland])", tags.contains(u""));
706 assertTrue("RGI_Emoji_Modifier_Sequence.contains(bicyclist+medium)",
707 modified.contains(u"\U0001F3FD"));
708 assertTrue("RGI_Emoji_ZWJ_Sequence.contains(woman pilot: dark skin tone)",
709 combos.contains(u"\U0001F3FF\u200D✈\uFE0F"));
710 assertTrue("RGI_Emoji.contains(stopwatch+emoji)", rgi.contains(u"⏱\uFE0F"));
711 assertTrue("RGI_Emoji.contains(chipmunk+emoji)", rgi.contains(u"\uFE0F"));
712 assertTrue("RGI_Emoji.contains(4+emoji+keycap)", rgi.contains(u"4\uFE0F\u20E3"));
713 assertTrue("RGI_Emoji.contains([BE] is RGI_Emoji)", rgi.contains(u""));
714 assertTrue("RGI_Emoji.contains([flag])", rgi.contains(u"\U0001F3F4"));
715 assertTrue("RGI_Emoji.contains([Scotland])", rgi.contains(u""));
716 assertTrue("RGI_Emoji.contains(bicyclist)", rgi.contains(u""));
717 assertTrue("RGI_Emoji.contains(bicyclist+medium)", rgi.contains(u"\U0001F3FD"));
718 assertTrue("RGI_Emoji.contains(woman pilot: dark skin tone)", rgi.contains(u"\U0001F3FF\u200D✈\uFE0F"));
719 }
720
TestIndicPositionalCategory()721 void UnicodeTest::TestIndicPositionalCategory() {
722 IcuTestErrorCode errorCode(*this, "TestIndicPositionalCategory()");
723 UnicodeSet na(u"[:InPC=NA:]", errorCode);
724 assertTrue("mostly NA", 1000000 <= na.size() && na.size() <= UCHAR_MAX_VALUE - 500);
725 UnicodeSet vol(u"[:InPC=Visual_Order_Left:]", errorCode);
726 assertTrue("some Visual_Order_Left", 19 <= vol.size() && vol.size() <= 100);
727 assertEquals("U+08FF: NA", U_INPC_NA,
728 u_getIntPropertyValue(0x08FF, UCHAR_INDIC_POSITIONAL_CATEGORY));
729 assertEquals("U+0900: Top", U_INPC_TOP,
730 u_getIntPropertyValue(0x0900, UCHAR_INDIC_POSITIONAL_CATEGORY));
731 assertEquals("U+10A06: Overstruck", U_INPC_OVERSTRUCK,
732 u_getIntPropertyValue(0x10A06, UCHAR_INDIC_POSITIONAL_CATEGORY));
733 }
734
TestIndicSyllabicCategory()735 void UnicodeTest::TestIndicSyllabicCategory() {
736 IcuTestErrorCode errorCode(*this, "TestIndicSyllabicCategory()");
737 UnicodeSet other(u"[:InSC=Other:]", errorCode);
738 assertTrue("mostly Other", 1000000 <= other.size() && other.size() <= UCHAR_MAX_VALUE - 500);
739 UnicodeSet ava(u"[:InSC=Avagraha:]", errorCode);
740 assertTrue("some Avagraha", 16 <= ava.size() && ava.size() <= 100);
741 assertEquals("U+08FF: Other", U_INSC_OTHER,
742 u_getIntPropertyValue(0x08FF, UCHAR_INDIC_SYLLABIC_CATEGORY));
743 assertEquals("U+0900: Bindu", U_INSC_BINDU,
744 u_getIntPropertyValue(0x0900, UCHAR_INDIC_SYLLABIC_CATEGORY));
745 assertEquals("U+11065: Brahmi_Joining_Number", U_INSC_BRAHMI_JOINING_NUMBER,
746 u_getIntPropertyValue(0x11065, UCHAR_INDIC_SYLLABIC_CATEGORY));
747 }
748
TestVerticalOrientation()749 void UnicodeTest::TestVerticalOrientation() {
750 IcuTestErrorCode errorCode(*this, "TestVerticalOrientation()");
751 UnicodeSet r(u"[:vo=R:]", errorCode);
752 assertTrue("mostly R", 0xc0000 <= r.size() && r.size() <= 0xd0000);
753 UnicodeSet u(u"[:vo=U:]", errorCode);
754 assertTrue("much U", 0x40000 <= u.size() && u.size() <= 0x50000);
755 UnicodeSet tu(u"[:vo=Tu:]", errorCode);
756 assertTrue("some Tu", 147 <= tu.size() && tu.size() <= 300);
757 assertEquals("U+0E01: Rotated", U_VO_ROTATED,
758 u_getIntPropertyValue(0x0E01, UCHAR_VERTICAL_ORIENTATION));
759 assertEquals("U+3008: Transformed_Rotated", U_VO_TRANSFORMED_ROTATED,
760 u_getIntPropertyValue(0x3008, UCHAR_VERTICAL_ORIENTATION));
761 assertEquals("U+33333: Upright", U_VO_UPRIGHT,
762 u_getIntPropertyValue(0x33333, UCHAR_VERTICAL_ORIENTATION));
763 }
764
TestDefaultScriptExtensions()765 void UnicodeTest::TestDefaultScriptExtensions() {
766 // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii
767 // but some of its characters revert to scx=<script> which is usually Common.
768 IcuTestErrorCode errorCode(*this, "TestDefaultScriptExtensions()");
769 UScriptCode scx[20];
770 scx[0] = USCRIPT_INVALID_CODE;
771 assertEquals("U+3000 num scx", 1, // IDEOGRAPHIC SPACE
772 uscript_getScriptExtensions(0x3000, scx, UPRV_LENGTHOF(scx), errorCode));
773 assertEquals("U+3000 num scx[0]", USCRIPT_COMMON, scx[0]);
774 scx[0] = USCRIPT_INVALID_CODE;
775 assertEquals("U+3012 num scx", 1, // POSTAL MARK
776 uscript_getScriptExtensions(0x3012, scx, UPRV_LENGTHOF(scx), errorCode));
777 assertEquals("U+3012 num scx[0]", USCRIPT_COMMON, scx[0]);
778 }
779
TestInvalidCodePointFolding()780 void UnicodeTest::TestInvalidCodePointFolding() {
781 // Test behavior when an invalid code point is passed to u_foldCase
782 static const UChar32 invalidCodePoints[] = {
783 0xD800, // lead surrogate
784 0xDFFF, // trail surrogate
785 0xFDD0, // noncharacter
786 0xFFFF, // noncharacter
787 0x110000, // out of range
788 -1 // negative
789 };
790 for (int32_t i=0; i<UPRV_LENGTHOF(invalidCodePoints); ++i) {
791 UChar32 cp = invalidCodePoints[i];
792 assertEquals("Invalid code points should be echoed back",
793 cp, u_foldCase(cp, U_FOLD_CASE_DEFAULT));
794 assertEquals("Invalid code points should be echoed back",
795 cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I));
796 }
797 }
798
TestBinaryCharacterProperties()799 void UnicodeTest::TestBinaryCharacterProperties() {
800 #if !UCONFIG_NO_NORMALIZATION
801 IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()");
802 // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
803 for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) {
804 const USet *uset = u_getBinaryPropertySet((UProperty)prop, errorCode);
805 if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", (int)prop)) {
806 continue;
807 }
808 const UnicodeSet &set = *UnicodeSet::fromUSet(uset);
809 int32_t count = set.getRangeCount();
810 if (count == 0) {
811 assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")",
812 u_hasBinaryProperty(0x20, (UProperty)prop));
813 assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")",
814 u_hasBinaryProperty(0x61, (UProperty)prop));
815 assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")",
816 u_hasBinaryProperty(0x4e00, (UProperty)prop));
817 } else {
818 UChar32 c = set.getRangeStart(0);
819 if (c > 0) {
820 assertFalse(
821 UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) +
822 u", " + prop + u")",
823 u_hasBinaryProperty(c - 1, (UProperty)prop));
824 }
825 assertTrue(
826 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
827 u", " + prop + u")",
828 u_hasBinaryProperty(c, (UProperty)prop));
829 c = set.getRangeEnd(count - 1);
830 assertTrue(
831 UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) +
832 u", " + prop + u")",
833 u_hasBinaryProperty(c, (UProperty)prop));
834 if (c < 0x10ffff) {
835 assertFalse(
836 UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) +
837 u", " + prop + u")",
838 u_hasBinaryProperty(c + 1, (UProperty)prop));
839 }
840 }
841 }
842 #endif
843 }
844
TestIntCharacterProperties()845 void UnicodeTest::TestIntCharacterProperties() {
846 #if !UCONFIG_NO_NORMALIZATION
847 IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()");
848 // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
849 for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) {
850 const UCPMap *map = u_getIntPropertyMap((UProperty)prop, errorCode);
851 if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", (int)prop)) {
852 continue;
853 }
854 uint32_t value;
855 UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
856 assertTrue("int property first range", end >= 0);
857 UChar32 c = end / 2;
858 assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c),
859 u_getIntPropertyValue(c, (UProperty)prop), value);
860 end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
861 assertTrue("int property later range", end >= 0);
862 assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end),
863 u_getIntPropertyValue(end, (UProperty)prop), value);
864 // ucpmap_get() API coverage
865 // TODO: move to cucdtst.c
866 assertEquals(
867 "int property upcmap_get(U+0061)",
868 u_getIntPropertyValue(0x61, (UProperty)prop), ucpmap_get(map, 0x61));
869 }
870 #endif
871 }
872
873 namespace {
874
getPropName(UProperty property,int32_t nameChoice)875 const char *getPropName(UProperty property, int32_t nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
876 const char *name = u_getPropertyName(property, (UPropertyNameChoice)nameChoice);
877 return name != nullptr ? name : "null";
878 }
879
getValueName(UProperty property,int32_t value,int32_t nameChoice)880 const char *getValueName(UProperty property, int32_t value, int32_t nameChoice)
881 UPRV_NO_SANITIZE_UNDEFINED {
882 const char *name = u_getPropertyValueName(property, value, (UPropertyNameChoice)nameChoice);
883 return name != nullptr ? name : "null";
884 }
885
886 } // namespace
887
TestPropertyNames()888 void UnicodeTest::TestPropertyNames() {
889 IcuTestErrorCode errorCode(*this, "TestPropertyNames()");
890 // Test names of certain properties & values.
891 // The UPropertyNameChoice is really an integer with only a couple of named constants.
892 UProperty prop = UCHAR_WHITE_SPACE;
893 constexpr int32_t SHORT = U_SHORT_PROPERTY_NAME;
894 constexpr int32_t LONG = U_LONG_PROPERTY_NAME;
895 assertEquals("White_Space: index -1", "null", getPropName(prop, -1));
896 assertEquals("White_Space: short", "WSpace", getPropName(prop, SHORT));
897 assertEquals("White_Space: long", "White_Space", getPropName(prop, LONG));
898 assertEquals("White_Space: index 2", "space", getPropName(prop, 2));
899 assertEquals("White_Space: index 3", "null", getPropName(prop, 3));
900
901 prop = UCHAR_SIMPLE_CASE_FOLDING;
902 assertEquals("Simple_Case_Folding: index -1", "null", getPropName(prop, -1));
903 assertEquals("Simple_Case_Folding: short", "scf", getPropName(prop, SHORT));
904 assertEquals("Simple_Case_Folding: long", "Simple_Case_Folding", getPropName(prop, LONG));
905 assertEquals("Simple_Case_Folding: index 2", "sfc", getPropName(prop, 2));
906 assertEquals("Simple_Case_Folding: index 3", "null", getPropName(prop, 3));
907
908 prop = UCHAR_CASED;
909 assertEquals("Cased=Y: index -1", "null", getValueName(prop, 1, -1));
910 assertEquals("Cased=Y: short", "Y", getValueName(prop, 1, SHORT));
911 assertEquals("Cased=Y: long", "Yes", getValueName(prop, 1, LONG));
912 assertEquals("Cased=Y: index 2", "T", getValueName(prop, 1, 2));
913 assertEquals("Cased=Y: index 3", "True", getValueName(prop, 1, 3));
914 assertEquals("Cased=Y: index 4", "null", getValueName(prop, 1, 4));
915
916 prop = UCHAR_DECOMPOSITION_TYPE;
917 int32_t value = U_DT_NOBREAK;
918 assertEquals("dt=Nb: index -1", "null", getValueName(prop, value, -1));
919 assertEquals("dt=Nb: short", "Nb", getValueName(prop, value, SHORT));
920 assertEquals("dt=Nb: long", "Nobreak", getValueName(prop, value, LONG));
921 assertEquals("dt=Nb: index 2", "nb", getValueName(prop, value, 2));
922 assertEquals("dt=Nb: index 3", "null", getValueName(prop, value, 3));
923
924 // Canonical_Combining_Class:
925 // The UCD inserts the numeric values in the second filed of its PropertyValueAliases.txt lines.
926 // In ICU, we don't treat these as names,
927 // they are just the numeric values returned by u_getCombiningClass().
928 // We return the real short and long names for the usual choice constants.
929 prop = UCHAR_CANONICAL_COMBINING_CLASS;
930 assertEquals("ccc=230: index -1", "null", getValueName(prop, 230, -1));
931 assertEquals("ccc=230: short", "A", getValueName(prop, 230, SHORT));
932 assertEquals("ccc=230: long", "Above", getValueName(prop, 230, LONG));
933 assertEquals("ccc=230: index 2", "null", getValueName(prop, 230, 2));
934
935 prop = UCHAR_GENERAL_CATEGORY;
936 value = U_DECIMAL_DIGIT_NUMBER;
937 assertEquals("gc=Nd: index -1", "null", getValueName(prop, value, -1));
938 assertEquals("gc=Nd: short", "Nd", getValueName(prop, value, SHORT));
939 assertEquals("gc=Nd: long", "Decimal_Number", getValueName(prop, value, LONG));
940 assertEquals("gc=Nd: index 2", "digit", getValueName(prop, value, 2));
941 assertEquals("gc=Nd: index 3", "null", getValueName(prop, value, 3));
942
943 prop = UCHAR_GENERAL_CATEGORY_MASK;
944 value = U_GC_P_MASK;
945 assertEquals("gc=P mask: index -1", "null", getValueName(prop, value, -1));
946 assertEquals("gc=P mask: short", "P", getValueName(prop, value, SHORT));
947 assertEquals("gc=P mask: long", "Punctuation", getValueName(prop, value, LONG));
948 assertEquals("gc=P mask: index 2", "punct", getValueName(prop, value, 2));
949 assertEquals("gc=P mask: index 3", "null", getValueName(prop, value, 3));
950 }
951