• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1997-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 *        Name                     Description
14 *     Madhu Katragadda            Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17 
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21 
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28 #include "unicode/utf16.h"
29 #include "unicode/utf_old.h"
30 #include "cintltst.h"
31 #include "putilimp.h"
32 #include "uparse.h"
33 #include "ucase.h"
34 #include "ubidi_props.h"
35 #include "uprops.h"
36 #include "uset_imp.h"
37 #include "usc_impl.h"
38 #include "udatamem.h"
39 #include "cucdapi.h"
40 #include "cmemory.h"
41 
42 /* prototypes --------------------------------------------------------------- */
43 
44 static void TestUpperLower(void);
45 static void TestLetterNumber(void);
46 static void TestMisc(void);
47 static void TestPOSIX(void);
48 static void TestControlPrint(void);
49 static void TestIdentifier(void);
50 static void TestUnicodeData(void);
51 static void TestCodeUnit(void);
52 static void TestCodePoint(void);
53 static void TestCharLength(void);
54 static void TestCharNames(void);
55 static void TestUCharFromNameUnderflow(void);
56 static void TestMirroring(void);
57 static void TestUScriptRunAPI(void);
58 static void TestAdditionalProperties(void);
59 static void TestNumericProperties(void);
60 static void TestPropertyNames(void);
61 static void TestPropertyValues(void);
62 static void TestConsistency(void);
63 static void TestCaseFolding(void);
64 static void TestBinaryCharacterPropertiesAPI(void);
65 static void TestIntCharacterPropertiesAPI(void);
66 
67 /* internal methods used */
68 static int32_t MakeProp(char* str);
69 static int32_t MakeDir(char* str);
70 
71 /* helpers ------------------------------------------------------------------ */
72 
73 static void
parseUCDFile(const char * filename,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)74 parseUCDFile(const char *filename,
75              char *fields[][2], int32_t fieldCount,
76              UParseLineFn *lineFn, void *context,
77              UErrorCode *pErrorCode) {
78     char path[256];
79     char backupPath[256];
80 
81     if(U_FAILURE(*pErrorCode)) {
82         return;
83     }
84 
85     /* Look inside ICU_DATA first */
86     strcpy(path, u_getDataDirectory());
87     strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
88     strcat(path, filename);
89 
90     /* As a fallback, try to guess where the source data was located
91      *    at the time ICU was built, and look there.
92      */
93     strcpy(backupPath, ctest_dataSrcDir());
94     strcat(backupPath, U_FILE_SEP_STRING);
95     strcat(backupPath, "unidata" U_FILE_SEP_STRING);
96     strcat(backupPath, filename);
97 
98     u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
99     if(*pErrorCode==U_FILE_ACCESS_ERROR) {
100         *pErrorCode=U_ZERO_ERROR;
101         u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
102     }
103     if(U_FAILURE(*pErrorCode)) {
104         log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
105     }
106 }
107 
108 /* test data ---------------------------------------------------------------- */
109 
110 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
111 static const int32_t tagValues[] =
112     {
113     /* Mn */ U_NON_SPACING_MARK,
114     /* Mc */ U_COMBINING_SPACING_MARK,
115     /* Me */ U_ENCLOSING_MARK,
116     /* Nd */ U_DECIMAL_DIGIT_NUMBER,
117     /* Nl */ U_LETTER_NUMBER,
118     /* No */ U_OTHER_NUMBER,
119     /* Zs */ U_SPACE_SEPARATOR,
120     /* Zl */ U_LINE_SEPARATOR,
121     /* Zp */ U_PARAGRAPH_SEPARATOR,
122     /* Cc */ U_CONTROL_CHAR,
123     /* Cf */ U_FORMAT_CHAR,
124     /* Cs */ U_SURROGATE,
125     /* Co */ U_PRIVATE_USE_CHAR,
126     /* Cn */ U_UNASSIGNED,
127     /* Lu */ U_UPPERCASE_LETTER,
128     /* Ll */ U_LOWERCASE_LETTER,
129     /* Lt */ U_TITLECASE_LETTER,
130     /* Lm */ U_MODIFIER_LETTER,
131     /* Lo */ U_OTHER_LETTER,
132     /* Pc */ U_CONNECTOR_PUNCTUATION,
133     /* Pd */ U_DASH_PUNCTUATION,
134     /* Ps */ U_START_PUNCTUATION,
135     /* Pe */ U_END_PUNCTUATION,
136     /* Po */ U_OTHER_PUNCTUATION,
137     /* Sm */ U_MATH_SYMBOL,
138     /* Sc */ U_CURRENCY_SYMBOL,
139     /* Sk */ U_MODIFIER_SYMBOL,
140     /* So */ U_OTHER_SYMBOL,
141     /* Pi */ U_INITIAL_PUNCTUATION,
142     /* Pf */ U_FINAL_PUNCTUATION
143     };
144 
/*
 * Short names of bidirectional classes.
 * Each entry is at most 4 characters plus the terminating NUL.
 */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI", "LRI", "RLI", "PDI"
};
171 
172 void addUnicodeTest(TestNode** root);
173 
addUnicodeTest(TestNode ** root)174 void addUnicodeTest(TestNode** root)
175 {
176     addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
177     addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
178     addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
179     addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
180     addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
181     addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
182     addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
183     addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
184     addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
185     addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
186     addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
187     addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
188     addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
189     addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
190     addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
191     addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
192     addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
193     addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
194     addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
195     addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
196     addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
197     addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
198     addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
199     addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
200     addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
201     addTest(root, &TestBinaryCharacterPropertiesAPI,
202             "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI");
203     addTest(root, &TestIntCharacterPropertiesAPI,
204             "tsutil/cucdtst/TestIntCharacterPropertiesAPI");
205 }
206 
207 /*==================================================== */
208 /* test u_toupper() and u_tolower()                    */
209 /*==================================================== */
TestUpperLower()210 static void TestUpperLower()
211 {
212     const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
213     const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
214     U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
215     U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
216     int32_t i;
217 
218     U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
219     U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
220 
221 /*
222 Checks LetterLike Symbols which were previously a source of confusion
223 [Bertrand A. D. 02/04/98]
224 */
225     for (i=0x2100;i<0x2138;i++)
226     {
227         /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
228         if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
229         {
230             if (i != (int)u_tolower(i)) /* itself */
231                 log_err("Failed case conversion with itself: U+%04x\n", i);
232             if (i != (int)u_toupper(i))
233                 log_err("Failed case conversion with itself: U+%04x\n", i);
234         }
235     }
236 
237     for(i=0; i < u_strlen(upper); i++){
238         if(u_tolower(upper[i]) != lower[i]){
239             log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
240         }
241     }
242 
243     log_verbose("testing upper lower\n");
244     for (i = 0; i < 21; i++) {
245 
246         if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
247         {
248             log_err("Failed isLowerCase test at  %c\n", upperTest[i]);
249         }
250         else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
251          {
252             log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
253         }
254         else if (upperTest[i] != u_tolower(lowerTest[i]))
255         {
256             log_err("Failed case conversion from %c  To %c :\n", lowerTest[i], upperTest[i]);
257         }
258         else if (lowerTest[i] != u_toupper(upperTest[i]))
259          {
260             log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
261         }
262         else if (upperTest[i] != u_tolower(upperTest[i]))
263         {
264             log_err("Failed case conversion with itself: %c\n", upperTest[i]);
265         }
266         else if (lowerTest[i] != u_toupper(lowerTest[i]))
267         {
268             log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
269         }
270     }
271     log_verbose("done testing upper lower\n");
272 
273     log_verbose("testing u_istitle\n");
274     {
275         static const UChar expected[] = {
276             0x1F88,
277             0x1F89,
278             0x1F8A,
279             0x1F8B,
280             0x1F8C,
281             0x1F8D,
282             0x1F8E,
283             0x1F8F,
284             0x1F88,
285             0x1F89,
286             0x1F8A,
287             0x1F8B,
288             0x1F8C,
289             0x1F8D,
290             0x1F8E,
291             0x1F8F,
292             0x1F98,
293             0x1F99,
294             0x1F9A,
295             0x1F9B,
296             0x1F9C,
297             0x1F9D,
298             0x1F9E,
299             0x1F9F,
300             0x1F98,
301             0x1F99,
302             0x1F9A,
303             0x1F9B,
304             0x1F9C,
305             0x1F9D,
306             0x1F9E,
307             0x1F9F,
308             0x1FA8,
309             0x1FA9,
310             0x1FAA,
311             0x1FAB,
312             0x1FAC,
313             0x1FAD,
314             0x1FAE,
315             0x1FAF,
316             0x1FA8,
317             0x1FA9,
318             0x1FAA,
319             0x1FAB,
320             0x1FAC,
321             0x1FAD,
322             0x1FAE,
323             0x1FAF,
324             0x1FBC,
325             0x1FBC,
326             0x1FCC,
327             0x1FCC,
328             0x1FFC,
329             0x1FFC,
330         };
331         int32_t num = UPRV_LENGTHOF(expected);
332         for(i=0; i<num; i++){
333             if(!u_istitle(expected[i])){
334                 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
335             }
336         }
337 
338     }
339 }
340 
341 /* compare two sets and verify that their difference or intersection is empty */
342 static UBool
showADiffB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool expect,UBool diffIsError)343 showADiffB(const USet *a, const USet *b,
344            const char *a_name, const char *b_name,
345            UBool expect, UBool diffIsError) {
346     USet *aa;
347     int32_t i, start, end, length;
348     UErrorCode errorCode;
349 
350     /*
351      * expect:
352      * TRUE  -> a-b should be empty, that is, b should contain all of a
353      * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
354      */
355     if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
356         return TRUE;
357     }
358 
359     /* clone a to aa because a is const */
360     aa=uset_open(1, 0);
361     if(aa==NULL) {
362         /* unusual problem - out of memory? */
363         return FALSE;
364     }
365     uset_addAll(aa, a);
366 
367     /* compute the set in question */
368     if(expect) {
369         /* a-b */
370         uset_removeAll(aa, b);
371     } else {
372         /* a&b */
373         uset_retainAll(aa, b);
374     }
375 
376     /* aa is not empty because of the initial tests above; show its contents */
377     errorCode=U_ZERO_ERROR;
378     i=0;
379     for(;;) {
380         length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
381         if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
382             break; /* done */
383         }
384         if(U_FAILURE(errorCode)) {
385             log_err("error comparing %s with %s at difference item %d: %s\n",
386                 a_name, b_name, i, u_errorName(errorCode));
387             break;
388         }
389         if(length!=0) {
390             break; /* done with code points, got a string or -1 */
391         }
392 
393         if(diffIsError) {
394             if(expect) {
395                 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
396             } else {
397                 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
398             }
399         } else {
400             if(expect) {
401                 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
402             } else {
403                 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
404             }
405         }
406 
407         ++i;
408     }
409 
410     uset_close(aa);
411     return FALSE;
412 }
413 
414 static UBool
showAMinusB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)415 showAMinusB(const USet *a, const USet *b,
416             const char *a_name, const char *b_name,
417             UBool diffIsError) {
418     return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
419 }
420 
421 static UBool
showAIntersectB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)422 showAIntersectB(const USet *a, const USet *b,
423                 const char *a_name, const char *b_name,
424                 UBool diffIsError) {
425     return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
426 }
427 
428 static UBool
compareUSets(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)429 compareUSets(const USet *a, const USet *b,
430              const char *a_name, const char *b_name,
431              UBool diffIsError) {
432     /*
433      * Use an arithmetic & not a logical && so that both branches
434      * are always taken and all differences are shown.
435      */
436     return
437         showAMinusB(a, b, a_name, b_name, diffIsError) &
438         showAMinusB(b, a, b_name, a_name, diffIsError);
439 }
440 
441 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
TestLetterNumber()442 static void TestLetterNumber()
443 {
444     UChar i = 0x0000;
445 
446     log_verbose("Testing for isalpha\n");
447     for (i = 0x0041; i < 0x005B; i++) {
448         if (!u_isalpha(i))
449         {
450             log_err("Failed isLetter test at  %.4X\n", i);
451         }
452     }
453     for (i = 0x0660; i < 0x066A; i++) {
454         if (u_isalpha(i))
455         {
456             log_err("Failed isLetter test with numbers at %.4X\n", i);
457         }
458     }
459 
460     log_verbose("Testing for isdigit\n");
461     for (i = 0x0660; i < 0x066A; i++) {
462         if (!u_isdigit(i))
463         {
464             log_verbose("Failed isNumber test at %.4X\n", i);
465         }
466     }
467 
468     log_verbose("Testing for isalnum\n");
469     for (i = 0x0041; i < 0x005B; i++) {
470         if (!u_isalnum(i))
471         {
472             log_err("Failed isAlNum test at  %.4X\n", i);
473         }
474     }
475     for (i = 0x0660; i < 0x066A; i++) {
476         if (!u_isalnum(i))
477         {
478             log_err("Failed isAlNum test at  %.4X\n", i);
479         }
480     }
481 
482     {
483         /*
484          * The following checks work only starting from Unicode 4.0.
485          * Check the version number here.
486          */
487         static UVersionInfo u401={ 4, 0, 1, 0 };
488         UVersionInfo version;
489         u_getUnicodeVersion(version);
490         if(version[0]<4 || 0==memcmp(version, u401, 4)) {
491             return;
492         }
493     }
494 
495     {
496         /*
497          * Sanity check:
498          * Verify that exactly the digit characters have decimal digit values.
499          * This assumption is used in the implementation of u_digit()
500          * (which checks nt=de)
501          * compared with the parallel java.lang.Character.digit()
502          * (which checks Nd).
503          *
504          * This was not true in Unicode 3.2 and earlier.
505          * Unicode 4.0 fixed discrepancies.
506          * Unicode 4.0.1 re-introduced problems in this area due to an
507          * unintentionally incomplete last-minute change.
508          */
509         U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
510         U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
511 
512         USet *digits, *decimalValues;
513         UErrorCode errorCode;
514 
515         U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
516         U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
517         errorCode=U_ZERO_ERROR;
518         digits=uset_openPattern(digitsPattern, 6, &errorCode);
519         decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
520 
521         if(U_SUCCESS(errorCode)) {
522             compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
523         }
524 
525         uset_close(digits);
526         uset_close(decimalValues);
527     }
528 }
529 
/*
 * Calls propFn on each of the sampleCharsLength code points in sampleChars and
 * logs an error for every result that differs from expected.
 * propName is only used in the error message.
 */
static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
                                const UChar32 *sampleChars, int32_t sampleCharsLength,
                                UBool expected) {
    int32_t index = 0;
    while (index < sampleCharsLength) {
        UChar32 c = sampleChars[index++];
        UBool actual = propFn(c);
        if (actual == expected) {
            continue;
        }
        log_err("error: character property function %s(U+%04x)=%d is wrong\n",
                propName, c, actual);
    }
}
542 
543 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
TestMisc()544 static void TestMisc()
545 {
546     static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
547     static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
548     static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
549     static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
550     static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
551     static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
552 /*    static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
553     static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
554     static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
555     static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
556     static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
557 
558     static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
559 
560     uint32_t mask;
561 
562     int32_t i;
563     char icuVersion[U_MAX_VERSION_STRING_LENGTH];
564     UVersionInfo realVersion;
565 
566     memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
567 
568     testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
569     testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
570 
571     testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
572                         sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
573     testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
574                         sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
575 
576     testSampleCharProps(u_isWhitespace, "u_isWhitespace",
577                         sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
578     testSampleCharProps(u_isWhitespace, "u_isWhitespace",
579                         sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
580 
581     testSampleCharProps(u_isdefined, "u_isdefined",
582                         sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
583     testSampleCharProps(u_isdefined, "u_isdefined",
584                         sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
585 
586     testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
587     testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
588 
589     testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
590     testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
591 
592     for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
593         if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
594             log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
595                     sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
596         }
597     }
598 
599     /* Tests the ICU version #*/
600     u_getVersion(realVersion);
601     u_versionToString(realVersion, icuVersion);
602     if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
603     {
604         log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
605     }
606 #if defined(ICU_VERSION)
607     /* test only happens where we have configure.in with VERSION - sanity check. */
608     if(strcmp(U_ICU_VERSION, ICU_VERSION))
609     {
610         log_err("ICU version mismatch: Header says %s, build environment says %s.\n",  U_ICU_VERSION, ICU_VERSION);
611     }
612 #endif
613 
614     /* test U_GC_... */
615     if(
616         U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
617         U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
618         U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
619         U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
620         U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
621         U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
622     ) {
623         log_err("error: U_GET_GC_MASK does not work properly\n");
624     }
625 
626     mask=0;
627     mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
628 
629     mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
630     mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
631     mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
632     mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
633     mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
634 
635     mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
636     mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
637     mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
638 
639     mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
640     mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
641     mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
642 
643     mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
644     mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
645     mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
646 
647     mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
648     mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
649     mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
650     mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
651 
652     mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
653     mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
654     mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
655     mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
656     mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
657 
658     mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
659     mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
660     mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
661     mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
662 
663     mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
664     mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
665 
666     if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
667         log_err("error: problems with U_GC_XX_MASK constants\n");
668     }
669 
670     mask=0;
671     mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
672     mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
673     mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
674     mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
675     mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
676     mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
677     mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
678 
679     if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
680         log_err("error: problems with U_GC_Y_MASK constants\n");
681     }
682     {
683         static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
684         for(i=0; i<10; i++){
685             if(digit[i]!=u_forDigit(i,10)){
686                 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
687             }
688         }
689     }
690 
691     /* test u_digit() */
692     {
693         static const struct {
694             UChar32 c;
695             int8_t radix, value;
696         } data[]={
697             /* base 16 */
698             { 0x0031, 16, 1 },
699             { 0x0038, 16, 8 },
700             { 0x0043, 16, 12 },
701             { 0x0066, 16, 15 },
702             { 0x00e4, 16, -1 },
703             { 0x0662, 16, 2 },
704             { 0x06f5, 16, 5 },
705             { 0xff13, 16, 3 },
706             { 0xff41, 16, 10 },
707 
708             /* base 8 */
709             { 0x0031, 8, 1 },
710             { 0x0038, 8, -1 },
711             { 0x0043, 8, -1 },
712             { 0x0066, 8, -1 },
713             { 0x00e4, 8, -1 },
714             { 0x0662, 8, 2 },
715             { 0x06f5, 8, 5 },
716             { 0xff13, 8, 3 },
717             { 0xff41, 8, -1 },
718 
719             /* base 36 */
720             { 0x5a, 36, 35 },
721             { 0x7a, 36, 35 },
722             { 0xff3a, 36, 35 },
723             { 0xff5a, 36, 35 },
724 
725             /* wrong radix values */
726             { 0x0031, 1, -1 },
727             { 0xff3a, 37, -1 }
728         };
729 
730         for(i=0; i<UPRV_LENGTHOF(data); ++i) {
731             if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
732                 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
733                         data[i].c,
734                         data[i].radix,
735                         u_digit(data[i].c, data[i].radix),
736                         data[i].value);
737             }
738         }
739     }
740 }
741 
742 /* test C/POSIX-style functions --------------------------------------------- */
743 
/*
 * Bit flags, one per C/POSIX-style character class.
 * Bit n belongs to the n-th entry of the posixClasses[] table.
 */
#define ISAL 0x001   /* isalpha */
#define ISLO 0x002   /* islower */
#define ISUP 0x004   /* isupper */

#define ISDI 0x008   /* isdigit */
#define ISXD 0x010   /* isxdigit */

#define ISAN 0x020   /* isalnum */

#define ISPU 0x040   /* ispunct */
#define ISGR 0x080   /* isgraph */
#define ISPR 0x100   /* isprint */

#define ISSP 0x200   /* isspace */
#define ISBL 0x400   /* isblank */
#define ISCN 0x800   /* iscntrl */
761 
762 /* C/POSIX-style functions, in the same order as the bit flags */
763 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
764 
765 static const struct {
766     IsPOSIXClass *fn;
767     const char *name;
768 } posixClasses[]={
769     { u_isalpha, "isalpha" },
770     { u_islower, "islower" },
771     { u_isupper, "isupper" },
772     { u_isdigit, "isdigit" },
773     { u_isxdigit, "isxdigit" },
774     { u_isalnum, "isalnum" },
775     { u_ispunct, "ispunct" },
776     { u_isgraph, "isgraph" },
777     { u_isprint, "isprint" },
778     { u_isspace, "isspace" },
779     { u_isblank, "isblank" },
780     { u_iscntrl, "iscntrl" }
781 };
782 
783 static const struct {
784     UChar32 c;
785     uint32_t posixResults;
786 } posixData[]={
787     { 0x0008,                                                        ISCN },    /* backspace */
788     { 0x0009,                                              ISSP|ISBL|ISCN },    /* TAB */
789     { 0x000a,                                              ISSP|     ISCN },    /* LF */
790     { 0x000c,                                              ISSP|     ISCN },    /* FF */
791     { 0x000d,                                              ISSP|     ISCN },    /* CR */
792     { 0x0020,                                         ISPR|ISSP|ISBL      },    /* space */
793     { 0x0021,                               ISPU|ISGR|ISPR                },    /* ! */
794     { 0x0033,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* 3 */
795     { 0x0040,                               ISPU|ISGR|ISPR                },    /* @ */
796     { 0x0041, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* A */
797     { 0x007a, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* z */
798     { 0x007b,                               ISPU|ISGR|ISPR                },    /* { */
799     { 0x0085,                                              ISSP|     ISCN },    /* NEL */
800     { 0x00a0,                                         ISPR|ISSP|ISBL      },    /* NBSP */
801     { 0x00a4,                                    ISGR|ISPR                },    /* currency sign */
802     { 0x00e4, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* a-umlaut */
803     { 0x0300,                                    ISGR|ISPR                },    /* combining grave */
804     { 0x0600,                                                        ISCN },    /* arabic number sign */
805     { 0x0627, ISAL|                    ISAN|     ISGR|ISPR                },    /* alef */
806     { 0x0663,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* arabic 3 */
807     { 0x2002,                                         ISPR|ISSP|ISBL      },    /* en space */
808     { 0x2007,                                         ISPR|ISSP|ISBL      },    /* figure space */
809     { 0x2009,                                         ISPR|ISSP|ISBL      },    /* thin space */
810     { 0x200b,                                                        ISCN },    /* ZWSP */
811   /*{ 0x200b,                                         ISPR|ISSP           },*/    /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
812     { 0x200e,                                                        ISCN },    /* LRM */
813     { 0x2028,                                         ISPR|ISSP|     ISCN },    /* LS */
814     { 0x2029,                                         ISPR|ISSP|     ISCN },    /* PS */
815     { 0x20ac,                                    ISGR|ISPR                },    /* Euro */
816     { 0xff15,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth 5 */
817     { 0xff25, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth E */
818     { 0xff35, ISAL|     ISUP|          ISAN|     ISGR|ISPR                },    /* fullwidth U */
819     { 0xff45, ISAL|ISLO|          ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth e */
820     { 0xff55, ISAL|ISLO|               ISAN|     ISGR|ISPR                }     /* fullwidth u */
821 };
822 
823 static void
TestPOSIX()824 TestPOSIX() {
825     uint32_t mask;
826     int32_t cl, i;
827     UBool expect;
828 
829     mask=1;
830     for(cl=0; cl<12; ++cl) {
831         for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
832             expect=(UBool)((posixData[i].posixResults&mask)!=0);
833             if(posixClasses[cl].fn(posixData[i].c)!=expect) {
834                 log_err("u_%s(U+%04x)=%s is wrong\n",
835                     posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
836             }
837         }
838         mask<<=1;
839     }
840 }
841 
842 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
TestControlPrint()843 static void TestControlPrint()
844 {
845     const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
846     const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
847     const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
848     const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
849     UChar32 c;
850 
851     testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
852     testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
853 
854     testSampleCharProps(u_isprint, "u_isprint",
855                         samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
856     testSampleCharProps(u_isprint, "u_isprint",
857                         sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
858 
859     /* test all ISO 8 controls */
860     for(c=0; c<=0x9f; ++c) {
861         if(c==0x20) {
862             /* skip ASCII graphic characters and continue with DEL */
863             c=0x7f;
864         }
865         if(!u_iscntrl(c)) {
866             log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
867         }
868         if(!u_isISOControl(c)) {
869             log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
870         }
871         if(u_isprint(c)) {
872             log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
873         }
874     }
875 
876     /* test all Latin-1 graphic characters */
877     for(c=0x20; c<=0xff; ++c) {
878         if(c==0x7f) {
879             c=0xa0;
880         } else if(c==0xad) {
881             /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
882             ++c;
883         }
884         if(!u_isprint(c)) {
885             log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
886         }
887     }
888 }
889 
890 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
TestIdentifier()891 static void TestIdentifier()
892 {
893     const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
894     const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
895     const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
896     const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
897     const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
898     const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
899     const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
900     const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
901     const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
902     const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
903 
904     testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
905                         sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
906     testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
907                         sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
908 
909     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
910                         sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
911     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
912                         sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
913 
914     /* IDPart should imply IDStart */
915     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
916                         sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
917 
918     testSampleCharProps(u_isIDStart, "u_isIDStart",
919                         sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
920     testSampleCharProps(u_isIDStart, "u_isIDStart",
921                         sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
922 
923     testSampleCharProps(u_isIDPart, "u_isIDPart",
924                         sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
925     testSampleCharProps(u_isIDPart, "u_isIDPart",
926                         sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
927 
928     /* IDPart should imply IDStart */
929     testSampleCharProps(u_isIDPart, "u_isIDPart",
930                         sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
931 
932     testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
933                         sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
934     testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
935                         sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
936 }
937 
938 /* for each line of UnicodeData.txt, check some of the properties */
939 typedef struct UnicodeDataContext {
940 #if UCONFIG_NO_NORMALIZATION
941     const void *dummy;
942 #else
943     const UNormalizer2 *nfc;
944     const UNormalizer2 *nfkc;
945 #endif
946 } UnicodeDataContext;
947 
948 /*
949  * ### TODO
950  * This test fails incorrectly if the First or Last code point of a repetitive area
951  * is overridden, which is allowed and is encouraged for the PUAs.
952  * Currently, this means that both area First/Last and override lines are
953  * tested against the properties from the API,
954  * and the area boundary will not match and cause an error.
955  *
956  * This function should detect area boundaries and skip them for the test of individual
957  * code points' properties.
958  * Then it should check that the areas contain all the same properties except where overridden.
959  * For this, it would have had to set a flag for which code points were listed explicitly.
960  */
961 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)962 unicodeDataLineFn(void *context,
963                   char *fields[][2], int32_t fieldCount,
964                   UErrorCode *pErrorCode)
965 {
966     (void)fieldCount; // suppress compiler warnings about unused variable
967     char buffer[100];
968     const char *d;
969     char *end;
970     uint32_t value;
971     UChar32 c;
972     int32_t i;
973     int8_t type;
974     int32_t dt;
975     UChar dm[32], s[32];
976     int32_t dmLength, length;
977 
978 #if !UCONFIG_NO_NORMALIZATION
979     const UNormalizer2 *nfc, *nfkc;
980 #endif
981 
982     /* get the character code, field 0 */
983     c=strtoul(fields[0][0], &end, 16);
984     if(end<=fields[0][0] || end!=fields[0][1]) {
985         log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
986         return;
987     }
988     if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
989         log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
990         return;
991     }
992 
993     /* get general category, field 2 */
994     *fields[2][1]=0;
995     type = (int8_t)tagValues[MakeProp(fields[2][0])];
996     if(u_charType(c)!=type) {
997         log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
998     }
999     if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1000         log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1001     }
1002 
1003     /* get canonical combining class, field 3 */
1004     value=strtoul(fields[3][0], &end, 10);
1005     if(end<=fields[3][0] || end!=fields[3][1]) {
1006         log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1007         return;
1008     }
1009     if(value>255) {
1010         log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1011         return;
1012     }
1013 #if !UCONFIG_NO_NORMALIZATION
1014     if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1015         log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1016     }
1017     nfkc=((UnicodeDataContext *)context)->nfkc;
1018     if(value!=unorm2_getCombiningClass(nfkc, c)) {
1019         log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1020     }
1021 #endif
1022 
1023     /* get BiDi category, field 4 */
1024     *fields[4][1]=0;
1025     i=MakeDir(fields[4][0]);
1026     if(i!=(int32_t)u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1027         log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1028     }
1029 
1030     /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1031     d=NULL;
1032     if(fields[5][0]==fields[5][1]) {
1033         /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1034         if(c==0xac00 || c==0xd7a3) {
1035             dt=U_DT_CANONICAL;
1036         } else {
1037             dt=U_DT_NONE;
1038         }
1039     } else {
1040         d=fields[5][0];
1041         *fields[5][1]=0;
1042         dt=UCHAR_INVALID_CODE;
1043         if(*d=='<') {
1044             end=strchr(++d, '>');
1045             if(end!=NULL) {
1046                 *end=0;
1047                 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1048                 d=u_skipWhitespace(end+1);
1049             }
1050         } else {
1051             dt=U_DT_CANONICAL;
1052         }
1053     }
1054     if(dt>U_DT_NONE) {
1055         if(c==0xac00) {
1056             dm[0]=0x1100;
1057             dm[1]=0x1161;
1058             dm[2]=0;
1059             dmLength=2;
1060         } else if(c==0xd7a3) {
1061             dm[0]=0xd788;
1062             dm[1]=0x11c2;
1063             dm[2]=0;
1064             dmLength=2;
1065         } else {
1066             dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1067         }
1068     } else {
1069         dmLength=-1;
1070     }
1071     if(dt<0 || U_FAILURE(*pErrorCode)) {
1072         log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1073         return;
1074     }
1075 #if !UCONFIG_NO_NORMALIZATION
1076     i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1077     if(i!=dt) {
1078         log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1079     }
1080     /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1081     length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1082     if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1083         log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1084                 "or the Decomposition_Mapping is different (%s)\n",
1085                 c, length, dmLength, u_errorName(*pErrorCode));
1086         return;
1087     }
1088     /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1089     if(dt!=U_DT_CANONICAL) {
1090         dmLength=-1;
1091     }
1092     nfc=((UnicodeDataContext *)context)->nfc;
1093     length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1094     if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1095         log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1096                 "or the Decomposition_Mapping is different (%s)\n",
1097                 c, length, dmLength, u_errorName(*pErrorCode));
1098         return;
1099     }
1100     /* recompose */
1101     if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1102         UChar32 a, b, composite;
1103         i=0;
1104         U16_NEXT(dm, i, dmLength, a);
1105         U16_NEXT(dm, i, dmLength, b);
1106         /* i==dmLength */
1107         composite=unorm2_composePair(nfc, a, b);
1108         if(composite!=c) {
1109             log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1110                     (long)c, (long)a, (long)b, (long)composite);
1111         }
1112         /*
1113          * Note: NFKC has fewer round-trip mappings than NFC,
1114          * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1115          */
1116     }
1117 #endif
1118 
1119     /* get ISO Comment, field 11 */
1120     *fields[11][1]=0;
1121     i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1122     if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1123         log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1124             c, u_errorName(*pErrorCode),
1125             U_FAILURE(*pErrorCode) ? buffer : "[error]",
1126             fields[11][0]);
1127     }
1128 
1129     /* get uppercase mapping, field 12 */
1130     if(fields[12][0]!=fields[12][1]) {
1131         value=strtoul(fields[12][0], &end, 16);
1132         if(end!=fields[12][1]) {
1133             log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1134             return;
1135         }
1136         if((UChar32)value!=u_toupper(c)) {
1137             log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1138         }
1139     } else {
1140         /* no case mapping: the API must map the code point to itself */
1141         if(c!=u_toupper(c)) {
1142             log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1143         }
1144     }
1145 
1146     /* get lowercase mapping, field 13 */
1147     if(fields[13][0]!=fields[13][1]) {
1148         value=strtoul(fields[13][0], &end, 16);
1149         if(end!=fields[13][1]) {
1150             log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1151             return;
1152         }
1153         if((UChar32)value!=u_tolower(c)) {
1154             log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1155         }
1156     } else {
1157         /* no case mapping: the API must map the code point to itself */
1158         if(c!=u_tolower(c)) {
1159             log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1160         }
1161     }
1162 
1163     /* get titlecase mapping, field 14 */
1164     if(fields[14][0]!=fields[14][1]) {
1165         value=strtoul(fields[14][0], &end, 16);
1166         if(end!=fields[14][1]) {
1167             log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1168             return;
1169         }
1170         if((UChar32)value!=u_totitle(c)) {
1171             log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1172         }
1173     } else {
1174         /* no case mapping: the API must map the code point to itself */
1175         if(c!=u_totitle(c)) {
1176             log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1177         }
1178     }
1179 }
1180 
1181 static UBool U_CALLCONV
enumTypeRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1182 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1183     static const UChar32 test[][2]={
1184         {0x41, U_UPPERCASE_LETTER},
1185         {0x308, U_NON_SPACING_MARK},
1186         {0xfffe, U_GENERAL_OTHER_TYPES},
1187         {0xe0041, U_FORMAT_CHAR},
1188         {0xeffff, U_UNASSIGNED}
1189     };
1190 
1191     int32_t i, count;
1192 
1193     if(0!=strcmp((const char *)context, "a1")) {
1194         log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1195         return FALSE;
1196     }
1197 
1198     count=UPRV_LENGTHOF(test);
1199     for(i=0; i<count; ++i) {
1200         if(start<=test[i][0] && test[i][0]<limit) {
1201             if(type!=(UCharCategory)test[i][1]) {
1202                 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1203                         start, limit, (long)type, test[i][0], test[i][1]);
1204             }
1205             /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1206             return i==(count-1) ? FALSE : TRUE;
1207         }
1208     }
1209 
1210     if(start>test[count-1][0]) {
1211         log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1212                 start, limit, (long)type);
1213         return FALSE;
1214     }
1215 
1216     return TRUE;
1217 }
1218 
1219 static UBool U_CALLCONV
enumDefaultsRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1220 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1221     (void)context; // suppress compiler warnings about unused variable
1222 
1223     /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1224     static const int32_t defaultBidi[][2]={ /* { limit, class } */
1225         { 0x0590, U_LEFT_TO_RIGHT },
1226         { 0x0600, U_RIGHT_TO_LEFT },
1227         { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1228         { 0x0860, U_RIGHT_TO_LEFT },
1229         { 0x0870, U_RIGHT_TO_LEFT_ARABIC },  // Unicode 10 changes U+0860..U+086F from R to AL.
1230         { 0x08A0, U_RIGHT_TO_LEFT },
1231         { 0x0900, U_RIGHT_TO_LEFT_ARABIC },  /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1232         { 0x20A0, U_LEFT_TO_RIGHT },
1233         { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR },  /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1234         { 0xFB1D, U_LEFT_TO_RIGHT },
1235         { 0xFB50, U_RIGHT_TO_LEFT },
1236         { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1237         { 0xFE70, U_LEFT_TO_RIGHT },
1238         { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1239 
1240         { 0x10800, U_LEFT_TO_RIGHT },
1241         { 0x10D00, U_RIGHT_TO_LEFT },  // Unicode 11 changes U+10D00..U+10D3F from R to AL.
1242         { 0x10D40, U_RIGHT_TO_LEFT_ARABIC },
1243         { 0x10F30, U_RIGHT_TO_LEFT },  // Unicode 11 changes U+10F30..U+10F6F from R to AL.
1244         { 0x10F70, U_RIGHT_TO_LEFT_ARABIC },
1245         { 0x11000, U_RIGHT_TO_LEFT },
1246 
1247         { 0x1E800, U_LEFT_TO_RIGHT },  /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1248         { 0x1EC70, U_RIGHT_TO_LEFT },  // Unicode 11 changes U+1EC70..U+1ECBF from R to AL.
1249         { 0x1ECC0, U_RIGHT_TO_LEFT_ARABIC },
1250         { 0x1ED00, U_RIGHT_TO_LEFT },  // Unicode 12 changes U+1ED00..U+1ED4F from R to AL.
1251         { 0x1ED50, U_RIGHT_TO_LEFT_ARABIC },
1252         { 0x1EE00, U_RIGHT_TO_LEFT },
1253         { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC },  /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1254         { 0x1F000, U_RIGHT_TO_LEFT },
1255         { 0x110000, U_LEFT_TO_RIGHT }
1256     };
1257 
1258     UChar32 c;
1259     int32_t i;
1260     UCharDirection shouldBeDir;
1261 
1262     /*
1263      * LineBreak.txt specifies:
1264      *   #  - Assigned characters that are not listed explicitly are given the value
1265      *   #    "AL".
1266      *   #  - Unassigned characters are given the value "XX".
1267      *
1268      * PUA characters are listed explicitly with "XX".
1269      * Verify that no assigned character has "XX".
1270      */
1271     if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1272         c=start;
1273         while(c<limit) {
1274             if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1275                 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1276             }
1277             ++c;
1278         }
1279     }
1280 
1281     /*
1282      * Verify default Bidi classes.
1283      * See DerivedBidiClass.txt, especially for unassigned code points.
1284      */
1285     if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1286         /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1287         c=start;
1288         for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1289             if((int32_t)c<defaultBidi[i][0]) {
1290                 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1291                     if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1292                         shouldBeDir=U_BOUNDARY_NEUTRAL;
1293                     } else {
1294                         shouldBeDir=(UCharDirection)defaultBidi[i][1];
1295                     }
1296 
1297                     if( u_charDirection(c)!=shouldBeDir ||
1298                         (UCharDirection)u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1299                     ) {
1300                         log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1301                             c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1302                     }
1303                     ++c;
1304                 }
1305             }
1306         }
1307     }
1308 
1309     return TRUE;
1310 }
1311 
1312 /* tests for several properties */
TestUnicodeData()1313 static void TestUnicodeData()
1314 {
1315     UVersionInfo expectVersionArray;
1316     UVersionInfo versionArray;
1317     char *fields[15][2];
1318     UErrorCode errorCode;
1319     UChar32 c;
1320     int8_t type;
1321 
1322     UnicodeDataContext context;
1323 
1324     u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1325     u_getUnicodeVersion(versionArray);
1326     if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1327     {
1328         log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1329         versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1330     }
1331 
1332 #if defined(ICU_UNICODE_VERSION)
1333     /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1334     if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1335     {
1336          log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1337     }
1338 #endif
1339 
1340     if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1341         log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1342     }
1343 
1344     errorCode=U_ZERO_ERROR;
1345 #if !UCONFIG_NO_NORMALIZATION
1346     context.nfc=unorm2_getNFCInstance(&errorCode);
1347     context.nfkc=unorm2_getNFKCInstance(&errorCode);
1348     if(U_FAILURE(errorCode)) {
1349         log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1350         return;
1351     }
1352 #endif
1353     parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1354     if(U_FAILURE(errorCode)) {
1355         return; /* if we couldn't parse UnicodeData.txt, we should return */
1356     }
1357 
1358     /* sanity check on repeated properties */
1359     for(c=0xfffe; c<=0x10ffff;) {
1360         type=u_charType(c);
1361         if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1362             log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1363         }
1364         if(type!=U_UNASSIGNED) {
1365             log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1366         }
1367         if((c&0xffff)==0xfffe) {
1368             ++c;
1369         } else {
1370             c+=0xffff;
1371         }
1372     }
1373 
1374     /* test that PUA is not "unassigned" */
1375     for(c=0xe000; c<=0x10fffd;) {
1376         type=u_charType(c);
1377         if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1378             log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1379         }
1380         if(type==U_UNASSIGNED) {
1381             log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1382         } else if(type!=U_PRIVATE_USE_CHAR) {
1383             log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1384         }
1385         if(c==0xf8ff) {
1386             c=0xf0000;
1387         } else if(c==0xffffd) {
1388             c=0x100000;
1389         } else {
1390             ++c;
1391         }
1392     }
1393 
1394     /* test u_enumCharTypes() */
1395     u_enumCharTypes(enumTypeRange, "a1");
1396 
1397     /* check default properties */
1398     u_enumCharTypes(enumDefaultsRange, NULL);
1399 }
1400 
TestCodeUnit()1401 static void TestCodeUnit(){
1402     const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1403 
1404     int32_t i;
1405 
1406     for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1407         UChar c=codeunit[i];
1408         if(i<4){
1409             if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1410                     U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1411                 log_err("ERROR: U+%04x is a single", c);
1412             }
1413 
1414         }
1415         if(i >= 4 && i< 8){
1416             if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1417                     !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1418                 log_err("ERROR: U+%04x is a first surrogate", c);
1419             }
1420         }
1421         if(i >= 8 && i< 12){
1422             if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1423                     !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1424                 log_err("ERROR: U+%04x is a second surrogate", c);
1425             }
1426         }
1427 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1428         if(i<4){
1429             if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1430                 log_err("ERROR: U+%04x is a single", c);
1431             }
1432 
1433         }
1434         if(i >= 4 && i< 8){
1435             if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1436                 log_err("ERROR: U+%04x is a first surrogate", c);
1437             }
1438         }
1439         if(i >= 8 && i< 12){
1440             if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1441                 log_err("ERROR: U+%04x is a second surrogate", c);
1442             }
1443         }
1444 #endif
1445     }
1446 }
1447 
TestCodePoint()1448 static void TestCodePoint(){
1449     const UChar32 codePoint[]={
1450         /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1451         0xd800,
1452         0xdbff,
1453         0xdc00,
1454         0xdfff,
1455         0xdc04,
1456         0xd821,
1457         /*not a surrogate, valid, isUnicodeChar , not Error*/
1458         0x20ac,
1459         0xd7ff,
1460         0xe000,
1461         0xe123,
1462         0x0061,
1463         0xe065,
1464         0x20402,
1465         0x24506,
1466         0x23456,
1467         0x20402,
1468         0x10402,
1469         0x23456,
1470         /*not a surrogate, not valid, isUnicodeChar, isError */
1471         0x0015,
1472         0x009f,
1473         /*not a surrogate, not valid, not isUnicodeChar, isError */
1474         0xffff,
1475         0xfffe,
1476     };
1477     int32_t i;
1478     for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
1479         UChar32 c=codePoint[i];
1480         if(i<6) {
1481             if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1482                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1483             }
1484             if(U_IS_UNICODE_CHAR(c)) {
1485                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1486             }
1487         } else if(i >=6 && i<18) {
1488             if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1489                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1490             }
1491             if(!U_IS_UNICODE_CHAR(c)) {
1492                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1493             }
1494         } else if(i >=18 && i<20) {
1495             if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1496                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1497             }
1498             if(!U_IS_UNICODE_CHAR(c)) {
1499                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1500             }
1501         } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1502             if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1503                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1504             }
1505             if(U_IS_UNICODE_CHAR(c)) {
1506                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1507             }
1508         }
1509 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1510         if(i<6){
1511             if(!UTF_IS_SURROGATE(c)){
1512                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1513             }
1514             if(UTF_IS_VALID(c)){
1515                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1516             }
1517             if(UTF_IS_UNICODE_CHAR(c)){
1518                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1519             }
1520             if(UTF_IS_ERROR(c)){
1521                 log_err("ERROR: isError() failed for U+%04x\n", c);
1522             }
1523         }else if(i >=6 && i<18){
1524             if(UTF_IS_SURROGATE(c)){
1525                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1526             }
1527             if(!UTF_IS_VALID(c)){
1528                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1529             }
1530             if(!UTF_IS_UNICODE_CHAR(c)){
1531                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1532             }
1533             if(UTF_IS_ERROR(c)){
1534                 log_err("ERROR: isError() failed for U+%04x\n", c);
1535             }
1536         }else if(i >=18 && i<20){
1537             if(UTF_IS_SURROGATE(c)){
1538                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1539             }
1540             if(UTF_IS_VALID(c)){
1541                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1542             }
1543             if(!UTF_IS_UNICODE_CHAR(c)){
1544                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1545             }
1546             if(!UTF_IS_ERROR(c)){
1547                 log_err("ERROR: isError() failed for U+%04x\n", c);
1548             }
1549         }
1550         else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1551             if(UTF_IS_SURROGATE(c)){
1552                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1553             }
1554             if(UTF_IS_VALID(c)){
1555                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1556             }
1557             if(UTF_IS_UNICODE_CHAR(c)){
1558                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1559             }
1560             if(!UTF_IS_ERROR(c)){
1561                 log_err("ERROR: isError() failed for U+%04x\n", c);
1562             }
1563         }
1564 #endif
1565     }
1566 
1567     if(
1568         !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1569         !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1570         U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1571         U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1572     ) {
1573         log_err("error with U_IS_BMP()\n");
1574     }
1575 
1576     if(
1577         U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1578         U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1579         U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1580         !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1581     ) {
1582         log_err("error with U_IS_SUPPLEMENTARY()\n");
1583     }
1584 }
1585 
TestCharLength()1586 static void TestCharLength()
1587 {
1588     const int32_t codepoint[]={
1589         1, 0x0061,
1590         1, 0xe065,
1591         1, 0x20ac,
1592         2, 0x20402,
1593         2, 0x23456,
1594         2, 0x24506,
1595         2, 0x20402,
1596         2, 0x10402,
1597         1, 0xd7ff,
1598         1, 0xe000
1599     };
1600 
1601     int32_t i;
1602 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1603     UBool multiple;
1604 #endif
1605     for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1606         UChar32 c=codepoint[i+1];
1607         if(
1608 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1609                 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1610 #endif
1611                 U16_LENGTH(c) != codepoint[i]) {
1612             log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1613         }
1614 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1615         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1616         if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1617             log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1618         }
1619 #endif
1620     }
1621 }
1622 
1623 /*internal functions ----*/
MakeProp(char * str)1624 static int32_t MakeProp(char* str)
1625 {
1626     int32_t result = 0;
1627     char* matchPosition =0;
1628 
1629     matchPosition = strstr(tagStrings, str);
1630     if (matchPosition == 0)
1631     {
1632         log_err("unrecognized type letter ");
1633         log_err(str);
1634     }
1635     else
1636         result = (int32_t)((matchPosition - tagStrings) / 2);
1637     return result;
1638 }
1639 
MakeDir(char * str)1640 static int32_t MakeDir(char* str)
1641 {
1642     int32_t pos = 0;
1643     for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1644         if (strcmp(str, dirStrings[pos]) == 0) {
1645             return pos;
1646         }
1647     }
1648     return -1;
1649 }
1650 
1651 /* test u_charName() -------------------------------------------------------- */
1652 
1653 static const struct {
1654     uint32_t code;
1655     const char *name, *oldName, *extName, *alias;
1656 } names[]={
1657     {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A", NULL},
1658     {0x01a2, "LATIN CAPITAL LETTER OI", "",
1659              "LATIN CAPITAL LETTER OI",
1660              "LATIN CAPITAL LETTER GHA"},
1661     {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
1662              "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", NULL},
1663     {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
1664              "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
1665              "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
1666     {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401", NULL},
1667     {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED", NULL},
1668     {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA", NULL},
1669     {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH", NULL},
1670     {0xd800, "", "", "<lead surrogate-D800>", NULL},
1671     {0xdc00, "", "", "<trail surrogate-DC00>", NULL},
1672     {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS", NULL},
1673     {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN", NULL},
1674     {0xffff, "", "", "<noncharacter-FFFF>", NULL},
1675     {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
1676               "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
1677               "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
1678     {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456", NULL}
1679 };
1680 
1681 static UBool
enumCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1682 enumCharNamesFn(void *context,
1683                 UChar32 code, UCharNameChoice nameChoice,
1684                 const char *name, int32_t length) {
1685     int32_t *pCount=(int32_t *)context;
1686     const char *expected;
1687     int i;
1688 
1689     if(length<=0 || length!=(int32_t)strlen(name)) {
1690         /* should not be called with an empty string or invalid length */
1691         log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1692         return TRUE;
1693     }
1694 
1695     ++*pCount;
1696     for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1697         if(code==(UChar32)names[i].code) {
1698             switch (nameChoice) {
1699                 case U_EXTENDED_CHAR_NAME:
1700                     if(0!=strcmp(name, names[i].extName)) {
1701                         log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1702                     }
1703                     break;
1704                 case U_UNICODE_CHAR_NAME:
1705                     if(0!=strcmp(name, names[i].name)) {
1706                         log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1707                     }
1708                     break;
1709                 case U_UNICODE_10_CHAR_NAME:
1710                     expected=names[i].oldName;
1711                     if(expected[0]==0 || 0!=strcmp(name, expected)) {
1712                         log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1713                     }
1714                     break;
1715                 case U_CHAR_NAME_ALIAS:
1716                     expected=names[i].alias;
1717                     if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1718                         log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1719                     }
1720                     break;
1721                 case U_CHAR_NAME_CHOICE_COUNT:
1722                     break;
1723             }
1724             break;
1725         }
1726     }
1727     return TRUE;
1728 }
1729 
1730 struct enumExtCharNamesContext {
1731     uint32_t length;
1732     int32_t last;
1733 };
1734 
1735 static UBool
enumExtCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1736 enumExtCharNamesFn(void *context,
1737                 UChar32 code, UCharNameChoice nameChoice,
1738                 const char *name, int32_t length) {
1739     struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1740 
1741     if (ecncp->last != (int32_t) code - 1) {
1742         if (ecncp->last < 0) {
1743             log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1744         } else {
1745             log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1746         }
1747     }
1748     ecncp->last = (int32_t) code;
1749 
1750     if (!*name) {
1751         log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1752     }
1753 
1754     return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1755 }
1756 
1757 /**
1758  * This can be made more efficient by moving it into putil.c and having
1759  * it directly access the ebcdic translation tables.
1760  * TODO: If we get this method in putil.c, then delete it from here.
1761  */
1762 static UChar
u_charToUChar(char c)1763 u_charToUChar(char c) {
1764     UChar uc;
1765     u_charsToUChars(&c, &uc, 1);
1766     return uc;
1767 }
1768 
1769 static void
TestCharNames()1770 TestCharNames() {
1771     static char name[80];
1772     UErrorCode errorCode=U_ZERO_ERROR;
1773     struct enumExtCharNamesContext extContext;
1774     const char *expected;
1775     int32_t length;
1776     UChar32 c;
1777     int32_t i;
1778 
1779     log_verbose("Testing uprv_getMaxCharNameLength()\n");
1780     length=uprv_getMaxCharNameLength();
1781     if(length==0) {
1782         /* no names data available */
1783         return;
1784     }
1785     if(length<83) { /* Unicode 3.2 max char name length */
1786         log_err("uprv_getMaxCharNameLength()=%d is too short");
1787     }
1788     /* ### TODO same tests for max ISO comment length as for max name length */
1789 
1790     log_verbose("Testing u_charName()\n");
1791     for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1792         /* modern Unicode character name */
1793         length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1794         if(U_FAILURE(errorCode)) {
1795             log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1796             return;
1797         }
1798         if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1799             log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1800         }
1801 
1802         /* find the modern name */
1803         if (*names[i].name) {
1804             c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1805             if(U_FAILURE(errorCode)) {
1806                 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1807                 return;
1808             }
1809             if(c!=(UChar32)names[i].code) {
1810                 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1811             }
1812         }
1813 
1814         /* Unicode 1.0 character name */
1815         length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1816         if(U_FAILURE(errorCode)) {
1817             log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1818             return;
1819         }
1820         if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1821             log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1822         }
1823 
1824         /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1825         if(names[i].oldName[0]!=0 /* && length>0 */) {
1826             c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1827             if(U_FAILURE(errorCode)) {
1828                 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1829                 return;
1830             }
1831             if(c!=(UChar32)names[i].code) {
1832                 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1833             }
1834         }
1835 
1836         /* Unicode character name alias */
1837         length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1838         if(U_FAILURE(errorCode)) {
1839             log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1840             return;
1841         }
1842         expected=names[i].alias;
1843         if(expected==NULL) {
1844             expected="";
1845         }
1846         if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1847             log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1848                     names[i].code, name, length, expected);
1849         }
1850 
1851         /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1852         if(expected[0]!=0 /* && length>0 */) {
1853             c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1854             if(U_FAILURE(errorCode)) {
1855                 log_err("u_charFromName(%s - alias) error %s\n",
1856                         expected, u_errorName(errorCode));
1857                 return;
1858             }
1859             if(c!=(UChar32)names[i].code) {
1860                 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1861                         expected, c, names[i].code);
1862             }
1863         }
1864     }
1865 
1866     /* test u_enumCharNames() */
1867     length=0;
1868     errorCode=U_ZERO_ERROR;
1869     u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1870     if(U_FAILURE(errorCode) || length<94140) {
1871         log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1872     }
1873 
1874     extContext.length = 0;
1875     extContext.last = -1;
1876     errorCode=U_ZERO_ERROR;
1877     u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1878     if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1879         log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1880     }
1881 
1882     /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1883     if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1884         log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1885     }
1886 
1887     /* Test getCharNameCharacters */
1888     if(!getTestOption(QUICK_OPTION)) {
1889         enum { BUFSIZE = 256 };
1890         UErrorCode ec = U_ZERO_ERROR;
1891         char buf[BUFSIZE];
1892         int32_t maxLength;
1893         UChar32 cp;
1894         UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1895         int32_t l1, l2;
1896         UBool map[256];
1897         UBool ok;
1898 
1899         USet* set = uset_open(1, 0); /* empty set */
1900         USet* dumb = uset_open(1, 0); /* empty set */
1901 
1902         /*
1903          * uprv_getCharNameCharacters() will likely return more lowercase
1904          * letters than actual character names contain because
1905          * it includes all the characters in lowercased names of
1906          * general categories, for the full possible set of extended names.
1907          */
1908         {
1909             USetAdder sa={
1910                 NULL,
1911                 uset_add,
1912                 uset_addRange,
1913                 uset_addString,
1914                 NULL, /* don't need remove() */
1915                 NULL  /* don't need removeRange() */
1916             };
1917             sa.set=set;
1918             uprv_getCharNameCharacters(&sa);
1919         }
1920 
1921         /* build set the dumb (but sure-fire) way */
1922         for (i=0; i<256; ++i) {
1923             map[i] = FALSE;
1924         }
1925 
1926         maxLength=0;
1927         for (cp=0; cp<0x110000; ++cp) {
1928             int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1929                                      buf, BUFSIZE, &ec);
1930             if (U_FAILURE(ec)) {
1931                 log_err("FAIL: u_charName failed when it shouldn't\n");
1932                 uset_close(set);
1933                 uset_close(dumb);
1934                 return;
1935             }
1936             if(len>maxLength) {
1937                 maxLength=len;
1938             }
1939 
1940             for (i=0; i<len; ++i) {
1941                 if (!map[(uint8_t) buf[i]]) {
1942                     uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1943                     map[(uint8_t) buf[i]] = TRUE;
1944                 }
1945             }
1946 
1947             /* test for leading/trailing whitespace */
1948             if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1949                 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1950             }
1951         }
1952 
1953         if(map[(uint8_t)'\t']) {
1954             log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1955         }
1956 
1957         length=uprv_getMaxCharNameLength();
1958         if(length!=maxLength) {
1959             log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1960                     length, maxLength);
1961         }
1962 
1963         /* compare the sets.  Where is my uset_equals?!! */
1964         ok=TRUE;
1965         for(i=0; i<256; ++i) {
1966             if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1967                 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1968                     /* ignore lowercase a-z that are in set but not in dumb */
1969                     ok=TRUE;
1970                 } else {
1971                     ok=FALSE;
1972                     break;
1973                 }
1974             }
1975         }
1976 
1977         l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1978         l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1979         if (U_FAILURE(ec)) {
1980             log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1981             uset_close(set);
1982             uset_close(dumb);
1983             return;
1984         }
1985 
1986         if (l1 >= BUFSIZE) {
1987             l1 = BUFSIZE-1;
1988             pat[l1] = 0;
1989         }
1990         if (l2 >= BUFSIZE) {
1991             l2 = BUFSIZE-1;
1992             dumbPat[l2] = 0;
1993         }
1994 
1995         if (!ok) {
1996             log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1997                     aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1998         } else if(getTestOption(VERBOSITY_OPTION)) {
1999             log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
2000         }
2001 
2002         uset_close(set);
2003         uset_close(dumb);
2004     }
2005 
2006     /* ### TODO: test error cases and other interesting things */
2007 }
2008 
2009 static void
TestUCharFromNameUnderflow()2010 TestUCharFromNameUnderflow() {
2011     // Ticket #10889: Underflow crash when there is no dash.
2012     const char *name="<NO BREAK SPACE>";
2013     UErrorCode errorCode=U_ZERO_ERROR;
2014     UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2015     if(U_SUCCESS(errorCode)) {
2016         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2017                 name, c, u_errorName(errorCode));
2018     }
2019 
2020     // Test related edge cases.
2021     name="<-00a0>";
2022     errorCode=U_ZERO_ERROR;
2023     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2024     if(U_SUCCESS(errorCode)) {
2025         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2026                 name, c, u_errorName(errorCode));
2027     }
2028 
2029     errorCode=U_ZERO_ERROR;
2030     name="<control->";
2031     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2032     if(U_SUCCESS(errorCode)) {
2033         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2034                 name, c, u_errorName(errorCode));
2035     }
2036 
2037     errorCode=U_ZERO_ERROR;
2038     name="<control-111111>";
2039     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2040     if(U_SUCCESS(errorCode)) {
2041         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2042                 name, c, u_errorName(errorCode));
2043     }
2044 
2045     // ICU-20292: integer overflow
2046     errorCode=U_ZERO_ERROR;
2047     name="<noncharacter-10010FFFF>";
2048     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2049     if(U_SUCCESS(errorCode)) {
2050         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2051                 name, c, u_errorName(errorCode));
2052     }
2053 
2054     errorCode=U_ZERO_ERROR;
2055     name="<noncharacter-00010FFFF>";  // too many digits even if only leading 0s
2056     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2057     if(U_SUCCESS(errorCode)) {
2058         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2059                 name, c, u_errorName(errorCode));
2060     }
2061 
2062     errorCode=U_ZERO_ERROR;
2063     name="<noncharacter-fFFf>>";
2064     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2065     if(U_SUCCESS(errorCode)) {
2066         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2067                 name, c, u_errorName(errorCode));
2068     }
2069 }
2070 
2071 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
2072 
2073 static void
TestMirroring()2074 TestMirroring() {
2075     USet *set;
2076     UErrorCode errorCode;
2077 
2078     UChar32 start, end, c2, c3;
2079     int32_t i;
2080 
2081     U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2082 
2083     U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2084 
2085     log_verbose("Testing u_isMirrored()\n");
2086     if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2087          !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2088         )
2089     ) {
2090         log_err("u_isMirrored() does not work correctly\n");
2091     }
2092 
2093     log_verbose("Testing u_charMirror()\n");
2094     if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
2095          u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
2096          u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2097          /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2098          u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2099          )
2100     ) {
2101         log_err("u_charMirror() does not work correctly\n");
2102     }
2103 
2104     /* verify that Bidi_Mirroring_Glyph roundtrips */
2105     errorCode=U_ZERO_ERROR;
2106     set=uset_openPattern(mirroredPattern, 17, &errorCode);
2107 
2108     if (U_FAILURE(errorCode)) {
2109         log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2110     } else {
2111         for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2112             do {
2113                 c2=u_charMirror(start);
2114                 c3=u_charMirror(c2);
2115                 if(c3!=start) {
2116                     log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2117                 }
2118                 c3=u_getBidiPairedBracket(start);
2119                 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2120                     if(c3!=start) {
2121                         log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2122                                 (long)start);
2123                     }
2124                 } else {
2125                     if(c3!=c2) {
2126                         log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2127                                 (long)start, (long)c2);
2128                     }
2129                 }
2130             } while(++start<=end);
2131         }
2132     }
2133 
2134     uset_close(set);
2135 }
2136 
2137 
2138 struct RunTestData
2139 {
2140     const char *runText;
2141     UScriptCode runCode;
2142 };
2143 
2144 typedef struct RunTestData RunTestData;
2145 
2146 static void
CheckScriptRuns(UScriptRun * scriptRun,int32_t * runStarts,const RunTestData * testData,int32_t nRuns,const char * prefix)2147 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2148                 const char *prefix)
2149 {
2150     int32_t run, runStart, runLimit;
2151     UScriptCode runCode;
2152 
2153     /* iterate over all the runs */
2154     run = 0;
2155     while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2156         if (runStart != runStarts[run]) {
2157             log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2158                 prefix, run, runStarts[run], runStart);
2159         }
2160 
2161         if (runLimit != runStarts[run + 1]) {
2162             log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2163                 prefix, run, runStarts[run + 1], runLimit);
2164         }
2165 
2166         if (runCode != testData[run].runCode) {
2167             log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2168                 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2169         }
2170 
2171         run += 1;
2172 
2173         /* stop when we've seen all the runs we expect to see */
2174         if (run >= nRuns) {
2175             break;
2176         }
2177     }
2178 
2179     /* Complain if we didn't see then number of runs we expected */
2180     if (run != nRuns) {
2181         log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2182     }
2183 }
2184 
2185 static void
TestUScriptRunAPI()2186 TestUScriptRunAPI()
2187 {
2188     static const RunTestData testData1[] = {
2189         {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2190         {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2191         {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2192         {"English (", USCRIPT_LATIN},
2193         {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2194         {") ", USCRIPT_LATIN},
2195         {"\\u6F22\\u5B75", USCRIPT_HAN},
2196         {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2197         {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2198         {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2199     };
2200 
2201     static const RunTestData testData2[] = {
2202        {"((((((((((abc))))))))))", USCRIPT_LATIN}
2203     };
2204 
2205     static const struct {
2206       const RunTestData *testData;
2207       int32_t nRuns;
2208     } testDataEntries[] = {
2209         {testData1, UPRV_LENGTHOF(testData1)},
2210         {testData2, UPRV_LENGTHOF(testData2)}
2211     };
2212 
2213     static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2214     int32_t testEntry;
2215 
2216     for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2217         UChar testString[1024];
2218         int32_t runStarts[256];
2219         int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2220         const RunTestData *testData = testDataEntries[testEntry].testData;
2221 
2222         int32_t run, stringLimit;
2223         UScriptRun *scriptRun = NULL;
2224         UErrorCode err;
2225 
2226         /*
2227          * Fill in the test string and the runStarts array.
2228          */
2229         stringLimit = 0;
2230         for (run = 0; run < nTestRuns; run += 1) {
2231             runStarts[run] = stringLimit;
2232             stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2233             /*stringLimit -= 1;*/
2234         }
2235 
2236         /* The limit of the last run */
2237         runStarts[nTestRuns] = stringLimit;
2238 
2239         /*
2240          * Make sure that calling uscript_OpenRun with a NULL text pointer
2241          * and a non-zero text length returns the correct error.
2242          */
2243         err = U_ZERO_ERROR;
2244         scriptRun = uscript_openRun(NULL, stringLimit, &err);
2245 
2246         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2247             log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2248         }
2249 
2250         if (scriptRun != NULL) {
2251             log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2252             uscript_closeRun(scriptRun);
2253         }
2254 
2255         /*
2256          * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2257          * and a zero text length returns the correct error.
2258          */
2259         err = U_ZERO_ERROR;
2260         scriptRun = uscript_openRun(testString, 0, &err);
2261 
2262         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2263             log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2264         }
2265 
2266         if (scriptRun != NULL) {
2267             log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2268             uscript_closeRun(scriptRun);
2269         }
2270 
2271         /*
2272          * Make sure that calling uscript_openRun with a NULL text pointer
2273          * and a zero text length doesn't return an error.
2274          */
2275         err = U_ZERO_ERROR;
2276         scriptRun = uscript_openRun(NULL, 0, &err);
2277 
2278         if (U_FAILURE(err)) {
2279             log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2280         }
2281 
2282         /* Make sure that the empty iterator doesn't find any runs */
2283         if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2284             log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2285         }
2286 
2287         /*
2288          * Make sure that calling uscript_setRunText with a NULL text pointer
2289          * and a non-zero text length returns the correct error.
2290          */
2291         err = U_ZERO_ERROR;
2292         uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2293 
2294         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2295             log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2296         }
2297 
2298         /*
2299          * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2300          * and a zero text length returns the correct error.
2301          */
2302         err = U_ZERO_ERROR;
2303         uscript_setRunText(scriptRun, testString, 0, &err);
2304 
2305         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2306             log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2307         }
2308 
2309         /*
2310          * Now call uscript_setRunText on the empty iterator
2311          * and make sure that it works.
2312          */
2313         err = U_ZERO_ERROR;
2314         uscript_setRunText(scriptRun, testString, stringLimit, &err);
2315 
2316         if (U_FAILURE(err)) {
2317             log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2318         } else {
2319             CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2320         }
2321 
2322         uscript_closeRun(scriptRun);
2323 
2324         /*
2325          * Now open an interator over the testString
2326          * using uscript_openRun and make sure that it works
2327          */
2328         scriptRun = uscript_openRun(testString, stringLimit, &err);
2329 
2330         if (U_FAILURE(err)) {
2331             log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2332         } else {
2333             CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2334         }
2335 
2336         /* Now reset the iterator, and make sure
2337          * that it still works.
2338          */
2339         uscript_resetRun(scriptRun);
2340 
2341         CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2342 
2343         /* Close the iterator */
2344         uscript_closeRun(scriptRun);
2345     }
2346 }
2347 
2348 /* test additional, non-core properties */
2349 static void
TestAdditionalProperties()2350 TestAdditionalProperties() {
2351     /* test data for u_charAge() */
2352     static const struct {
2353         UChar32 c;
2354         UVersionInfo version;
2355     } charAges[]={
2356         {0x41,    { 1, 1, 0, 0 }},
2357         {0xffff,  { 1, 1, 0, 0 }},
2358         {0x20ab,  { 2, 0, 0, 0 }},
2359         {0x2fffe, { 2, 0, 0, 0 }},
2360         {0x20ac,  { 2, 1, 0, 0 }},
2361         {0xfb1d,  { 3, 0, 0, 0 }},
2362         {0x3f4,   { 3, 1, 0, 0 }},
2363         {0x10300, { 3, 1, 0, 0 }},
2364         {0x220,   { 3, 2, 0, 0 }},
2365         {0xff60,  { 3, 2, 0, 0 }}
2366     };
2367 
2368     /* test data for u_hasBinaryProperty() */
2369     static const int32_t
2370     props[][3]={ /* code point, property, value */
2371         { 0x0627, UCHAR_ALPHABETIC, TRUE },
2372         { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2373         { 0x2028, UCHAR_ALPHABETIC, FALSE },
2374 
2375         { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2376         { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2377 
2378         { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2379         { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2380 
2381         { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2382         { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2383 
2384         /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2385         { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2386         { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2387         { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2388         { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2389 
2390         { 0x058a, UCHAR_DASH, TRUE },
2391         { 0x007e, UCHAR_DASH, FALSE },
2392 
2393         { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2394         { 0x3000, UCHAR_DIACRITIC, FALSE },
2395 
2396         { 0x0e46, UCHAR_EXTENDER, TRUE },
2397         { 0x0020, UCHAR_EXTENDER, FALSE },
2398 
2399 #if !UCONFIG_NO_NORMALIZATION
2400         { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2401         { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2402         { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2403 
2404         { 0x110a, UCHAR_NFD_INERT, TRUE },      /* Jamo L */
2405         { 0x0308, UCHAR_NFD_INERT, FALSE },
2406 
2407         { 0x1164, UCHAR_NFKD_INERT, TRUE },     /* Jamo V */
2408         { 0x1d79d, UCHAR_NFKD_INERT, FALSE },   /* math compat version of xi */
2409 
2410         { 0x0021, UCHAR_NFC_INERT, TRUE },      /* ! */
2411         { 0x0061, UCHAR_NFC_INERT, FALSE },     /* a */
2412         { 0x00e4, UCHAR_NFC_INERT, FALSE },     /* a-umlaut */
2413         { 0x0102, UCHAR_NFC_INERT, FALSE },     /* a-breve */
2414         { 0xac1c, UCHAR_NFC_INERT, FALSE },     /* Hangul LV */
2415         { 0xac1d, UCHAR_NFC_INERT, TRUE },      /* Hangul LVT */
2416 
2417         { 0x1d79d, UCHAR_NFKC_INERT, FALSE },   /* math compat version of xi */
2418         { 0x2a6d6, UCHAR_NFKC_INERT, TRUE },    /* Han, last of CJK ext. B */
2419 
2420         { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2421         { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2422         { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2423         { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2424         { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2425         { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2426 #endif
2427 
2428         { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2429         { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2430         { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2431 
2432         { 0x30fb, UCHAR_HYPHEN, TRUE },
2433         { 0xfe58, UCHAR_HYPHEN, FALSE },
2434 
2435         { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2436         { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2437         { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2438 
2439         { 0x2172, UCHAR_ID_START, TRUE },
2440         { 0x007a, UCHAR_ID_START, TRUE },
2441         { 0x0039, UCHAR_ID_START, FALSE },
2442 
2443         { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2444         { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2445         { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2446 
2447         { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2448         { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2449 
2450         { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2451         { 0x0345, UCHAR_LOWERCASE, TRUE },
2452         { 0x0030, UCHAR_LOWERCASE, FALSE },
2453 
2454         { 0x1d7a9, UCHAR_MATH, TRUE },
2455         { 0x2135, UCHAR_MATH, TRUE },
2456         { 0x0062, UCHAR_MATH, FALSE },
2457 
2458         { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2459         { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2460         { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2461 
2462         { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2463         { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2464         { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2465 
2466         { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2467         { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2468 
2469         { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2470         { 0x2162, UCHAR_UPPERCASE, TRUE },
2471         { 0x0345, UCHAR_UPPERCASE, FALSE },
2472 
2473         { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2474         { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2475         { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2476 
2477         { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2478         { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2479         { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2480 
2481         { 0x16ee, UCHAR_XID_START, TRUE },
2482         { 0x23456, UCHAR_XID_START, TRUE },
2483         { 0x1d1aa, UCHAR_XID_START, FALSE },
2484 
2485         /*
2486          * Version break:
2487          * The following properties are only supported starting with the
2488          * Unicode version indicated in the second field.
2489          */
2490         { -1, 0x320, 0 },
2491 
2492         { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2493         { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2494         { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2495 
2496         { 0x0149, UCHAR_DEPRECATED, TRUE },         /* changed in Unicode 5.2 */
2497         { 0x0341, UCHAR_DEPRECATED, FALSE },        /* changed in Unicode 5.2 */
2498         { 0xe0001, UCHAR_DEPRECATED, TRUE },        /* changed from Unicode 5 to 5.1 */
2499         { 0xe0100, UCHAR_DEPRECATED, FALSE },
2500 
2501         { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2502         { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2503         { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2504         { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE },     /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2505 
2506         { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2507         { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2508         { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE },    /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2509         { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2510 
2511         { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2512         { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2513 
2514         { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2515         { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2516 
2517         { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2518         { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2519 
2520         { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2521         { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2522 
2523         { 0x2e9b, UCHAR_RADICAL, TRUE },
2524         { 0x4e00, UCHAR_RADICAL, FALSE },
2525 
2526         { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2527         { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2528 
2529         { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2530         { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2531 
2532         { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2533 
2534         { 0x002e, UCHAR_S_TERM, TRUE },
2535         { 0x0061, UCHAR_S_TERM, FALSE },
2536 
2537         { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2538         { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2539         { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2540         { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2541 
2542         /* enum/integer type properties */
2543 
2544         /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2545         /* test default Bidi classes for unassigned code points */
2546         { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2547         { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2548         { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2549         { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2550         { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2551         { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2552         { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2553         { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2554         { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2555         { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2556         { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2557 
2558         { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2559         { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2560         { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2561         { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2562         { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2563         { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2564         { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2565 
2566         { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2567         { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2568         { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2569         { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2570         { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2571         { 0x0870, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2572         { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2573         { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2574         { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2575         { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2576         { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2577 
2578         /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2579         { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2580 
2581         { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2582         { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2583         { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2584         { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2585         { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2586         { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2587         { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2588         { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2589         { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2590 
2591         { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2592         { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2593         { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2594         { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2595         { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2596         { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2597         { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2598         { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2599         { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2600         { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2601         { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2602         { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2603         { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2604         { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2605         { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2606         { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2607         { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2608 
2609         /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2610         { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2611         { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER },     /* changed in Unicode 5.2 */
2612 
2613         { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2614         { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2615         { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2616         { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2617         { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2618 
2619         { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2620         { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2621         { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2622         { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2623         { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2624         { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2625         { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2626         { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2627 
2628         /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2629         { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2630         { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2631         { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2632         { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2633         { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2634         { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2635         { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2636         { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2637         { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2638         { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2639         { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2640         { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2641         { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2642         { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2643         { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2644 
2645         /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2646 
2647         /* UCHAR_SCRIPT tested in cucdapi.c TestUScriptCodeAPI() */
2648 
2649         { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2650         { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2651         { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2652         { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2653         { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2654         { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2655         { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2656 
2657         { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2658         { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2659         { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2660         { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2661 
2662         { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2663         { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2664         { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2665         { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2666         { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2667         { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2668 
2669         { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2670         { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2671         { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2672         { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2673 
2674         { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2675         { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2676         { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2677         { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2678         { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2679         { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2680         { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2681 
2682         { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2683         { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2684         { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2685         { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2686 
2687         { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2688         { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2689         { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2690         { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2691 
2692         { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2693         { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2694         { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2695         { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2696         { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2697 
2698         { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2699 
2700         { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2701 
2702         { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2703         { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2704         { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2705 
2706         { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2707         { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2708         { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2709         { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2710         { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2711 
2712         { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2713         { 0x2c8e,  UCHAR_BLOCK, UBLOCK_COPTIC },
2714         { 0xfe17,  UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2715 
2716         { 0x1a00,  UCHAR_SCRIPT, USCRIPT_BUGINESE },
2717         { 0x2cea,  UCHAR_SCRIPT, USCRIPT_COPTIC },
2718         { 0xa82b,  UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2719         { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2720 
2721         { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2722         { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2723         { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2724         { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2725         { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2726         { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2727 
2728         { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2729         { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2730         { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2731         { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2732 
2733         { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2734         { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2735         { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2736         { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2737 
2738         { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2739         { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2740         { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2741         { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2742 
2743         { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2744 
2745         /* unassigned code points in new default Bidi R blocks */
2746         { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2747         { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2748 
2749         /* test some script codes >127 */
2750         { 0xa6e6,  UCHAR_SCRIPT, USCRIPT_BAMUM },
2751         { 0xa4d0,  UCHAR_SCRIPT, USCRIPT_LISU },
2752         { 0x10a7f,  UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2753 
2754         { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2755 
2756         /* value changed in Unicode 6.0 */
2757         { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2758 
2759         { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2760 
2761         /* unassigned code points in new/changed default Bidi AL blocks */
2762         { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2763         { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2764 
2765         { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2766 
2767         /* unassigned code points in the currency symbols block now default to ET */
2768         { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2769         { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2770 
2771         /* new property in Unicode 6.3 */
2772         { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2773         { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2774         { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2775         { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2776         { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2777         { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2778 
2779         { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2780 
2781         /* new character range with Joining_Group values */
2782         { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2783         { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2784         { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2785         { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2786         { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2787 
2788         { -1, 0xa00, 0 },  // version break for Unicode 10
2789 
2790         { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2791         { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2792         { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2793         { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2794 
2795         { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2796         { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2797         { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2798 
2799         /* undefined UProperty values */
2800         { 0x61, 0x4a7, 0 },
2801         { 0x234bc, 0x15ed, 0 }
2802     };
2803 
2804     UVersionInfo version;
2805     UChar32 c;
2806     int32_t i, result, uVersion;
2807     UProperty which;
2808 
2809     /* what is our Unicode version? */
2810     u_getUnicodeVersion(version);
2811     uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2812 
2813     u_charAge(0x20, version);
2814     if(version[0]==0) {
2815         /* no additional properties available */
2816         log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2817         return;
2818     }
2819 
2820     /* test u_charAge() */
2821     for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2822         u_charAge(charAges[i].c, version);
2823         if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2824             log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2825                 charAges[i].c,
2826                 version[0], version[1], version[2], version[3],
2827                 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2828         }
2829     }
2830 
2831     if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2832         u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2833         u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 ||   /* j2478 */
2834         u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2835         u_getIntPropertyMinValue(0x2345)!=0
2836     ) {
2837         log_err("error: u_getIntPropertyMinValue() wrong\n");
2838     }
2839     if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2840         log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2841     }
2842     if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2843         log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2844     }
2845     if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2846         log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2847     }
2848     if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2849         log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2850     }
2851     if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2852         log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2853     }
2854     if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2855         log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2856     }
2857     if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2858         log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2859     }
2860     if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2861         log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2862     }
2863     if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2864         log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2865     }
2866     if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2867         log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2868     }
2869     if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2870         log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2871     }
2872     if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2873         log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2874     }
2875     if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2876         log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2877     }
2878     if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2879         log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2880     }
2881     /*JB#2410*/
2882     if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2883         log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2884     }
2885     if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2886         log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2887     }
2888     if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) !=  (int32_t) (U_JG_COUNT -1)) {
2889         log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2890     }
2891     if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2892         log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2893     }
2894     if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2895         log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2896     }
2897 
2898     /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2899     for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2900         const char *whichName;
2901 
2902         if(props[i][0]<0) {
2903             /* Unicode version break */
2904             if(uVersion<props[i][1]) {
2905                 break; /* do not test properties that are not yet supported */
2906             } else {
2907                 continue; /* skip this row */
2908             }
2909         }
2910 
2911         c=(UChar32)props[i][0];
2912         which=(UProperty)props[i][1];
2913         whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2914 
2915         if(which<UCHAR_INT_START) {
2916             result=u_hasBinaryProperty(c, which);
2917             if(result!=props[i][2]) {
2918                 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2919                         c, whichName, result, i);
2920             }
2921         }
2922 
2923         result=u_getIntPropertyValue(c, which);
2924         if(result!=props[i][2]) {
2925             log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2926                     c, whichName, result, props[i][2], i);
2927         }
2928 
2929         /* test separate functions, too */
2930         switch((UProperty)props[i][1]) {
2931         case UCHAR_ALPHABETIC:
2932             if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2933                 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2934                         props[i][0], result, i);
2935             }
2936             break;
2937         case UCHAR_LOWERCASE:
2938             if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2939                 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2940                         props[i][0], result, i);
2941             }
2942             break;
2943         case UCHAR_UPPERCASE:
2944             if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2945                 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2946                         props[i][0], result, i);
2947             }
2948             break;
2949         case UCHAR_WHITE_SPACE:
2950             if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2951                 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2952                         props[i][0], result, i);
2953             }
2954             break;
2955         default:
2956             break;
2957         }
2958     }
2959 }
2960 
2961 static void
TestNumericProperties(void)2962 TestNumericProperties(void) {
2963     /* see UnicodeData.txt, DerivedNumericValues.txt */
2964     static const struct {
2965         UChar32 c;
2966         int32_t type;
2967         double numValue;
2968     } values[]={
2969         { 0x0F33, U_NT_NUMERIC, -1./2. },
2970         { 0x0C66, U_NT_DECIMAL, 0 },
2971         { 0x96f6, U_NT_NUMERIC, 0 },
2972         { 0xa833, U_NT_NUMERIC, 1./16. },
2973         { 0x2152, U_NT_NUMERIC, 1./10. },
2974         { 0x2151, U_NT_NUMERIC, 1./9. },
2975         { 0x1245f, U_NT_NUMERIC, 1./8. },
2976         { 0x2150, U_NT_NUMERIC, 1./7. },
2977         { 0x2159, U_NT_NUMERIC, 1./6. },
2978         { 0x09f6, U_NT_NUMERIC, 3./16. },
2979         { 0x2155, U_NT_NUMERIC, 1./5. },
2980         { 0x00BD, U_NT_NUMERIC, 1./2. },
2981         { 0x0031, U_NT_DECIMAL, 1. },
2982         { 0x4e00, U_NT_NUMERIC, 1. },
2983         { 0x58f1, U_NT_NUMERIC, 1. },
2984         { 0x10320, U_NT_NUMERIC, 1. },
2985         { 0x0F2B, U_NT_NUMERIC, 3./2. },
2986         { 0x00B2, U_NT_DIGIT, 2. },
2987         { 0x5f10, U_NT_NUMERIC, 2. },
2988         { 0x1813, U_NT_DECIMAL, 3. },
2989         { 0x5f0e, U_NT_NUMERIC, 3. },
2990         { 0x2173, U_NT_NUMERIC, 4. },
2991         { 0x8086, U_NT_NUMERIC, 4. },
2992         { 0x278E, U_NT_DIGIT, 5. },
2993         { 0x1D7F2, U_NT_DECIMAL, 6. },
2994         { 0x247A, U_NT_DIGIT, 7. },
2995         { 0x7396, U_NT_NUMERIC, 9. },
2996         { 0x1372, U_NT_NUMERIC, 10. },
2997         { 0x216B, U_NT_NUMERIC, 12. },
2998         { 0x16EE, U_NT_NUMERIC, 17. },
2999         { 0x249A, U_NT_NUMERIC, 19. },
3000         { 0x303A, U_NT_NUMERIC, 30. },
3001         { 0x5345, U_NT_NUMERIC, 30. },
3002         { 0x32B2, U_NT_NUMERIC, 37. },
3003         { 0x1375, U_NT_NUMERIC, 40. },
3004         { 0x10323, U_NT_NUMERIC, 50. },
3005         { 0x0BF1, U_NT_NUMERIC, 100. },
3006         { 0x964c, U_NT_NUMERIC, 100. },
3007         { 0x217E, U_NT_NUMERIC, 500. },
3008         { 0x2180, U_NT_NUMERIC, 1000. },
3009         { 0x4edf, U_NT_NUMERIC, 1000. },
3010         { 0x2181, U_NT_NUMERIC, 5000. },
3011         { 0x137C, U_NT_NUMERIC, 10000. },
3012         { 0x4e07, U_NT_NUMERIC, 10000. },
3013         { 0x12432, U_NT_NUMERIC, 216000. },
3014         { 0x12433, U_NT_NUMERIC, 432000. },
3015         { 0x4ebf, U_NT_NUMERIC, 100000000. },
3016         { 0x5146, U_NT_NUMERIC, 1000000000000. },
3017         { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
3018         { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
3019         { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
3020         { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
3021         { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
3022         { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
3023         { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
3024         { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
3025     };
3026 
3027     double nv;
3028     UChar32 c;
3029     int32_t i, type;
3030 
3031     for(i=0; i<UPRV_LENGTHOF(values); ++i) {
3032         c=values[i].c;
3033         type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
3034         nv=u_getNumericValue(c);
3035 
3036         if(type!=values[i].type) {
3037             log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
3038         }
3039         if(0.000001 <= fabs(nv - values[i].numValue)) {
3040             log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
3041         }
3042     }
3043 }
3044 
3045 /**
3046  * Test the property names and property value names API.
3047  */
3048 static void
TestPropertyNames(void)3049 TestPropertyNames(void) {
3050     int32_t p, v, choice=0, rev;
3051     UBool atLeastSomething = FALSE;
3052 
3053     for (p=0; ; ++p) {
3054         UProperty propEnum = (UProperty)p;
3055         UBool sawProp = FALSE;
3056         if(p > 10 && !atLeastSomething) {
3057           log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3058           return;
3059         }
3060 
3061         for (choice=0; ; ++choice) {
3062             const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
3063             if (name) {
3064                 if (!sawProp)
3065                     log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
3066                 log_verbose("%d=\"%s\"", choice, name);
3067                 sawProp = TRUE;
3068                 atLeastSomething = TRUE;
3069 
3070                 /* test reverse mapping */
3071                 rev = u_getPropertyEnum(name);
3072                 if (rev != p) {
3073                     log_err("Property round-trip failure: %d -> %s -> %d\n",
3074                             p, name, rev);
3075                 }
3076             }
3077             if (!name && choice>0) break;
3078         }
3079         if (sawProp) {
3080             /* looks like a valid property; check the values */
3081             const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3082             int32_t max = 0;
3083             if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3084                 max = 255;
3085             } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3086                 /* it's far too slow to iterate all the way up to
3087                    the real max, U_GC_P_MASK */
3088                 max = U_GC_NL_MASK;
3089             } else if (p == UCHAR_BLOCK) {
3090                 /* UBlockCodes, unlike other values, start at 1 */
3091                 max = 1;
3092             }
3093             log_verbose("\n");
3094             for (v=-1; ; ++v) {
3095                 UBool sawValue = FALSE;
3096                 for (choice=0; ; ++choice) {
3097                     const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
3098                     if (vname) {
3099                         if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3100                         log_verbose("%d=\"%s\"", choice, vname);
3101                         sawValue = TRUE;
3102 
3103                         /* test reverse mapping */
3104                         rev = u_getPropertyValueEnum(propEnum, vname);
3105                         if (rev != v) {
3106                             log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3107                                     pname, v, vname, rev);
3108                         }
3109                     }
3110                     if (!vname && choice>0) break;
3111                 }
3112                 if (sawValue) {
3113                     log_verbose("\n");
3114                 }
3115                 if (!sawValue && v>=max) break;
3116             }
3117         }
3118         if (!sawProp) {
3119             if (p>=UCHAR_STRING_LIMIT) {
3120                 break;
3121             } else if (p>=UCHAR_DOUBLE_LIMIT) {
3122                 p = UCHAR_STRING_START - 1;
3123             } else if (p>=UCHAR_MASK_LIMIT) {
3124                 p = UCHAR_DOUBLE_START - 1;
3125             } else if (p>=UCHAR_INT_LIMIT) {
3126                 p = UCHAR_MASK_START - 1;
3127             } else if (p>=UCHAR_BINARY_LIMIT) {
3128                 p = UCHAR_INT_START - 1;
3129             }
3130         }
3131     }
3132 }
3133 
3134 /**
3135  * Test the property values API.  See JB#2410.
3136  */
3137 static void
TestPropertyValues(void)3138 TestPropertyValues(void) {
3139     int32_t i, p, min, max;
3140     UErrorCode ec;
3141 
3142     /* Min should be 0 for everything. */
3143     /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3144     for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3145         UProperty propEnum = (UProperty)p;
3146         min = u_getIntPropertyMinValue(propEnum);
3147         if (min != 0) {
3148             if (p == UCHAR_BLOCK) {
3149                 /* This is okay...for now.  See JB#2487.
3150                    TODO Update this for JB#2487. */
3151             } else {
3152                 const char* name;
3153                 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3154                 if (name == NULL)
3155                     name = "<ERROR>";
3156                 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3157                         name, min);
3158             }
3159         }
3160     }
3161 
3162     if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3163         u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3164         log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3165     }
3166 
3167     /* Max should be -1 for invalid properties. */
3168     max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3169     if (max != -1) {
3170         log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3171                 max);
3172     }
3173 
3174     /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3175     for (i=0; i<2; ++i) {
3176         int32_t script;
3177         const char* desc;
3178         ec = U_ZERO_ERROR;
3179         switch (i) {
3180         case 0:
3181             script = uscript_getScript(-1, &ec);
3182             desc = "uscript_getScript(-1)";
3183             break;
3184         case 1:
3185             script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3186             desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3187             break;
3188         default:
3189             log_err("Internal test error. Too many scripts\n");
3190             return;
3191         }
3192         /* We don't explicitly test ec.  It should be U_FAILURE but it
3193            isn't documented as such. */
3194         if (script != (int32_t)USCRIPT_INVALID_CODE) {
3195             log_err("FAIL: %s = %d, exp. 0\n",
3196                     desc, script);
3197         }
3198     }
3199 }
3200 
3201 /* various tests for consistency of UCD data and API behavior */
3202 static void
TestConsistency()3203 TestConsistency() {
3204     char buffer[300];
3205     USet *set1, *set2, *set3, *set4;
3206     UErrorCode errorCode;
3207 
3208     UChar32 start, end;
3209     int32_t i, length;
3210 
3211     U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3212     U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3213     U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3214     U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3215     U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3216 
3217     U_STRING_DECL(mathBlocksPattern,
3218         "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3219         214);
3220     U_STRING_DECL(mathPattern, "[:Math:]", 8);
3221     U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3222     U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3223     U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3224 
3225     U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3226     U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3227     U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3228     U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3229     U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3230 
3231     U_STRING_INIT(mathBlocksPattern,
3232         "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3233         214);
3234     U_STRING_INIT(mathPattern, "[:Math:]", 8);
3235     U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3236     U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3237     U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3238 
3239     /*
3240      * It used to be that UCD.html and its precursors said
3241      * "Those dashes used to mark connections between pieces of words,
3242      *  plus the Katakana middle dot."
3243      *
3244      * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3245      * but not from Hyphen.
3246      * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3247      * Therefore, do not show errors when testing the Hyphen property.
3248      */
3249     log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3250                 "known to the UTC and not considered errors.\n");
3251 
3252     errorCode=U_ZERO_ERROR;
3253     set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3254     set2=uset_openPattern(dashPattern, 8, &errorCode);
3255     if(U_SUCCESS(errorCode)) {
3256         /* remove the Katakana middle dot(s) from set1 */
3257         uset_remove(set1, 0x30fb);
3258         uset_remove(set1, 0xff65); /* halfwidth variant */
3259         showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3260     } else {
3261         log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3262     }
3263 
3264     /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3265     set3=uset_openPattern(formatPattern, 6, &errorCode);
3266     set4=uset_openPattern(alphaPattern, 14, &errorCode);
3267     if(U_SUCCESS(errorCode)) {
3268         showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3269         showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3270         showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3271     } else {
3272         log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3273     }
3274 
3275     uset_close(set1);
3276     uset_close(set2);
3277     uset_close(set3);
3278     uset_close(set4);
3279 
3280     /*
3281      * Check that each lowercase character has "small" in its name
3282      * and not "capital".
3283      * There are some such characters, some of which seem odd.
3284      * Use the verbose flag to see these notices.
3285      */
3286     errorCode=U_ZERO_ERROR;
3287     set1=uset_openPattern(lowerPattern, 13, &errorCode);
3288     if(U_SUCCESS(errorCode)) {
3289         for(i=0;; ++i) {
3290             length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3291             if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3292                 break; /* done */
3293             }
3294             if(U_FAILURE(errorCode)) {
3295                 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3296                         i, u_errorName(errorCode));
3297                 break;
3298             }
3299             if(length!=0) {
3300                 break; /* done with code points, got a string or -1 */
3301             }
3302 
3303             while(start<=end) {
3304                 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3305                 if(U_FAILURE(errorCode)) {
3306                     log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3307                     errorCode=U_ZERO_ERROR;
3308                 }
3309                 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3310                     strstr(buffer, "SMALL CAPITAL")==NULL
3311                 ) {
3312                     log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3313                 }
3314                 ++start;
3315             }
3316         }
3317     } else {
3318         log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3319     }
3320     uset_close(set1);
3321 
3322     /* verify that all assigned characters in Math blocks are exactly Math characters */
3323     errorCode=U_ZERO_ERROR;
3324     set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3325     set2=uset_openPattern(mathPattern, 8, &errorCode);
3326     set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3327     if(U_SUCCESS(errorCode)) {
3328         uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3329         uset_complement(set3);      /* assigned characters */
3330         uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3331         compareUSets(set1, set2,
3332                      "[assigned Math block chars]", "[math blocks]&[:Math:]",
3333                      TRUE);
3334     } else {
3335         log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3336     }
3337     uset_close(set1);
3338     uset_close(set2);
3339     uset_close(set3);
3340 
3341     /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3342     errorCode=U_ZERO_ERROR;
3343     set1=uset_openPattern(unknownPattern, 14, &errorCode);
3344     set2=uset_openPattern(reservedPattern, 20, &errorCode);
3345     if(U_SUCCESS(errorCode)) {
3346         compareUSets(set1, set2,
3347                      "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3348                      TRUE);
3349     } else {
3350         log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3351     }
3352     uset_close(set1);
3353     uset_close(set2);
3354 }
3355 
3356 /* test case folding, compare return values with CaseFolding.txt ------------ */
3357 
/*
 * Bit flags recording which kinds of case folding have already been tested
 * for a character; CF_ALL is the union of the individual flags.
 */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL=1<<1,
    CF_TURKIC=1<<2,
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3365 
3366 static void
testFold(UChar32 c,int which,UChar32 simple,UChar32 turkic,const UChar * full,int32_t fullLength,const UChar * turkicFull,int32_t turkicFullLength)3367 testFold(UChar32 c, int which,
3368          UChar32 simple, UChar32 turkic,
3369          const UChar *full, int32_t fullLength,
3370          const UChar *turkicFull, int32_t turkicFullLength) {
3371     UChar s[2], t[32];
3372     UChar32 c2;
3373     int32_t length, length2;
3374 
3375     UErrorCode errorCode=U_ZERO_ERROR;
3376 
3377     length=0;
3378     U16_APPEND_UNSAFE(s, length, c);
3379 
3380     if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3381         log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3382     }
3383     if((which&CF_FULL)!=0) {
3384         length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3385         if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3386             log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3387         }
3388     }
3389     if((which&CF_TURKIC)!=0) {
3390         if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3391             log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3392         }
3393 
3394         length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3395         if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3396             log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3397         }
3398     }
3399 }
3400 
3401 /* test that c case-folds to itself */
3402 static void
testFoldToSelf(UChar32 c,int which)3403 testFoldToSelf(UChar32 c, int which) {
3404     UChar s[2];
3405     int32_t length;
3406 
3407     length=0;
3408     U16_APPEND_UNSAFE(s, length, c);
3409     testFold(c, which, c, c, s, length, s, length);
3410 }
3411 
3412 struct CaseFoldingData {
3413     USet *notSeen;
3414     UChar32 prev, prevSimple;
3415     UChar prevFull[32];
3416     int32_t prevFullLength;
3417     int which;
3418 };
3419 typedef struct CaseFoldingData CaseFoldingData;
3420 
3421 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)3422 caseFoldingLineFn(void *context,
3423                   char *fields[][2], int32_t fieldCount,
3424                   UErrorCode *pErrorCode) {
3425     (void)fieldCount; // suppress compiler warnings about unused variable
3426 
3427     CaseFoldingData *pData=(CaseFoldingData *)context;
3428     char *end;
3429     UChar full[32];
3430     UChar32 c, prev, simple;
3431     int32_t count;
3432     int which;
3433     char status;
3434 
3435     /* get code point */
3436     const char *s=u_skipWhitespace(fields[0][0]);
3437     if(0==strncmp(s, "0000..10FFFF", 12)) {
3438         /*
3439          * Ignore the line
3440          * # @missing: 0000..10FFFF; C; <code point>
3441          * because maps-to-self is already our default, and this line breaks this parser.
3442          */
3443         return;
3444     }
3445     c=(UChar32)strtoul(s, &end, 16);
3446     end=(char *)u_skipWhitespace(end);
3447     if(end<=fields[0][0] || end!=fields[0][1]) {
3448         log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3449         *pErrorCode=U_PARSE_ERROR;
3450         return;
3451     }
3452 
3453     /* get the status of this mapping */
3454     status=*u_skipWhitespace(fields[1][0]);
3455     if(status!='C' && status!='S' && status!='F' && status!='T') {
3456         log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3457         *pErrorCode=U_PARSE_ERROR;
3458         return;
3459     }
3460 
3461     /* get the mapping */
3462     count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3463     if(U_FAILURE(*pErrorCode)) {
3464         log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3465         return;
3466     }
3467 
3468     /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3469     if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3470         simple=c;
3471     }
3472 
3473     if(c!=(prev=pData->prev)) {
3474         /*
3475          * Test remaining mappings for the previous code point.
3476          * If a turkic folding was not mentioned, then it should fold the same
3477          * as the regular simple case folding.
3478          */
3479         UChar prevString[2];
3480         int32_t length;
3481 
3482         length=0;
3483         U16_APPEND_UNSAFE(prevString, length, prev);
3484         testFold(prev, (~pData->which)&CF_ALL,
3485                  prev, pData->prevSimple,
3486                  prevString, length,
3487                  pData->prevFull, pData->prevFullLength);
3488         pData->prev=pData->prevSimple=c;
3489         length=0;
3490         U16_APPEND_UNSAFE(pData->prevFull, length, c);
3491         pData->prevFullLength=length;
3492         pData->which=0;
3493     }
3494 
3495     /*
3496      * Turn the status into a bit set of case foldings to test.
3497      * Remember non-Turkic case foldings as defaults for Turkic mode.
3498      */
3499     switch(status) {
3500     case 'C':
3501         which=CF_SIMPLE|CF_FULL;
3502         pData->prevSimple=simple;
3503         u_memcpy(pData->prevFull, full, count);
3504         pData->prevFullLength=count;
3505         break;
3506     case 'S':
3507         which=CF_SIMPLE;
3508         pData->prevSimple=simple;
3509         break;
3510     case 'F':
3511         which=CF_FULL;
3512         u_memcpy(pData->prevFull, full, count);
3513         pData->prevFullLength=count;
3514         break;
3515     case 'T':
3516         which=CF_TURKIC;
3517         break;
3518     default:
3519         which=0;
3520         break; /* won't happen because of test above */
3521     }
3522 
3523     testFold(c, which, simple, simple, full, count, full, count);
3524 
3525     /* remember which case foldings of c have been tested */
3526     pData->which|=which;
3527 
3528     /* remove c from the set of ones not mentioned in CaseFolding.txt */
3529     uset_remove(pData->notSeen, c);
3530 }
3531 
3532 static void
TestCaseFolding()3533 TestCaseFolding() {
3534     CaseFoldingData data={ NULL, 0, 0, {0}, 0, 0 };
3535     char *fields[3][2];
3536     UErrorCode errorCode;
3537 
3538     static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3539 
3540     errorCode=U_ZERO_ERROR;
3541     /* test BMP & plane 1 - nothing interesting above */
3542     data.notSeen=uset_open(0, 0x1ffff);
3543     data.prevFullLength=1; /* length of full case folding of U+0000 */
3544 
3545     parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3546     if(U_SUCCESS(errorCode)) {
3547         int32_t i, start, end;
3548 
3549         /* add a pseudo-last line to finish testing of the actual last one */
3550         fields[0][0]=lastLine;
3551         fields[0][1]=lastLine+6;
3552         fields[1][0]=lastLine+7;
3553         fields[1][1]=lastLine+9;
3554         fields[2][0]=lastLine+10;
3555         fields[2][1]=lastLine+17;
3556         caseFoldingLineFn(&data, fields, 3, &errorCode);
3557 
3558         /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3559         for(i=0;
3560             0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3561                 U_SUCCESS(errorCode);
3562             ++i
3563         ) {
3564             do {
3565                 testFoldToSelf(start, CF_ALL);
3566             } while(++start<=end);
3567         }
3568     }
3569 
3570     uset_close(data.notSeen);
3571 }
3572 
TestBinaryCharacterPropertiesAPI()3573 static void TestBinaryCharacterPropertiesAPI() {
3574     // API test only. See intltest/ucdtest.cpp for functional test.
3575     UErrorCode errorCode = U_ZERO_ERROR;
3576     const USet *set = u_getBinaryPropertySet(-1, &errorCode);
3577     if (U_SUCCESS(errorCode)) {
3578         log_err("u_getBinaryPropertySet(-1) did not fail\n");
3579     }
3580     errorCode = U_ZERO_ERROR;
3581     set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);
3582     if (U_SUCCESS(errorCode)) {
3583         log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
3584     }
3585     errorCode = U_ZERO_ERROR;
3586     set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);
3587     if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
3588         log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n");
3589     }
3590 }
3591 
TestIntCharacterPropertiesAPI()3592 static void TestIntCharacterPropertiesAPI() {
3593     // API test only. See intltest/ucdtest.cpp for functional test.
3594     UErrorCode errorCode = U_ZERO_ERROR;
3595     const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);
3596     if (U_SUCCESS(errorCode)) {
3597         log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
3598     }
3599     errorCode = U_ZERO_ERROR;
3600     map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);
3601     if (U_SUCCESS(errorCode)) {
3602         log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
3603     }
3604     errorCode = U_ZERO_ERROR;
3605     map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);
3606     if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
3607         log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n");
3608     }
3609 }
3610