1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 * Name Description
14 * Madhu Katragadda Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28 #include "unicode/utf16.h"
29 #include "unicode/utf_old.h"
30 #include "cintltst.h"
31 #include "putilimp.h"
32 #include "uparse.h"
33 #include "ucase.h"
34 #include "ubidi_props.h"
35 #include "uprops.h"
36 #include "uset_imp.h"
37 #include "usc_impl.h"
38 #include "udatamem.h"
39 #include "cucdapi.h"
40 #include "cmemory.h"
41
42 /* prototypes --------------------------------------------------------------- */
43
44 static void TestUpperLower(void);
45 static void TestLetterNumber(void);
46 static void TestMisc(void);
47 static void TestPOSIX(void);
48 static void TestControlPrint(void);
49 static void TestIdentifier(void);
50 static void TestUnicodeData(void);
51 static void TestCodeUnit(void);
52 static void TestCodePoint(void);
53 static void TestCharLength(void);
54 static void TestCharNames(void);
55 static void TestUCharFromNameUnderflow(void);
56 static void TestMirroring(void);
57 static void TestUScriptRunAPI(void);
58 static void TestAdditionalProperties(void);
59 static void TestNumericProperties(void);
60 static void TestPropertyNames(void);
61 static void TestPropertyValues(void);
62 static void TestConsistency(void);
63 static void TestUBiDiProps(void);
64 static void TestCaseFolding(void);
65
66 /* internal methods used */
67 static int32_t MakeProp(char* str);
68 static int32_t MakeDir(char* str);
69
70 /* helpers ------------------------------------------------------------------ */
71
72 static void
parseUCDFile(const char * filename,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)73 parseUCDFile(const char *filename,
74 char *fields[][2], int32_t fieldCount,
75 UParseLineFn *lineFn, void *context,
76 UErrorCode *pErrorCode) {
77 char path[256];
78 char backupPath[256];
79
80 if(U_FAILURE(*pErrorCode)) {
81 return;
82 }
83
84 /* Look inside ICU_DATA first */
85 strcpy(path, u_getDataDirectory());
86 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
87 strcat(path, filename);
88
89 /* As a fallback, try to guess where the source data was located
90 * at the time ICU was built, and look there.
91 */
92 strcpy(backupPath, ctest_dataSrcDir());
93 strcat(backupPath, U_FILE_SEP_STRING);
94 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
95 strcat(backupPath, filename);
96
97 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
98 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
99 *pErrorCode=U_ZERO_ERROR;
100 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
101 }
102 if(U_FAILURE(*pErrorCode)) {
103 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
104 }
105 }
106
107 /* test data ---------------------------------------------------------------- */
108
/*
 * Parallel tables for the general category field of UnicodeData.txt.
 *
 * tagStrings is the concatenation of the two-letter general category
 * abbreviations; tagValues holds the matching U_... category constant for
 * each abbreviation, in the same order (the n-th two-letter pair belongs to
 * tagValues[n]). The order of the two tables must therefore be kept in sync.
 *
 * unicodeDataLineFn() indexes tagValues with the result of MakeProp().
 * NOTE(review): MakeProp() is not visible here; presumably it returns the
 * position of the abbreviation inside tagStrings - confirm.
 */
static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
static const int32_t tagValues[] =
{
    /* Mn */ U_NON_SPACING_MARK,
    /* Mc */ U_COMBINING_SPACING_MARK,
    /* Me */ U_ENCLOSING_MARK,
    /* Nd */ U_DECIMAL_DIGIT_NUMBER,
    /* Nl */ U_LETTER_NUMBER,
    /* No */ U_OTHER_NUMBER,
    /* Zs */ U_SPACE_SEPARATOR,
    /* Zl */ U_LINE_SEPARATOR,
    /* Zp */ U_PARAGRAPH_SEPARATOR,
    /* Cc */ U_CONTROL_CHAR,
    /* Cf */ U_FORMAT_CHAR,
    /* Cs */ U_SURROGATE,
    /* Co */ U_PRIVATE_USE_CHAR,
    /* Cn */ U_UNASSIGNED,
    /* Lu */ U_UPPERCASE_LETTER,
    /* Ll */ U_LOWERCASE_LETTER,
    /* Lt */ U_TITLECASE_LETTER,
    /* Lm */ U_MODIFIER_LETTER,
    /* Lo */ U_OTHER_LETTER,
    /* Pc */ U_CONNECTOR_PUNCTUATION,
    /* Pd */ U_DASH_PUNCTUATION,
    /* Ps */ U_START_PUNCTUATION,
    /* Pe */ U_END_PUNCTUATION,
    /* Po */ U_OTHER_PUNCTUATION,
    /* Sm */ U_MATH_SYMBOL,
    /* Sc */ U_CURRENCY_SYMBOL,
    /* Sk */ U_MODIFIER_SYMBOL,
    /* So */ U_OTHER_SYMBOL,
    /* Pi */ U_INITIAL_PUNCTUATION,
    /* Pf */ U_FINAL_PUNCTUATION
};
143
/*
 * Short names of the bidirectional classes, one NUL-terminated string of at
 * most 4 characters per row.
 *
 * NOTE(review): the row order presumably matches the numeric values of the
 * bidi class enum so that MakeDir() can map a name to its value by position;
 * MakeDir() is not visible here - confirm before reordering or inserting rows.
 */
static const char dirStrings[][5] = {
    "L",
    "R",
    "EN",
    "ES",
    "ET",
    "AN",
    "CS",
    "B",
    "S",
    "WS",
    "ON",
    "LRE",
    "LRO",
    "AL",
    "RLE",
    "RLO",
    "PDF",
    "NSM",
    "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI",
    "LRI",
    "RLI",
    "PDI"
};
170
171 void addUnicodeTest(TestNode** root);
172
/*
 * Register every test of this file in the test tree rooted at *root.
 * The tests are added with addTest() in exactly the order of the table below.
 */
void addUnicodeTest(TestNode** root)
{
    /* One row per test: the test function and its path in the test tree. */
    static const struct {
        void (*fn)(void);
        const char *path;
    } registrations[]={
        { TestCodeUnit,               "tsutil/cucdtst/TestCodeUnit" },
        { TestCodePoint,              "tsutil/cucdtst/TestCodePoint" },
        { TestCharLength,             "tsutil/cucdtst/TestCharLength" },
        { TestBinaryValues,           "tsutil/cucdtst/TestBinaryValues" },
        { TestUnicodeData,            "tsutil/cucdtst/TestUnicodeData" },
        { TestAdditionalProperties,   "tsutil/cucdtst/TestAdditionalProperties" },
        { TestNumericProperties,      "tsutil/cucdtst/TestNumericProperties" },
        { TestUpperLower,             "tsutil/cucdtst/TestUpperLower" },
        { TestLetterNumber,           "tsutil/cucdtst/TestLetterNumber" },
        { TestMisc,                   "tsutil/cucdtst/TestMisc" },
        { TestPOSIX,                  "tsutil/cucdtst/TestPOSIX" },
        { TestControlPrint,           "tsutil/cucdtst/TestControlPrint" },
        { TestIdentifier,             "tsutil/cucdtst/TestIdentifier" },
        { TestCharNames,              "tsutil/cucdtst/TestCharNames" },
        { TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow" },
        { TestMirroring,              "tsutil/cucdtst/TestMirroring" },
        { TestUScriptCodeAPI,         "tsutil/cucdtst/TestUScriptCodeAPI" },
        { TestHasScript,              "tsutil/cucdtst/TestHasScript" },
        { TestGetScriptExtensions,    "tsutil/cucdtst/TestGetScriptExtensions" },
        { TestScriptMetadataAPI,      "tsutil/cucdtst/TestScriptMetadataAPI" },
        { TestUScriptRunAPI,          "tsutil/cucdtst/TestUScriptRunAPI" },
        { TestPropertyNames,          "tsutil/cucdtst/TestPropertyNames" },
        { TestPropertyValues,         "tsutil/cucdtst/TestPropertyValues" },
        { TestConsistency,            "tsutil/cucdtst/TestConsistency" },
        { TestUBiDiProps,             "tsutil/cucdtst/TestUBiDiProps" },
        { TestCaseFolding,            "tsutil/cucdtst/TestCaseFolding" }
    };
    int32_t idx;

    for(idx=0; idx<UPRV_LENGTHOF(registrations); ++idx) {
        addTest(root, registrations[idx].fn, registrations[idx].path);
    }
}
202
203 /*==================================================== */
204 /* test u_toupper() and u_tolower() */
205 /*==================================================== */
TestUpperLower()206 static void TestUpperLower()
207 {
208 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
209 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
210 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
211 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
212 int32_t i;
213
214 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
215 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
216
217 /*
218 Checks LetterLike Symbols which were previously a source of confusion
219 [Bertrand A. D. 02/04/98]
220 */
221 for (i=0x2100;i<0x2138;i++)
222 {
223 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
224 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
225 {
226 if (i != (int)u_tolower(i)) /* itself */
227 log_err("Failed case conversion with itself: U+%04x\n", i);
228 if (i != (int)u_toupper(i))
229 log_err("Failed case conversion with itself: U+%04x\n", i);
230 }
231 }
232
233 for(i=0; i < u_strlen(upper); i++){
234 if(u_tolower(upper[i]) != lower[i]){
235 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
236 }
237 }
238
239 log_verbose("testing upper lower\n");
240 for (i = 0; i < 21; i++) {
241
242 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
243 {
244 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
245 }
246 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
247 {
248 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
249 }
250 else if (upperTest[i] != u_tolower(lowerTest[i]))
251 {
252 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
253 }
254 else if (lowerTest[i] != u_toupper(upperTest[i]))
255 {
256 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
257 }
258 else if (upperTest[i] != u_tolower(upperTest[i]))
259 {
260 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
261 }
262 else if (lowerTest[i] != u_toupper(lowerTest[i]))
263 {
264 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
265 }
266 }
267 log_verbose("done testing upper lower\n");
268
269 log_verbose("testing u_istitle\n");
270 {
271 static const UChar expected[] = {
272 0x1F88,
273 0x1F89,
274 0x1F8A,
275 0x1F8B,
276 0x1F8C,
277 0x1F8D,
278 0x1F8E,
279 0x1F8F,
280 0x1F88,
281 0x1F89,
282 0x1F8A,
283 0x1F8B,
284 0x1F8C,
285 0x1F8D,
286 0x1F8E,
287 0x1F8F,
288 0x1F98,
289 0x1F99,
290 0x1F9A,
291 0x1F9B,
292 0x1F9C,
293 0x1F9D,
294 0x1F9E,
295 0x1F9F,
296 0x1F98,
297 0x1F99,
298 0x1F9A,
299 0x1F9B,
300 0x1F9C,
301 0x1F9D,
302 0x1F9E,
303 0x1F9F,
304 0x1FA8,
305 0x1FA9,
306 0x1FAA,
307 0x1FAB,
308 0x1FAC,
309 0x1FAD,
310 0x1FAE,
311 0x1FAF,
312 0x1FA8,
313 0x1FA9,
314 0x1FAA,
315 0x1FAB,
316 0x1FAC,
317 0x1FAD,
318 0x1FAE,
319 0x1FAF,
320 0x1FBC,
321 0x1FBC,
322 0x1FCC,
323 0x1FCC,
324 0x1FFC,
325 0x1FFC,
326 };
327 int32_t num = UPRV_LENGTHOF(expected);
328 for(i=0; i<num; i++){
329 if(!u_istitle(expected[i])){
330 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
331 }
332 }
333
334 }
335 }
336
337 /* compare two sets and verify that their difference or intersection is empty */
338 static UBool
showADiffB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool expect,UBool diffIsError)339 showADiffB(const USet *a, const USet *b,
340 const char *a_name, const char *b_name,
341 UBool expect, UBool diffIsError) {
342 USet *aa;
343 int32_t i, start, end, length;
344 UErrorCode errorCode;
345
346 /*
347 * expect:
348 * TRUE -> a-b should be empty, that is, b should contain all of a
349 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
350 */
351 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
352 return TRUE;
353 }
354
355 /* clone a to aa because a is const */
356 aa=uset_open(1, 0);
357 if(aa==NULL) {
358 /* unusual problem - out of memory? */
359 return FALSE;
360 }
361 uset_addAll(aa, a);
362
363 /* compute the set in question */
364 if(expect) {
365 /* a-b */
366 uset_removeAll(aa, b);
367 } else {
368 /* a&b */
369 uset_retainAll(aa, b);
370 }
371
372 /* aa is not empty because of the initial tests above; show its contents */
373 errorCode=U_ZERO_ERROR;
374 i=0;
375 for(;;) {
376 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
377 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
378 break; /* done */
379 }
380 if(U_FAILURE(errorCode)) {
381 log_err("error comparing %s with %s at difference item %d: %s\n",
382 a_name, b_name, i, u_errorName(errorCode));
383 break;
384 }
385 if(length!=0) {
386 break; /* done with code points, got a string or -1 */
387 }
388
389 if(diffIsError) {
390 if(expect) {
391 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
392 } else {
393 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
394 }
395 } else {
396 if(expect) {
397 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
398 } else {
399 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
400 }
401 }
402
403 ++i;
404 }
405
406 uset_close(aa);
407 return FALSE;
408 }
409
410 static UBool
showAMinusB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)411 showAMinusB(const USet *a, const USet *b,
412 const char *a_name, const char *b_name,
413 UBool diffIsError) {
414 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
415 }
416
417 static UBool
showAIntersectB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)418 showAIntersectB(const USet *a, const USet *b,
419 const char *a_name, const char *b_name,
420 UBool diffIsError) {
421 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
422 }
423
424 static UBool
compareUSets(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)425 compareUSets(const USet *a, const USet *b,
426 const char *a_name, const char *b_name,
427 UBool diffIsError) {
428 /*
429 * Use an arithmetic & not a logical && so that both branches
430 * are always taken and all differences are shown.
431 */
432 return
433 showAMinusB(a, b, a_name, b_name, diffIsError) &
434 showAMinusB(b, a, b_name, a_name, diffIsError);
435 }
436
/* test isLetter(u_isalpha()) and isDigit(u_isdigit()) */
TestLetterNumber()438 static void TestLetterNumber()
439 {
440 UChar i = 0x0000;
441
442 log_verbose("Testing for isalpha\n");
443 for (i = 0x0041; i < 0x005B; i++) {
444 if (!u_isalpha(i))
445 {
446 log_err("Failed isLetter test at %.4X\n", i);
447 }
448 }
449 for (i = 0x0660; i < 0x066A; i++) {
450 if (u_isalpha(i))
451 {
452 log_err("Failed isLetter test with numbers at %.4X\n", i);
453 }
454 }
455
456 log_verbose("Testing for isdigit\n");
457 for (i = 0x0660; i < 0x066A; i++) {
458 if (!u_isdigit(i))
459 {
460 log_verbose("Failed isNumber test at %.4X\n", i);
461 }
462 }
463
464 log_verbose("Testing for isalnum\n");
465 for (i = 0x0041; i < 0x005B; i++) {
466 if (!u_isalnum(i))
467 {
468 log_err("Failed isAlNum test at %.4X\n", i);
469 }
470 }
471 for (i = 0x0660; i < 0x066A; i++) {
472 if (!u_isalnum(i))
473 {
474 log_err("Failed isAlNum test at %.4X\n", i);
475 }
476 }
477
478 {
479 /*
480 * The following checks work only starting from Unicode 4.0.
481 * Check the version number here.
482 */
483 static UVersionInfo u401={ 4, 0, 1, 0 };
484 UVersionInfo version;
485 u_getUnicodeVersion(version);
486 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
487 return;
488 }
489 }
490
491 {
492 /*
493 * Sanity check:
494 * Verify that exactly the digit characters have decimal digit values.
495 * This assumption is used in the implementation of u_digit()
496 * (which checks nt=de)
497 * compared with the parallel java.lang.Character.digit()
498 * (which checks Nd).
499 *
500 * This was not true in Unicode 3.2 and earlier.
501 * Unicode 4.0 fixed discrepancies.
502 * Unicode 4.0.1 re-introduced problems in this area due to an
503 * unintentionally incomplete last-minute change.
504 */
505 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
506 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
507
508 USet *digits, *decimalValues;
509 UErrorCode errorCode;
510
511 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
512 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
513 errorCode=U_ZERO_ERROR;
514 digits=uset_openPattern(digitsPattern, 6, &errorCode);
515 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
516
517 if(U_SUCCESS(errorCode)) {
518 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
519 }
520
521 uset_close(digits);
522 uset_close(decimalValues);
523 }
524 }
525
/*
 * Call a character property function on each sample code point and report,
 * with log_err(), every code point for which the result differs from
 * the expected value.
 *
 * propFn             property function under test
 * propName           name of propFn, used in the error message
 * sampleChars        code points to test
 * sampleCharsLength  number of code points in sampleChars
 * expected           result that propFn must return for every sample
 */
static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
                                const UChar32 *sampleChars, int32_t sampleCharsLength,
                                UBool expected) {
    const UChar32 *cursor=sampleChars;
    int32_t remaining;

    for (remaining = sampleCharsLength; remaining > 0; --remaining, ++cursor) {
        const UBool actual = propFn(*cursor);
        if (actual == expected) {
            continue;
        }
        log_err("error: character property function %s(U+%04x)=%d is wrong\n",
                propName, *cursor, actual);
    }
}
538
/* Tests for isDefined(u_isdefined()), isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(u_isWhitespace()), u_charDigitValue() */
TestMisc()540 static void TestMisc()
541 {
542 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
543 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
544 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
545 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
546 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
547 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
548 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
549 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
550 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
551 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
552 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
553
554 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
555
556 uint32_t mask;
557
558 int32_t i;
559 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
560 UVersionInfo realVersion;
561
562 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
563
564 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
565 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
566
567 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
568 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
569 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
570 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
571
572 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
573 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
574 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
575 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
576
577 testSampleCharProps(u_isdefined, "u_isdefined",
578 sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
579 testSampleCharProps(u_isdefined, "u_isdefined",
580 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
581
582 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
583 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
584
585 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
586 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
587
588 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
589 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
590 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
591 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
592 }
593 }
594
595 /* Tests the ICU version #*/
596 u_getVersion(realVersion);
597 u_versionToString(realVersion, icuVersion);
598 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
599 {
600 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
601 }
602 #if defined(ICU_VERSION)
603 /* test only happens where we have configure.in with VERSION - sanity check. */
604 if(strcmp(U_ICU_VERSION, ICU_VERSION))
605 {
606 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
607 }
608 #endif
609
610 /* test U_GC_... */
611 if(
612 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
613 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
614 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
615 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
616 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
617 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
618 ) {
619 log_err("error: U_GET_GC_MASK does not work properly\n");
620 }
621
622 mask=0;
623 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
624
625 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
626 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
627 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
628 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
629 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
630
631 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
632 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
633 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
634
635 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
636 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
637 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
638
639 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
640 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
641 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
642
643 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
644 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
645 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
646 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
647
648 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
649 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
650 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
651 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
652 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
653
654 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
655 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
656 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
657 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
658
659 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
660 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
661
662 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
663 log_err("error: problems with U_GC_XX_MASK constants\n");
664 }
665
666 mask=0;
667 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
668 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
669 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
670 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
671 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
672 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
673 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
674
675 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
676 log_err("error: problems with U_GC_Y_MASK constants\n");
677 }
678 {
679 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
680 for(i=0; i<10; i++){
681 if(digit[i]!=u_forDigit(i,10)){
682 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
683 }
684 }
685 }
686
687 /* test u_digit() */
688 {
689 static const struct {
690 UChar32 c;
691 int8_t radix, value;
692 } data[]={
693 /* base 16 */
694 { 0x0031, 16, 1 },
695 { 0x0038, 16, 8 },
696 { 0x0043, 16, 12 },
697 { 0x0066, 16, 15 },
698 { 0x00e4, 16, -1 },
699 { 0x0662, 16, 2 },
700 { 0x06f5, 16, 5 },
701 { 0xff13, 16, 3 },
702 { 0xff41, 16, 10 },
703
704 /* base 8 */
705 { 0x0031, 8, 1 },
706 { 0x0038, 8, -1 },
707 { 0x0043, 8, -1 },
708 { 0x0066, 8, -1 },
709 { 0x00e4, 8, -1 },
710 { 0x0662, 8, 2 },
711 { 0x06f5, 8, 5 },
712 { 0xff13, 8, 3 },
713 { 0xff41, 8, -1 },
714
715 /* base 36 */
716 { 0x5a, 36, 35 },
717 { 0x7a, 36, 35 },
718 { 0xff3a, 36, 35 },
719 { 0xff5a, 36, 35 },
720
721 /* wrong radix values */
722 { 0x0031, 1, -1 },
723 { 0xff3a, 37, -1 }
724 };
725
726 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
727 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
728 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
729 data[i].c,
730 data[i].radix,
731 u_digit(data[i].c, data[i].radix),
732 data[i].value);
733 }
734 }
735 }
736 }
737
738 /* test C/POSIX-style functions --------------------------------------------- */
739
740 /* bit flags */
/*
 * One bit per C/POSIX-style classification function. Bit n (counting from
 * the least significant bit) belongs to posixClasses[n]; TestPOSIX() walks
 * both with a mask that is shifted left once per table entry.
 */
#define ISAL     1      /* u_isalpha */
#define ISLO     2      /* u_islower */
#define ISUP     4      /* u_isupper */

#define ISDI     8      /* u_isdigit */
#define ISXD  0x10      /* u_isxdigit */

#define ISAN  0x20      /* u_isalnum */

#define ISPU  0x40      /* u_ispunct */
#define ISGR  0x80      /* u_isgraph */
#define ISPR 0x100      /* u_isprint */

#define ISSP 0x200      /* u_isspace */
#define ISBL 0x400      /* u_isblank */
#define ISCN 0x800      /* u_iscntrl */
757
758 /* C/POSIX-style functions, in the same order as the bit flags */
/* Signature shared by the classification functions listed in posixClasses. */
typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);

/*
 * The functions under test together with their names for error messages
 * (TestPOSIX() prints the name with a "u_" prefix).
 * The entry order must match the bit order of the ISxx flags.
 */
static const struct {
    IsPOSIXClass *fn;
    const char *name;
} posixClasses[]={
    { u_isalpha, "isalpha" },
    { u_islower, "islower" },
    { u_isupper, "isupper" },
    { u_isdigit, "isdigit" },
    { u_isxdigit, "isxdigit" },
    { u_isalnum, "isalnum" },
    { u_ispunct, "ispunct" },
    { u_isgraph, "isgraph" },
    { u_isprint, "isprint" },
    { u_isspace, "isspace" },
    { u_isblank, "isblank" },
    { u_iscntrl, "iscntrl" }
};
778
/*
 * Test data for TestPOSIX(): a code point and the set of ISxx flags for
 * which the corresponding function is expected to return TRUE. A function
 * whose flag is absent is expected to return FALSE for that code point.
 */
static const struct {
    UChar32 c;
    uint32_t posixResults;
} posixData[]={
    { 0x0008, ISCN }, /* backspace */
    { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
    { 0x000a, ISSP| ISCN }, /* LF */
    { 0x000c, ISSP| ISCN }, /* FF */
    { 0x000d, ISSP| ISCN }, /* CR */
    { 0x0020, ISPR|ISSP|ISBL }, /* space */
    { 0x0021, ISPU|ISGR|ISPR }, /* ! */
    { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
    { 0x0040, ISPU|ISGR|ISPR }, /* @ */
    { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
    { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
    { 0x007b, ISPU|ISGR|ISPR }, /* { */
    { 0x0085, ISSP| ISCN }, /* NEL */
    { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
    { 0x00a4, ISGR|ISPR }, /* currency sign */
    { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
    { 0x0300, ISGR|ISPR }, /* combining grave */
    { 0x0600, ISCN }, /* arabic number sign */
    { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
    { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
    { 0x2002, ISPR|ISSP|ISBL }, /* en space */
    { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
    { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
    { 0x200b, ISCN }, /* ZWSP */
    /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
    { 0x200e, ISCN }, /* LRM */
    { 0x2028, ISPR|ISSP| ISCN }, /* LS */
    { 0x2029, ISPR|ISSP| ISCN }, /* PS */
    { 0x20ac, ISGR|ISPR }, /* Euro */
    { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
    { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
    { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
    { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
    { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
};
818
819 static void
TestPOSIX()820 TestPOSIX() {
821 uint32_t mask;
822 int32_t cl, i;
823 UBool expect;
824
825 mask=1;
826 for(cl=0; cl<12; ++cl) {
827 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
828 expect=(UBool)((posixData[i].posixResults&mask)!=0);
829 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
830 log_err("u_%s(U+%04x)=%s is wrong\n",
831 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
832 }
833 }
834 mask<<=1;
835 }
836 }
837
838 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
TestControlPrint()839 static void TestControlPrint()
840 {
841 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
842 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
843 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
844 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
845 UChar32 c;
846
847 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
848 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
849
850 testSampleCharProps(u_isprint, "u_isprint",
851 samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
852 testSampleCharProps(u_isprint, "u_isprint",
853 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
854
855 /* test all ISO 8 controls */
856 for(c=0; c<=0x9f; ++c) {
857 if(c==0x20) {
858 /* skip ASCII graphic characters and continue with DEL */
859 c=0x7f;
860 }
861 if(!u_iscntrl(c)) {
862 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
863 }
864 if(!u_isISOControl(c)) {
865 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
866 }
867 if(u_isprint(c)) {
868 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
869 }
870 }
871
872 /* test all Latin-1 graphic characters */
873 for(c=0x20; c<=0xff; ++c) {
874 if(c==0x7f) {
875 c=0xa0;
876 } else if(c==0xad) {
877 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
878 ++c;
879 }
880 if(!u_isprint(c)) {
881 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
882 }
883 }
884 }
885
886 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
TestIdentifier()887 static void TestIdentifier()
888 {
889 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
890 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
891 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
892 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
893 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
894 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
895 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
896 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
897 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
898 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
899
900 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
901 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
902 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
903 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
904
905 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
906 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
907 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
908 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
909
910 /* IDPart should imply IDStart */
911 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
912 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
913
914 testSampleCharProps(u_isIDStart, "u_isIDStart",
915 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
916 testSampleCharProps(u_isIDStart, "u_isIDStart",
917 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
918
919 testSampleCharProps(u_isIDPart, "u_isIDPart",
920 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
921 testSampleCharProps(u_isIDPart, "u_isIDPart",
922 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
923
924 /* IDPart should imply IDStart */
925 testSampleCharProps(u_isIDPart, "u_isIDPart",
926 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
927
928 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
929 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
930 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
931 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
932 }
933
934 /* for each line of UnicodeData.txt, check some of the properties */
/*
 * Context structure for the UnicodeData.txt checks.
 * NOTE(review): presumably handed to unicodeDataLineFn() through its
 * void *context parameter - the use is not visible here, confirm.
 */
typedef struct UnicodeDataContext {
#if UCONFIG_NO_NORMALIZATION
    /* only member when normalization is compiled out */
    const void *dummy;
#else
    /* normalizer instances; the names suggest NFC and NFKC - set up by the caller */
    const UNormalizer2 *nfc;
    const UNormalizer2 *nfkc;
#endif
} UnicodeDataContext;
943
944 /*
945 * ### TODO
946 * This test fails incorrectly if the First or Last code point of a repetitive area
947 * is overridden, which is allowed and is encouraged for the PUAs.
948 * Currently, this means that both area First/Last and override lines are
949 * tested against the properties from the API,
950 * and the area boundary will not match and cause an error.
951 *
952 * This function should detect area boundaries and skip them for the test of individual
953 * code points' properties.
954 * Then it should check that the areas contain all the same properties except where overridden.
955 * For this, it would have had to set a flag for which code points were listed explicitly.
956 */
957 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)958 unicodeDataLineFn(void *context,
959 char *fields[][2], int32_t fieldCount,
960 UErrorCode *pErrorCode)
961 {
962 char buffer[100];
963 const char *d;
964 char *end;
965 uint32_t value;
966 UChar32 c;
967 int32_t i;
968 int8_t type;
969 int32_t dt;
970 UChar dm[32], s[32];
971 int32_t dmLength, length;
972
973 #if !UCONFIG_NO_NORMALIZATION
974 const UNormalizer2 *nfc, *nfkc;
975 #endif
976
977 /* get the character code, field 0 */
978 c=strtoul(fields[0][0], &end, 16);
979 if(end<=fields[0][0] || end!=fields[0][1]) {
980 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
981 return;
982 }
983 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
984 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
985 return;
986 }
987
988 /* get general category, field 2 */
989 *fields[2][1]=0;
990 type = (int8_t)tagValues[MakeProp(fields[2][0])];
991 if(u_charType(c)!=type) {
992 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
993 }
994 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
995 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
996 }
997
998 /* get canonical combining class, field 3 */
999 value=strtoul(fields[3][0], &end, 10);
1000 if(end<=fields[3][0] || end!=fields[3][1]) {
1001 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1002 return;
1003 }
1004 if(value>255) {
1005 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1006 return;
1007 }
1008 #if !UCONFIG_NO_NORMALIZATION
1009 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1010 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1011 }
1012 nfkc=((UnicodeDataContext *)context)->nfkc;
1013 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1014 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1015 }
1016 #endif
1017
1018 /* get BiDi category, field 4 */
1019 *fields[4][1]=0;
1020 i=MakeDir(fields[4][0]);
1021 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1022 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1023 }
1024
1025 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1026 d=NULL;
1027 if(fields[5][0]==fields[5][1]) {
1028 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1029 if(c==0xac00 || c==0xd7a3) {
1030 dt=U_DT_CANONICAL;
1031 } else {
1032 dt=U_DT_NONE;
1033 }
1034 } else {
1035 d=fields[5][0];
1036 *fields[5][1]=0;
1037 dt=UCHAR_INVALID_CODE;
1038 if(*d=='<') {
1039 end=strchr(++d, '>');
1040 if(end!=NULL) {
1041 *end=0;
1042 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1043 d=u_skipWhitespace(end+1);
1044 }
1045 } else {
1046 dt=U_DT_CANONICAL;
1047 }
1048 }
1049 if(dt>U_DT_NONE) {
1050 if(c==0xac00) {
1051 dm[0]=0x1100;
1052 dm[1]=0x1161;
1053 dm[2]=0;
1054 dmLength=2;
1055 } else if(c==0xd7a3) {
1056 dm[0]=0xd788;
1057 dm[1]=0x11c2;
1058 dm[2]=0;
1059 dmLength=2;
1060 } else {
1061 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1062 }
1063 } else {
1064 dmLength=-1;
1065 }
1066 if(dt<0 || U_FAILURE(*pErrorCode)) {
1067 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1068 return;
1069 }
1070 #if !UCONFIG_NO_NORMALIZATION
1071 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1072 if(i!=dt) {
1073 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1074 }
1075 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1076 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1077 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1078 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1079 "or the Decomposition_Mapping is different (%s)\n",
1080 c, length, dmLength, u_errorName(*pErrorCode));
1081 return;
1082 }
1083 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1084 if(dt!=U_DT_CANONICAL) {
1085 dmLength=-1;
1086 }
1087 nfc=((UnicodeDataContext *)context)->nfc;
1088 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1089 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1090 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1091 "or the Decomposition_Mapping is different (%s)\n",
1092 c, length, dmLength, u_errorName(*pErrorCode));
1093 return;
1094 }
1095 /* recompose */
1096 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1097 UChar32 a, b, composite;
1098 i=0;
1099 U16_NEXT(dm, i, dmLength, a);
1100 U16_NEXT(dm, i, dmLength, b);
1101 /* i==dmLength */
1102 composite=unorm2_composePair(nfc, a, b);
1103 if(composite!=c) {
1104 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1105 (long)c, (long)a, (long)b, (long)composite);
1106 }
1107 /*
1108 * Note: NFKC has fewer round-trip mappings than NFC,
1109 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1110 */
1111 }
1112 #endif
1113
1114 /* get ISO Comment, field 11 */
1115 *fields[11][1]=0;
1116 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1117 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1118 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1119 c, u_errorName(*pErrorCode),
1120 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1121 fields[11][0]);
1122 }
1123
1124 /* get uppercase mapping, field 12 */
1125 if(fields[12][0]!=fields[12][1]) {
1126 value=strtoul(fields[12][0], &end, 16);
1127 if(end!=fields[12][1]) {
1128 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1129 return;
1130 }
1131 if((UChar32)value!=u_toupper(c)) {
1132 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1133 }
1134 } else {
1135 /* no case mapping: the API must map the code point to itself */
1136 if(c!=u_toupper(c)) {
1137 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1138 }
1139 }
1140
1141 /* get lowercase mapping, field 13 */
1142 if(fields[13][0]!=fields[13][1]) {
1143 value=strtoul(fields[13][0], &end, 16);
1144 if(end!=fields[13][1]) {
1145 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1146 return;
1147 }
1148 if((UChar32)value!=u_tolower(c)) {
1149 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1150 }
1151 } else {
1152 /* no case mapping: the API must map the code point to itself */
1153 if(c!=u_tolower(c)) {
1154 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1155 }
1156 }
1157
1158 /* get titlecase mapping, field 14 */
1159 if(fields[14][0]!=fields[14][1]) {
1160 value=strtoul(fields[14][0], &end, 16);
1161 if(end!=fields[14][1]) {
1162 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1163 return;
1164 }
1165 if((UChar32)value!=u_totitle(c)) {
1166 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1167 }
1168 } else {
1169 /* no case mapping: the API must map the code point to itself */
1170 if(c!=u_totitle(c)) {
1171 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1172 }
1173 }
1174 }
1175
1176 static UBool U_CALLCONV
enumTypeRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1177 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1178 static const UChar32 test[][2]={
1179 {0x41, U_UPPERCASE_LETTER},
1180 {0x308, U_NON_SPACING_MARK},
1181 {0xfffe, U_GENERAL_OTHER_TYPES},
1182 {0xe0041, U_FORMAT_CHAR},
1183 {0xeffff, U_UNASSIGNED}
1184 };
1185
1186 int32_t i, count;
1187
1188 if(0!=strcmp((const char *)context, "a1")) {
1189 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1190 return FALSE;
1191 }
1192
1193 count=UPRV_LENGTHOF(test);
1194 for(i=0; i<count; ++i) {
1195 if(start<=test[i][0] && test[i][0]<limit) {
1196 if(type!=(UCharCategory)test[i][1]) {
1197 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1198 start, limit, (long)type, test[i][0], test[i][1]);
1199 }
1200 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1201 return i==(count-1) ? FALSE : TRUE;
1202 }
1203 }
1204
1205 if(start>test[count-1][0]) {
1206 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1207 start, limit, (long)type);
1208 return FALSE;
1209 }
1210
1211 return TRUE;
1212 }
1213
1214 static UBool U_CALLCONV
enumDefaultsRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1215 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1216 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1217 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1218 { 0x0590, U_LEFT_TO_RIGHT },
1219 { 0x0600, U_RIGHT_TO_LEFT },
1220 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1221 { 0x0860, U_RIGHT_TO_LEFT },
1222 { 0x0870, U_RIGHT_TO_LEFT_ARABIC }, // Unicode 10 changes U+0860..U+086F from R to AL.
1223 { 0x08A0, U_RIGHT_TO_LEFT },
1224 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1225 { 0x20A0, U_LEFT_TO_RIGHT },
1226 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1227 { 0xFB1D, U_LEFT_TO_RIGHT },
1228 { 0xFB50, U_RIGHT_TO_LEFT },
1229 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1230 { 0xFE70, U_LEFT_TO_RIGHT },
1231 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1232 { 0x10800, U_LEFT_TO_RIGHT },
1233 { 0x11000, U_RIGHT_TO_LEFT },
1234 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1235 { 0x1EE00, U_RIGHT_TO_LEFT },
1236 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1237 { 0x1F000, U_RIGHT_TO_LEFT },
1238 { 0x110000, U_LEFT_TO_RIGHT }
1239 };
1240
1241 UChar32 c;
1242 int32_t i;
1243 UCharDirection shouldBeDir;
1244
1245 /*
1246 * LineBreak.txt specifies:
1247 * # - Assigned characters that are not listed explicitly are given the value
1248 * # "AL".
1249 * # - Unassigned characters are given the value "XX".
1250 *
1251 * PUA characters are listed explicitly with "XX".
1252 * Verify that no assigned character has "XX".
1253 */
1254 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1255 c=start;
1256 while(c<limit) {
1257 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1258 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1259 }
1260 ++c;
1261 }
1262 }
1263
1264 /*
1265 * Verify default Bidi classes.
1266 * See DerivedBidiClass.txt, especially for unassigned code points.
1267 */
1268 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1269 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1270 c=start;
1271 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1272 if((int32_t)c<defaultBidi[i][0]) {
1273 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1274 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1275 shouldBeDir=U_BOUNDARY_NEUTRAL;
1276 } else {
1277 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1278 }
1279
1280 if( u_charDirection(c)!=shouldBeDir ||
1281 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1282 ) {
1283 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1284 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1285 }
1286 ++c;
1287 }
1288 }
1289 }
1290 }
1291
1292 return TRUE;
1293 }
1294
1295 /* tests for several properties */
TestUnicodeData()1296 static void TestUnicodeData()
1297 {
1298 UVersionInfo expectVersionArray;
1299 UVersionInfo versionArray;
1300 char *fields[15][2];
1301 UErrorCode errorCode;
1302 UChar32 c;
1303 int8_t type;
1304
1305 UnicodeDataContext context;
1306
1307 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1308 u_getUnicodeVersion(versionArray);
1309 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1310 {
1311 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1312 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1313 }
1314
1315 #if defined(ICU_UNICODE_VERSION)
1316 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1317 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1318 {
1319 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1320 }
1321 #endif
1322
1323 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1324 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1325 }
1326
1327 errorCode=U_ZERO_ERROR;
1328 #if !UCONFIG_NO_NORMALIZATION
1329 context.nfc=unorm2_getNFCInstance(&errorCode);
1330 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1331 if(U_FAILURE(errorCode)) {
1332 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1333 return;
1334 }
1335 #endif
1336 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1337 if(U_FAILURE(errorCode)) {
1338 return; /* if we couldn't parse UnicodeData.txt, we should return */
1339 }
1340
1341 /* sanity check on repeated properties */
1342 for(c=0xfffe; c<=0x10ffff;) {
1343 type=u_charType(c);
1344 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1345 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1346 }
1347 if(type!=U_UNASSIGNED) {
1348 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1349 }
1350 if((c&0xffff)==0xfffe) {
1351 ++c;
1352 } else {
1353 c+=0xffff;
1354 }
1355 }
1356
1357 /* test that PUA is not "unassigned" */
1358 for(c=0xe000; c<=0x10fffd;) {
1359 type=u_charType(c);
1360 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1361 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1362 }
1363 if(type==U_UNASSIGNED) {
1364 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1365 } else if(type!=U_PRIVATE_USE_CHAR) {
1366 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1367 }
1368 if(c==0xf8ff) {
1369 c=0xf0000;
1370 } else if(c==0xffffd) {
1371 c=0x100000;
1372 } else {
1373 ++c;
1374 }
1375 }
1376
1377 /* test u_enumCharTypes() */
1378 u_enumCharTypes(enumTypeRange, "a1");
1379
1380 /* check default properties */
1381 u_enumCharTypes(enumDefaultsRange, NULL);
1382 }
1383
TestCodeUnit()1384 static void TestCodeUnit(){
1385 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1386
1387 int32_t i;
1388
1389 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1390 UChar c=codeunit[i];
1391 if(i<4){
1392 if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1393 U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1394 log_err("ERROR: U+%04x is a single", c);
1395 }
1396
1397 }
1398 if(i >= 4 && i< 8){
1399 if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1400 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1401 log_err("ERROR: U+%04x is a first surrogate", c);
1402 }
1403 }
1404 if(i >= 8 && i< 12){
1405 if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1406 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1407 log_err("ERROR: U+%04x is a second surrogate", c);
1408 }
1409 }
1410 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1411 if(i<4){
1412 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1413 log_err("ERROR: U+%04x is a single", c);
1414 }
1415
1416 }
1417 if(i >= 4 && i< 8){
1418 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1419 log_err("ERROR: U+%04x is a first surrogate", c);
1420 }
1421 }
1422 if(i >= 8 && i< 12){
1423 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1424 log_err("ERROR: U+%04x is a second surrogate", c);
1425 }
1426 }
1427 #endif
1428 }
1429 }
1430
TestCodePoint()1431 static void TestCodePoint(){
1432 const UChar32 codePoint[]={
1433 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1434 0xd800,
1435 0xdbff,
1436 0xdc00,
1437 0xdfff,
1438 0xdc04,
1439 0xd821,
1440 /*not a surrogate, valid, isUnicodeChar , not Error*/
1441 0x20ac,
1442 0xd7ff,
1443 0xe000,
1444 0xe123,
1445 0x0061,
1446 0xe065,
1447 0x20402,
1448 0x24506,
1449 0x23456,
1450 0x20402,
1451 0x10402,
1452 0x23456,
1453 /*not a surrogate, not valid, isUnicodeChar, isError */
1454 0x0015,
1455 0x009f,
1456 /*not a surrogate, not valid, not isUnicodeChar, isError */
1457 0xffff,
1458 0xfffe,
1459 };
1460 int32_t i;
1461 for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
1462 UChar32 c=codePoint[i];
1463 if(i<6) {
1464 if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1465 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1466 }
1467 if(U_IS_UNICODE_CHAR(c)) {
1468 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1469 }
1470 } else if(i >=6 && i<18) {
1471 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1472 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1473 }
1474 if(!U_IS_UNICODE_CHAR(c)) {
1475 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1476 }
1477 } else if(i >=18 && i<20) {
1478 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1479 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1480 }
1481 if(!U_IS_UNICODE_CHAR(c)) {
1482 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1483 }
1484 } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1485 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1486 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1487 }
1488 if(U_IS_UNICODE_CHAR(c)) {
1489 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1490 }
1491 }
1492 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1493 if(i<6){
1494 if(!UTF_IS_SURROGATE(c)){
1495 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1496 }
1497 if(UTF_IS_VALID(c)){
1498 log_err("ERROR: isValid() failed for U+%04x\n", c);
1499 }
1500 if(UTF_IS_UNICODE_CHAR(c)){
1501 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1502 }
1503 if(UTF_IS_ERROR(c)){
1504 log_err("ERROR: isError() failed for U+%04x\n", c);
1505 }
1506 }else if(i >=6 && i<18){
1507 if(UTF_IS_SURROGATE(c)){
1508 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1509 }
1510 if(!UTF_IS_VALID(c)){
1511 log_err("ERROR: isValid() failed for U+%04x\n", c);
1512 }
1513 if(!UTF_IS_UNICODE_CHAR(c)){
1514 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1515 }
1516 if(UTF_IS_ERROR(c)){
1517 log_err("ERROR: isError() failed for U+%04x\n", c);
1518 }
1519 }else if(i >=18 && i<20){
1520 if(UTF_IS_SURROGATE(c)){
1521 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1522 }
1523 if(UTF_IS_VALID(c)){
1524 log_err("ERROR: isValid() failed for U+%04x\n", c);
1525 }
1526 if(!UTF_IS_UNICODE_CHAR(c)){
1527 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1528 }
1529 if(!UTF_IS_ERROR(c)){
1530 log_err("ERROR: isError() failed for U+%04x\n", c);
1531 }
1532 }
1533 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1534 if(UTF_IS_SURROGATE(c)){
1535 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1536 }
1537 if(UTF_IS_VALID(c)){
1538 log_err("ERROR: isValid() failed for U+%04x\n", c);
1539 }
1540 if(UTF_IS_UNICODE_CHAR(c)){
1541 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1542 }
1543 if(!UTF_IS_ERROR(c)){
1544 log_err("ERROR: isError() failed for U+%04x\n", c);
1545 }
1546 }
1547 #endif
1548 }
1549
1550 if(
1551 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1552 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1553 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1554 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1555 ) {
1556 log_err("error with U_IS_BMP()\n");
1557 }
1558
1559 if(
1560 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1561 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1562 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1563 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1564 ) {
1565 log_err("error with U_IS_SUPPLEMENTARY()\n");
1566 }
1567 }
1568
TestCharLength()1569 static void TestCharLength()
1570 {
1571 const int32_t codepoint[]={
1572 1, 0x0061,
1573 1, 0xe065,
1574 1, 0x20ac,
1575 2, 0x20402,
1576 2, 0x23456,
1577 2, 0x24506,
1578 2, 0x20402,
1579 2, 0x10402,
1580 1, 0xd7ff,
1581 1, 0xe000
1582 };
1583
1584 int32_t i;
1585 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1586 UBool multiple;
1587 #endif
1588 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1589 UChar32 c=codepoint[i+1];
1590 if(
1591 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1592 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1593 #endif
1594 U16_LENGTH(c) != codepoint[i]) {
1595 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1596 }
1597 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1598 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1599 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1600 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1601 }
1602 #endif
1603 }
1604 }
1605
1606 /*internal functions ----*/
MakeProp(char * str)1607 static int32_t MakeProp(char* str)
1608 {
1609 int32_t result = 0;
1610 char* matchPosition =0;
1611
1612 matchPosition = strstr(tagStrings, str);
1613 if (matchPosition == 0)
1614 {
1615 log_err("unrecognized type letter ");
1616 log_err(str);
1617 }
1618 else
1619 result = (int32_t)((matchPosition - tagStrings) / 2);
1620 return result;
1621 }
1622
MakeDir(char * str)1623 static int32_t MakeDir(char* str)
1624 {
1625 int32_t pos = 0;
1626 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1627 if (strcmp(str, dirStrings[pos]) == 0) {
1628 return pos;
1629 }
1630 }
1631 return -1;
1632 }
1633
1634 /* test u_charName() -------------------------------------------------------- */
1635
/*
 * Sample code points with their expected names:
 * modern name, Unicode 1.0 name, extended name and name alias.
 * alias is NULL where no alias is expected.
 */
static const struct {
    uint32_t code;
    const char *name, *oldName, *extName, *alias;
} names[]={
    {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A", NULL},
    {0x01a2, "LATIN CAPITAL LETTER OI", "",
             "LATIN CAPITAL LETTER OI",
             "LATIN CAPITAL LETTER GHA"},
    {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
             "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", NULL},
    {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
             "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
             "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
    {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401", NULL},
    {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED", NULL},
    {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA", NULL},
    {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH", NULL},
    {0xd800, "", "", "<lead surrogate-D800>", NULL},
    {0xdc00, "", "", "<trail surrogate-DC00>", NULL},
    {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS", NULL},
    {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN", NULL},
    {0xffff, "", "", "<noncharacter-FFFF>", NULL},
    {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
              "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
              "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
    {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456", NULL}
};
1663
1664 static UBool
enumCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1665 enumCharNamesFn(void *context,
1666 UChar32 code, UCharNameChoice nameChoice,
1667 const char *name, int32_t length) {
1668 int32_t *pCount=(int32_t *)context;
1669 const char *expected;
1670 int i;
1671
1672 if(length<=0 || length!=(int32_t)strlen(name)) {
1673 /* should not be called with an empty string or invalid length */
1674 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1675 return TRUE;
1676 }
1677
1678 ++*pCount;
1679 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1680 if(code==(UChar32)names[i].code) {
1681 switch (nameChoice) {
1682 case U_EXTENDED_CHAR_NAME:
1683 if(0!=strcmp(name, names[i].extName)) {
1684 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1685 }
1686 break;
1687 case U_UNICODE_CHAR_NAME:
1688 if(0!=strcmp(name, names[i].name)) {
1689 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1690 }
1691 break;
1692 case U_UNICODE_10_CHAR_NAME:
1693 expected=names[i].oldName;
1694 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1695 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1696 }
1697 break;
1698 case U_CHAR_NAME_ALIAS:
1699 expected=names[i].alias;
1700 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1701 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1702 }
1703 break;
1704 case U_CHAR_NAME_CHOICE_COUNT:
1705 break;
1706 }
1707 break;
1708 }
1709 }
1710 return TRUE;
1711 }
1712
/* Context for enumExtCharNamesFn(). */
struct enumExtCharNamesContext {
    uint32_t length;    /* handed to enumCharNamesFn() as its counter, which increments it per valid name */
    int32_t last;       /* code point of the previous callback; compared with code-1 to detect gaps */
};
1717
1718 static UBool
enumExtCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1719 enumExtCharNamesFn(void *context,
1720 UChar32 code, UCharNameChoice nameChoice,
1721 const char *name, int32_t length) {
1722 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1723
1724 if (ecncp->last != (int32_t) code - 1) {
1725 if (ecncp->last < 0) {
1726 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1727 } else {
1728 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1729 }
1730 }
1731 ecncp->last = (int32_t) code;
1732
1733 if (!*name) {
1734 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1735 }
1736
1737 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1738 }
1739
1740 /**
1741 * This can be made more efficient by moving it into putil.c and having
1742 * it directly access the ebcdic translation tables.
1743 * TODO: If we get this method in putil.c, then delete it from here.
1744 */
1745 static UChar
u_charToUChar(char c)1746 u_charToUChar(char c) {
1747 UChar uc;
1748 u_charsToUChars(&c, &uc, 1);
1749 return uc;
1750 }
1751
1752 static void
TestCharNames()1753 TestCharNames() {
1754 static char name[80];
1755 UErrorCode errorCode=U_ZERO_ERROR;
1756 struct enumExtCharNamesContext extContext;
1757 const char *expected;
1758 int32_t length;
1759 UChar32 c;
1760 int32_t i;
1761
1762 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1763 length=uprv_getMaxCharNameLength();
1764 if(length==0) {
1765 /* no names data available */
1766 return;
1767 }
1768 if(length<83) { /* Unicode 3.2 max char name length */
1769 log_err("uprv_getMaxCharNameLength()=%d is too short");
1770 }
1771 /* ### TODO same tests for max ISO comment length as for max name length */
1772
1773 log_verbose("Testing u_charName()\n");
1774 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1775 /* modern Unicode character name */
1776 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1777 if(U_FAILURE(errorCode)) {
1778 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1779 return;
1780 }
1781 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1782 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1783 }
1784
1785 /* find the modern name */
1786 if (*names[i].name) {
1787 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1788 if(U_FAILURE(errorCode)) {
1789 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1790 return;
1791 }
1792 if(c!=(UChar32)names[i].code) {
1793 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1794 }
1795 }
1796
1797 /* Unicode 1.0 character name */
1798 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1799 if(U_FAILURE(errorCode)) {
1800 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1801 return;
1802 }
1803 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1804 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1805 }
1806
1807 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1808 if(names[i].oldName[0]!=0 /* && length>0 */) {
1809 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1810 if(U_FAILURE(errorCode)) {
1811 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1812 return;
1813 }
1814 if(c!=(UChar32)names[i].code) {
1815 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1816 }
1817 }
1818
1819 /* Unicode character name alias */
1820 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1821 if(U_FAILURE(errorCode)) {
1822 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1823 return;
1824 }
1825 expected=names[i].alias;
1826 if(expected==NULL) {
1827 expected="";
1828 }
1829 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1830 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1831 names[i].code, name, length, expected);
1832 }
1833
1834 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1835 if(expected[0]!=0 /* && length>0 */) {
1836 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1837 if(U_FAILURE(errorCode)) {
1838 log_err("u_charFromName(%s - alias) error %s\n",
1839 expected, u_errorName(errorCode));
1840 return;
1841 }
1842 if(c!=(UChar32)names[i].code) {
1843 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1844 expected, c, names[i].code);
1845 }
1846 }
1847 }
1848
1849 /* test u_enumCharNames() */
1850 length=0;
1851 errorCode=U_ZERO_ERROR;
1852 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1853 if(U_FAILURE(errorCode) || length<94140) {
1854 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1855 }
1856
1857 extContext.length = 0;
1858 extContext.last = -1;
1859 errorCode=U_ZERO_ERROR;
1860 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1861 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1862 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1863 }
1864
1865 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1866 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1867 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1868 }
1869
1870 /* Test getCharNameCharacters */
1871 if(!getTestOption(QUICK_OPTION)) {
1872 enum { BUFSIZE = 256 };
1873 UErrorCode ec = U_ZERO_ERROR;
1874 char buf[BUFSIZE];
1875 int32_t maxLength;
1876 UChar32 cp;
1877 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1878 int32_t l1, l2;
1879 UBool map[256];
1880 UBool ok;
1881
1882 USet* set = uset_open(1, 0); /* empty set */
1883 USet* dumb = uset_open(1, 0); /* empty set */
1884
1885 /*
1886 * uprv_getCharNameCharacters() will likely return more lowercase
1887 * letters than actual character names contain because
1888 * it includes all the characters in lowercased names of
1889 * general categories, for the full possible set of extended names.
1890 */
1891 {
1892 USetAdder sa={
1893 NULL,
1894 uset_add,
1895 uset_addRange,
1896 uset_addString,
1897 NULL /* don't need remove() */
1898 };
1899 sa.set=set;
1900 uprv_getCharNameCharacters(&sa);
1901 }
1902
1903 /* build set the dumb (but sure-fire) way */
1904 for (i=0; i<256; ++i) {
1905 map[i] = FALSE;
1906 }
1907
1908 maxLength=0;
1909 for (cp=0; cp<0x110000; ++cp) {
1910 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1911 buf, BUFSIZE, &ec);
1912 if (U_FAILURE(ec)) {
1913 log_err("FAIL: u_charName failed when it shouldn't\n");
1914 uset_close(set);
1915 uset_close(dumb);
1916 return;
1917 }
1918 if(len>maxLength) {
1919 maxLength=len;
1920 }
1921
1922 for (i=0; i<len; ++i) {
1923 if (!map[(uint8_t) buf[i]]) {
1924 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1925 map[(uint8_t) buf[i]] = TRUE;
1926 }
1927 }
1928
1929 /* test for leading/trailing whitespace */
1930 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1931 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1932 }
1933 }
1934
1935 if(map[(uint8_t)'\t']) {
1936 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1937 }
1938
1939 length=uprv_getMaxCharNameLength();
1940 if(length!=maxLength) {
1941 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1942 length, maxLength);
1943 }
1944
1945 /* compare the sets. Where is my uset_equals?!! */
1946 ok=TRUE;
1947 for(i=0; i<256; ++i) {
1948 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1949 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1950 /* ignore lowercase a-z that are in set but not in dumb */
1951 ok=TRUE;
1952 } else {
1953 ok=FALSE;
1954 break;
1955 }
1956 }
1957 }
1958
1959 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1960 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1961 if (U_FAILURE(ec)) {
1962 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1963 uset_close(set);
1964 uset_close(dumb);
1965 return;
1966 }
1967
1968 if (l1 >= BUFSIZE) {
1969 l1 = BUFSIZE-1;
1970 pat[l1] = 0;
1971 }
1972 if (l2 >= BUFSIZE) {
1973 l2 = BUFSIZE-1;
1974 dumbPat[l2] = 0;
1975 }
1976
1977 if (!ok) {
1978 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1979 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1980 } else if(getTestOption(VERBOSITY_OPTION)) {
1981 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1982 }
1983
1984 uset_close(set);
1985 uset_close(dumb);
1986 }
1987
1988 /* ### TODO: test error cases and other interesting things */
1989 }
1990
1991 static void
TestUCharFromNameUnderflow()1992 TestUCharFromNameUnderflow() {
1993 // Ticket #10889: Underflow crash when there is no dash.
1994 UErrorCode errorCode=U_ZERO_ERROR;
1995 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<NO BREAK SPACE>", &errorCode);
1996 if(U_SUCCESS(errorCode)) {
1997 log_err("u_charFromName(<NO BREAK SPACE>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1998 }
1999
2000 // Test related edge cases.
2001 errorCode=U_ZERO_ERROR;
2002 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<-00a0>", &errorCode);
2003 if(U_SUCCESS(errorCode)) {
2004 log_err("u_charFromName(<-00a0>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2005 }
2006
2007 errorCode=U_ZERO_ERROR;
2008 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control->", &errorCode);
2009 if(U_SUCCESS(errorCode)) {
2010 log_err("u_charFromName(<control->) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2011 }
2012
2013 errorCode=U_ZERO_ERROR;
2014 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control-111111>", &errorCode);
2015 if(U_SUCCESS(errorCode)) {
2016 log_err("u_charFromName(<control-111111>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2017 }
2018 }
2019
2020 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
2021
2022 static void
TestMirroring()2023 TestMirroring() {
2024 USet *set;
2025 UErrorCode errorCode;
2026
2027 UChar32 start, end, c2, c3;
2028 int32_t i;
2029
2030 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2031
2032 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2033
2034 log_verbose("Testing u_isMirrored()\n");
2035 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2036 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2037 )
2038 ) {
2039 log_err("u_isMirrored() does not work correctly\n");
2040 }
2041
2042 log_verbose("Testing u_charMirror()\n");
2043 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
2044 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
2045 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2046 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2047 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2048 )
2049 ) {
2050 log_err("u_charMirror() does not work correctly\n");
2051 }
2052
2053 /* verify that Bidi_Mirroring_Glyph roundtrips */
2054 errorCode=U_ZERO_ERROR;
2055 set=uset_openPattern(mirroredPattern, 17, &errorCode);
2056
2057 if (U_FAILURE(errorCode)) {
2058 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2059 } else {
2060 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2061 do {
2062 c2=u_charMirror(start);
2063 c3=u_charMirror(c2);
2064 if(c3!=start) {
2065 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2066 }
2067 c3=u_getBidiPairedBracket(start);
2068 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2069 if(c3!=start) {
2070 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2071 (long)start);
2072 }
2073 } else {
2074 if(c3!=c2) {
2075 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2076 (long)start, (long)c2);
2077 }
2078 }
2079 } while(++start<=end);
2080 }
2081 }
2082
2083 uset_close(set);
2084 }
2085
2086
2087 struct RunTestData
2088 {
2089 const char *runText;
2090 UScriptCode runCode;
2091 };
2092
2093 typedef struct RunTestData RunTestData;
2094
2095 static void
CheckScriptRuns(UScriptRun * scriptRun,int32_t * runStarts,const RunTestData * testData,int32_t nRuns,const char * prefix)2096 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2097 const char *prefix)
2098 {
2099 int32_t run, runStart, runLimit;
2100 UScriptCode runCode;
2101
2102 /* iterate over all the runs */
2103 run = 0;
2104 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2105 if (runStart != runStarts[run]) {
2106 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2107 prefix, run, runStarts[run], runStart);
2108 }
2109
2110 if (runLimit != runStarts[run + 1]) {
2111 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2112 prefix, run, runStarts[run + 1], runLimit);
2113 }
2114
2115 if (runCode != testData[run].runCode) {
2116 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2117 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2118 }
2119
2120 run += 1;
2121
2122 /* stop when we've seen all the runs we expect to see */
2123 if (run >= nRuns) {
2124 break;
2125 }
2126 }
2127
2128 /* Complain if we didn't see then number of runs we expected */
2129 if (run != nRuns) {
2130 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2131 }
2132 }
2133
2134 static void
TestUScriptRunAPI()2135 TestUScriptRunAPI()
2136 {
2137 static const RunTestData testData1[] = {
2138 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2139 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2140 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2141 {"English (", USCRIPT_LATIN},
2142 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2143 {") ", USCRIPT_LATIN},
2144 {"\\u6F22\\u5B75", USCRIPT_HAN},
2145 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2146 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2147 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2148 };
2149
2150 static const RunTestData testData2[] = {
2151 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2152 };
2153
2154 static const struct {
2155 const RunTestData *testData;
2156 int32_t nRuns;
2157 } testDataEntries[] = {
2158 {testData1, UPRV_LENGTHOF(testData1)},
2159 {testData2, UPRV_LENGTHOF(testData2)}
2160 };
2161
2162 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2163 int32_t testEntry;
2164
2165 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2166 UChar testString[1024];
2167 int32_t runStarts[256];
2168 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2169 const RunTestData *testData = testDataEntries[testEntry].testData;
2170
2171 int32_t run, stringLimit;
2172 UScriptRun *scriptRun = NULL;
2173 UErrorCode err;
2174
2175 /*
2176 * Fill in the test string and the runStarts array.
2177 */
2178 stringLimit = 0;
2179 for (run = 0; run < nTestRuns; run += 1) {
2180 runStarts[run] = stringLimit;
2181 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2182 /*stringLimit -= 1;*/
2183 }
2184
2185 /* The limit of the last run */
2186 runStarts[nTestRuns] = stringLimit;
2187
2188 /*
2189 * Make sure that calling uscript_OpenRun with a NULL text pointer
2190 * and a non-zero text length returns the correct error.
2191 */
2192 err = U_ZERO_ERROR;
2193 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2194
2195 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2196 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2197 }
2198
2199 if (scriptRun != NULL) {
2200 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2201 uscript_closeRun(scriptRun);
2202 }
2203
2204 /*
2205 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2206 * and a zero text length returns the correct error.
2207 */
2208 err = U_ZERO_ERROR;
2209 scriptRun = uscript_openRun(testString, 0, &err);
2210
2211 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2212 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2213 }
2214
2215 if (scriptRun != NULL) {
2216 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2217 uscript_closeRun(scriptRun);
2218 }
2219
2220 /*
2221 * Make sure that calling uscript_openRun with a NULL text pointer
2222 * and a zero text length doesn't return an error.
2223 */
2224 err = U_ZERO_ERROR;
2225 scriptRun = uscript_openRun(NULL, 0, &err);
2226
2227 if (U_FAILURE(err)) {
2228 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2229 }
2230
2231 /* Make sure that the empty iterator doesn't find any runs */
2232 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2233 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2234 }
2235
2236 /*
2237 * Make sure that calling uscript_setRunText with a NULL text pointer
2238 * and a non-zero text length returns the correct error.
2239 */
2240 err = U_ZERO_ERROR;
2241 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2242
2243 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2244 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2245 }
2246
2247 /*
2248 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2249 * and a zero text length returns the correct error.
2250 */
2251 err = U_ZERO_ERROR;
2252 uscript_setRunText(scriptRun, testString, 0, &err);
2253
2254 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2255 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2256 }
2257
2258 /*
2259 * Now call uscript_setRunText on the empty iterator
2260 * and make sure that it works.
2261 */
2262 err = U_ZERO_ERROR;
2263 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2264
2265 if (U_FAILURE(err)) {
2266 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2267 } else {
2268 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2269 }
2270
2271 uscript_closeRun(scriptRun);
2272
2273 /*
2274 * Now open an interator over the testString
2275 * using uscript_openRun and make sure that it works
2276 */
2277 scriptRun = uscript_openRun(testString, stringLimit, &err);
2278
2279 if (U_FAILURE(err)) {
2280 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2281 } else {
2282 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2283 }
2284
2285 /* Now reset the iterator, and make sure
2286 * that it still works.
2287 */
2288 uscript_resetRun(scriptRun);
2289
2290 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2291
2292 /* Close the iterator */
2293 uscript_closeRun(scriptRun);
2294 }
2295 }
2296
2297 /* test additional, non-core properties */
2298 static void
TestAdditionalProperties()2299 TestAdditionalProperties() {
2300 /* test data for u_charAge() */
2301 static const struct {
2302 UChar32 c;
2303 UVersionInfo version;
2304 } charAges[]={
2305 {0x41, { 1, 1, 0, 0 }},
2306 {0xffff, { 1, 1, 0, 0 }},
2307 {0x20ab, { 2, 0, 0, 0 }},
2308 {0x2fffe, { 2, 0, 0, 0 }},
2309 {0x20ac, { 2, 1, 0, 0 }},
2310 {0xfb1d, { 3, 0, 0, 0 }},
2311 {0x3f4, { 3, 1, 0, 0 }},
2312 {0x10300, { 3, 1, 0, 0 }},
2313 {0x220, { 3, 2, 0, 0 }},
2314 {0xff60, { 3, 2, 0, 0 }}
2315 };
2316
2317 /* test data for u_hasBinaryProperty() */
2318 static const int32_t
2319 props[][3]={ /* code point, property, value */
2320 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2321 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2322 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2323
2324 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2325 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2326
2327 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2328 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2329
2330 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2331 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2332
2333 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2334 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2335 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2336 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2337 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2338
2339 { 0x058a, UCHAR_DASH, TRUE },
2340 { 0x007e, UCHAR_DASH, FALSE },
2341
2342 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2343 { 0x3000, UCHAR_DIACRITIC, FALSE },
2344
2345 { 0x0e46, UCHAR_EXTENDER, TRUE },
2346 { 0x0020, UCHAR_EXTENDER, FALSE },
2347
2348 #if !UCONFIG_NO_NORMALIZATION
2349 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2350 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2351 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2352
2353 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2354 { 0x0308, UCHAR_NFD_INERT, FALSE },
2355
2356 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2357 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2358
2359 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2360 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2361 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2362 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2363 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2364 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2365
2366 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2367 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2368
2369 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2370 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2371 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2372 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2373 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2374 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2375 #endif
2376
2377 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2378 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2379 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2380
2381 { 0x30fb, UCHAR_HYPHEN, TRUE },
2382 { 0xfe58, UCHAR_HYPHEN, FALSE },
2383
2384 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2385 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2386 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2387
2388 { 0x2172, UCHAR_ID_START, TRUE },
2389 { 0x007a, UCHAR_ID_START, TRUE },
2390 { 0x0039, UCHAR_ID_START, FALSE },
2391
2392 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2393 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2394 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2395
2396 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2397 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2398
2399 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2400 { 0x0345, UCHAR_LOWERCASE, TRUE },
2401 { 0x0030, UCHAR_LOWERCASE, FALSE },
2402
2403 { 0x1d7a9, UCHAR_MATH, TRUE },
2404 { 0x2135, UCHAR_MATH, TRUE },
2405 { 0x0062, UCHAR_MATH, FALSE },
2406
2407 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2408 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2409 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2410
2411 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2412 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2413 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2414
2415 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2416 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2417
2418 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2419 { 0x2162, UCHAR_UPPERCASE, TRUE },
2420 { 0x0345, UCHAR_UPPERCASE, FALSE },
2421
2422 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2423 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2424 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2425
2426 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2427 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2428 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2429
2430 { 0x16ee, UCHAR_XID_START, TRUE },
2431 { 0x23456, UCHAR_XID_START, TRUE },
2432 { 0x1d1aa, UCHAR_XID_START, FALSE },
2433
2434 /*
2435 * Version break:
2436 * The following properties are only supported starting with the
2437 * Unicode version indicated in the second field.
2438 */
2439 { -1, 0x320, 0 },
2440
2441 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2442 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2443 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2444
2445 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2446 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2447 { 0xe0001, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2448 { 0xe0100, UCHAR_DEPRECATED, FALSE },
2449
2450 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2451 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2452 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2453 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2454
2455 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2456 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2457 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2458 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2459
2460 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2461 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2462
2463 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2464 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2465
2466 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2467 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2468
2469 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2470 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2471
2472 { 0x2e9b, UCHAR_RADICAL, TRUE },
2473 { 0x4e00, UCHAR_RADICAL, FALSE },
2474
2475 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2476 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2477
2478 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2479 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2480
2481 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2482
2483 { 0x002e, UCHAR_S_TERM, TRUE },
2484 { 0x0061, UCHAR_S_TERM, FALSE },
2485
2486 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2487 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2488 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2489 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2490
2491 /* enum/integer type properties */
2492
2493 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2494 /* test default Bidi classes for unassigned code points */
2495 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2496 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2497 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2498 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2499 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2500 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2501 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2502 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2503 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2504 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2505 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2506
2507 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2508 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2509 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2510 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2511 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2512 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2513 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2514
2515 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2516 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2517 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2518 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2519 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2520 { 0x1CBF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2521 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2522 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2523 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2524 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2525 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2526
2527 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2528 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2529
2530 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2531 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2532 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2533 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2534 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2535 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2536 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2537 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2538 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2539
2540 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2541 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2542 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2543 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2544 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2545 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2546 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2547 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2548 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2549 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2550 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2551 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2552 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2553 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2554 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2555 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2556 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2557
2558 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2559 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2560 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2561
2562 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2563 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2564 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2565 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2566 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2567
2568 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2569 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2570 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2571 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2572 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2573 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2574 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2575 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2576
2577 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2578 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2579 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2580 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2581 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2582 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2583 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2584 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2585 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2586 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2587 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2588 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2589 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2590 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2591 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2592 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2593
2594 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2595
2596 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2597
2598 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2599 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2600 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2601 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2602 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2603 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2604 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2605
2606 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2607 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2608 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2609 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2610
2611 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2612 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2613 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2614 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2615 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2616 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2617
2618 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2619 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2620 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2621 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2622
2623 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2624 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2625 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2626 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2627 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2628 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2629 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2630
2631 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2632 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2633 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2634 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2635
2636 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2637 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2638 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2639 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2640
2641 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2642 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2643 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2644 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2645 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2646
2647 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2648
2649 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2650
2651 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2652 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2653 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2654
2655 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2656 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2657 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2658 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2659 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2660
2661 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2662 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2663 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2664
2665 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2666 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2667 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2668 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2669
2670 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2671 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2672 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2673 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2674 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2675 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2676
2677 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2678 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2679 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2680 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2681
2682 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2683 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2684 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2685 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2686
2687 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2688 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2689 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2690 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2691
2692 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2693
2694 /* unassigned code points in new default Bidi R blocks */
2695 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2696 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2697
2698 /* test some script codes >127 */
2699 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2700 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2701 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2702
2703 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2704
2705 /* value changed in Unicode 6.0 */
2706 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2707
2708 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2709
2710 /* unassigned code points in new/changed default Bidi AL blocks */
2711 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2712 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2713
2714 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2715
2716 /* unassigned code points in the currency symbols block now default to ET */
2717 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2718 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2719
2720 /* new property in Unicode 6.3 */
2721 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2722 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2723 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2724 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2725 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2726 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2727
2728 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2729
2730 /* new character range with Joining_Group values */
2731 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2732 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2733 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2734 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2735 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2736
2737 { -1, 0xa00, 0 }, // version break for Unicode 10
2738
2739 { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2740 { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2741 { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2742 { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2743
2744 { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2745 { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2746 { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2747
2748 /* undefined UProperty values */
2749 { 0x61, 0x4a7, 0 },
2750 { 0x234bc, 0x15ed, 0 }
2751 };
2752
2753 UVersionInfo version;
2754 UChar32 c;
2755 int32_t i, result, uVersion;
2756 UProperty which;
2757
2758 /* what is our Unicode version? */
2759 u_getUnicodeVersion(version);
2760 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2761
2762 u_charAge(0x20, version);
2763 if(version[0]==0) {
2764 /* no additional properties available */
2765 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2766 return;
2767 }
2768
2769 /* test u_charAge() */
2770 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2771 u_charAge(charAges[i].c, version);
2772 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2773 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2774 charAges[i].c,
2775 version[0], version[1], version[2], version[3],
2776 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2777 }
2778 }
2779
2780 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2781 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2782 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2783 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2784 u_getIntPropertyMinValue(0x2345)!=0
2785 ) {
2786 log_err("error: u_getIntPropertyMinValue() wrong\n");
2787 }
2788 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2789 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2790 }
2791 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2792 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2793 }
2794 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2795 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2796 }
2797 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2798 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2799 }
2800 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2801 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2802 }
2803 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2804 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2805 }
2806 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2807 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2808 }
2809 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2810 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2811 }
2812 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2813 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2814 }
2815 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2816 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2817 }
2818 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2819 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2820 }
2821 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2822 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2823 }
2824 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2825 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2826 }
2827 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2828 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2829 }
2830 /*JB#2410*/
2831 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2832 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2833 }
2834 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2835 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2836 }
2837 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2838 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2839 }
2840 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2841 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2842 }
2843 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2844 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2845 }
2846
2847 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2848 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2849 const char *whichName;
2850
2851 if(props[i][0]<0) {
2852 /* Unicode version break */
2853 if(uVersion<props[i][1]) {
2854 break; /* do not test properties that are not yet supported */
2855 } else {
2856 continue; /* skip this row */
2857 }
2858 }
2859
2860 c=(UChar32)props[i][0];
2861 which=(UProperty)props[i][1];
2862 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2863
2864 if(which<UCHAR_INT_START) {
2865 result=u_hasBinaryProperty(c, which);
2866 if(result!=props[i][2]) {
2867 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2868 c, whichName, result, i);
2869 }
2870 }
2871
2872 result=u_getIntPropertyValue(c, which);
2873 if(result!=props[i][2]) {
2874 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2875 c, whichName, result, props[i][2], i);
2876 }
2877
2878 /* test separate functions, too */
2879 switch((UProperty)props[i][1]) {
2880 case UCHAR_ALPHABETIC:
2881 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2882 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2883 props[i][0], result, i);
2884 }
2885 break;
2886 case UCHAR_LOWERCASE:
2887 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2888 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2889 props[i][0], result, i);
2890 }
2891 break;
2892 case UCHAR_UPPERCASE:
2893 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2894 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2895 props[i][0], result, i);
2896 }
2897 break;
2898 case UCHAR_WHITE_SPACE:
2899 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2900 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2901 props[i][0], result, i);
2902 }
2903 break;
2904 default:
2905 break;
2906 }
2907 }
2908 }
2909
2910 static void
TestNumericProperties(void)2911 TestNumericProperties(void) {
2912 /* see UnicodeData.txt, DerivedNumericValues.txt */
2913 static const struct {
2914 UChar32 c;
2915 int32_t type;
2916 double numValue;
2917 } values[]={
2918 { 0x0F33, U_NT_NUMERIC, -1./2. },
2919 { 0x0C66, U_NT_DECIMAL, 0 },
2920 { 0x96f6, U_NT_NUMERIC, 0 },
2921 { 0xa833, U_NT_NUMERIC, 1./16. },
2922 { 0x2152, U_NT_NUMERIC, 1./10. },
2923 { 0x2151, U_NT_NUMERIC, 1./9. },
2924 { 0x1245f, U_NT_NUMERIC, 1./8. },
2925 { 0x2150, U_NT_NUMERIC, 1./7. },
2926 { 0x2159, U_NT_NUMERIC, 1./6. },
2927 { 0x09f6, U_NT_NUMERIC, 3./16. },
2928 { 0x2155, U_NT_NUMERIC, 1./5. },
2929 { 0x00BD, U_NT_NUMERIC, 1./2. },
2930 { 0x0031, U_NT_DECIMAL, 1. },
2931 { 0x4e00, U_NT_NUMERIC, 1. },
2932 { 0x58f1, U_NT_NUMERIC, 1. },
2933 { 0x10320, U_NT_NUMERIC, 1. },
2934 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2935 { 0x00B2, U_NT_DIGIT, 2. },
2936 { 0x5f10, U_NT_NUMERIC, 2. },
2937 { 0x1813, U_NT_DECIMAL, 3. },
2938 { 0x5f0e, U_NT_NUMERIC, 3. },
2939 { 0x2173, U_NT_NUMERIC, 4. },
2940 { 0x8086, U_NT_NUMERIC, 4. },
2941 { 0x278E, U_NT_DIGIT, 5. },
2942 { 0x1D7F2, U_NT_DECIMAL, 6. },
2943 { 0x247A, U_NT_DIGIT, 7. },
2944 { 0x7396, U_NT_NUMERIC, 9. },
2945 { 0x1372, U_NT_NUMERIC, 10. },
2946 { 0x216B, U_NT_NUMERIC, 12. },
2947 { 0x16EE, U_NT_NUMERIC, 17. },
2948 { 0x249A, U_NT_NUMERIC, 19. },
2949 { 0x303A, U_NT_NUMERIC, 30. },
2950 { 0x5345, U_NT_NUMERIC, 30. },
2951 { 0x32B2, U_NT_NUMERIC, 37. },
2952 { 0x1375, U_NT_NUMERIC, 40. },
2953 { 0x10323, U_NT_NUMERIC, 50. },
2954 { 0x0BF1, U_NT_NUMERIC, 100. },
2955 { 0x964c, U_NT_NUMERIC, 100. },
2956 { 0x217E, U_NT_NUMERIC, 500. },
2957 { 0x2180, U_NT_NUMERIC, 1000. },
2958 { 0x4edf, U_NT_NUMERIC, 1000. },
2959 { 0x2181, U_NT_NUMERIC, 5000. },
2960 { 0x137C, U_NT_NUMERIC, 10000. },
2961 { 0x4e07, U_NT_NUMERIC, 10000. },
2962 { 0x12432, U_NT_NUMERIC, 216000. },
2963 { 0x12433, U_NT_NUMERIC, 432000. },
2964 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2965 { 0x5146, U_NT_NUMERIC, 1000000000000. },
2966 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
2967 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2968 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2969 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2970 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2971 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2972 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2973 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
2974 };
2975
2976 double nv;
2977 UChar32 c;
2978 int32_t i, type;
2979
2980 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
2981 c=values[i].c;
2982 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2983 nv=u_getNumericValue(c);
2984
2985 if(type!=values[i].type) {
2986 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2987 }
2988 if(0.000001 <= fabs(nv - values[i].numValue)) {
2989 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2990 }
2991 }
2992 }
2993
2994 /**
2995 * Test the property names and property value names API.
2996 */
2997 static void
TestPropertyNames(void)2998 TestPropertyNames(void) {
2999 int32_t p, v, choice=0, rev;
3000 UBool atLeastSomething = FALSE;
3001
3002 for (p=0; ; ++p) {
3003 UProperty propEnum = (UProperty)p;
3004 UBool sawProp = FALSE;
3005 if(p > 10 && !atLeastSomething) {
3006 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3007 return;
3008 }
3009
3010 for (choice=0; ; ++choice) {
3011 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
3012 if (name) {
3013 if (!sawProp)
3014 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
3015 log_verbose("%d=\"%s\"", choice, name);
3016 sawProp = TRUE;
3017 atLeastSomething = TRUE;
3018
3019 /* test reverse mapping */
3020 rev = u_getPropertyEnum(name);
3021 if (rev != p) {
3022 log_err("Property round-trip failure: %d -> %s -> %d\n",
3023 p, name, rev);
3024 }
3025 }
3026 if (!name && choice>0) break;
3027 }
3028 if (sawProp) {
3029 /* looks like a valid property; check the values */
3030 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3031 int32_t max = 0;
3032 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3033 max = 255;
3034 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3035 /* it's far too slow to iterate all the way up to
3036 the real max, U_GC_P_MASK */
3037 max = U_GC_NL_MASK;
3038 } else if (p == UCHAR_BLOCK) {
3039 /* UBlockCodes, unlike other values, start at 1 */
3040 max = 1;
3041 }
3042 log_verbose("\n");
3043 for (v=-1; ; ++v) {
3044 UBool sawValue = FALSE;
3045 for (choice=0; ; ++choice) {
3046 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
3047 if (vname) {
3048 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3049 log_verbose("%d=\"%s\"", choice, vname);
3050 sawValue = TRUE;
3051
3052 /* test reverse mapping */
3053 rev = u_getPropertyValueEnum(propEnum, vname);
3054 if (rev != v) {
3055 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3056 pname, v, vname, rev);
3057 }
3058 }
3059 if (!vname && choice>0) break;
3060 }
3061 if (sawValue) {
3062 log_verbose("\n");
3063 }
3064 if (!sawValue && v>=max) break;
3065 }
3066 }
3067 if (!sawProp) {
3068 if (p>=UCHAR_STRING_LIMIT) {
3069 break;
3070 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3071 p = UCHAR_STRING_START - 1;
3072 } else if (p>=UCHAR_MASK_LIMIT) {
3073 p = UCHAR_DOUBLE_START - 1;
3074 } else if (p>=UCHAR_INT_LIMIT) {
3075 p = UCHAR_MASK_START - 1;
3076 } else if (p>=UCHAR_BINARY_LIMIT) {
3077 p = UCHAR_INT_START - 1;
3078 }
3079 }
3080 }
3081 }
3082
3083 /**
3084 * Test the property values API. See JB#2410.
3085 */
3086 static void
TestPropertyValues(void)3087 TestPropertyValues(void) {
3088 int32_t i, p, min, max;
3089 UErrorCode ec;
3090
3091 /* Min should be 0 for everything. */
3092 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3093 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3094 UProperty propEnum = (UProperty)p;
3095 min = u_getIntPropertyMinValue(propEnum);
3096 if (min != 0) {
3097 if (p == UCHAR_BLOCK) {
3098 /* This is okay...for now. See JB#2487.
3099 TODO Update this for JB#2487. */
3100 } else {
3101 const char* name;
3102 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3103 if (name == NULL)
3104 name = "<ERROR>";
3105 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3106 name, min);
3107 }
3108 }
3109 }
3110
3111 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3112 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3113 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3114 }
3115
3116 /* Max should be -1 for invalid properties. */
3117 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3118 if (max != -1) {
3119 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3120 max);
3121 }
3122
3123 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3124 for (i=0; i<2; ++i) {
3125 int32_t script;
3126 const char* desc;
3127 ec = U_ZERO_ERROR;
3128 switch (i) {
3129 case 0:
3130 script = uscript_getScript(-1, &ec);
3131 desc = "uscript_getScript(-1)";
3132 break;
3133 case 1:
3134 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3135 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3136 break;
3137 default:
3138 log_err("Internal test error. Too many scripts\n");
3139 return;
3140 }
3141 /* We don't explicitly test ec. It should be U_FAILURE but it
3142 isn't documented as such. */
3143 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3144 log_err("FAIL: %s = %d, exp. 0\n",
3145 desc, script);
3146 }
3147 }
3148 }
3149
3150 /* various tests for consistency of UCD data and API behavior */
3151 static void
TestConsistency()3152 TestConsistency() {
3153 char buffer[300];
3154 USet *set1, *set2, *set3, *set4;
3155 UErrorCode errorCode;
3156
3157 UChar32 start, end;
3158 int32_t i, length;
3159
3160 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3161 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3162 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3163 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3164 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3165
3166 U_STRING_DECL(mathBlocksPattern,
3167 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3168 214);
3169 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3170 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3171 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3172 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3173
3174 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3175 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3176 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3177 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3178 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3179
3180 U_STRING_INIT(mathBlocksPattern,
3181 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3182 214);
3183 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3184 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3185 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3186 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3187
3188 /*
3189 * It used to be that UCD.html and its precursors said
3190 * "Those dashes used to mark connections between pieces of words,
3191 * plus the Katakana middle dot."
3192 *
3193 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3194 * but not from Hyphen.
3195 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3196 * Therefore, do not show errors when testing the Hyphen property.
3197 */
3198 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3199 "known to the UTC and not considered errors.\n");
3200
3201 errorCode=U_ZERO_ERROR;
3202 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3203 set2=uset_openPattern(dashPattern, 8, &errorCode);
3204 if(U_SUCCESS(errorCode)) {
3205 /* remove the Katakana middle dot(s) from set1 */
3206 uset_remove(set1, 0x30fb);
3207 uset_remove(set1, 0xff65); /* halfwidth variant */
3208 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3209 } else {
3210 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3211 }
3212
3213 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3214 set3=uset_openPattern(formatPattern, 6, &errorCode);
3215 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3216 if(U_SUCCESS(errorCode)) {
3217 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3218 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3219 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3220 } else {
3221 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3222 }
3223
3224 uset_close(set1);
3225 uset_close(set2);
3226 uset_close(set3);
3227 uset_close(set4);
3228
3229 /*
3230 * Check that each lowercase character has "small" in its name
3231 * and not "capital".
3232 * There are some such characters, some of which seem odd.
3233 * Use the verbose flag to see these notices.
3234 */
3235 errorCode=U_ZERO_ERROR;
3236 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3237 if(U_SUCCESS(errorCode)) {
3238 for(i=0;; ++i) {
3239 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3240 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3241 break; /* done */
3242 }
3243 if(U_FAILURE(errorCode)) {
3244 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3245 i, u_errorName(errorCode));
3246 break;
3247 }
3248 if(length!=0) {
3249 break; /* done with code points, got a string or -1 */
3250 }
3251
3252 while(start<=end) {
3253 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3254 if(U_FAILURE(errorCode)) {
3255 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3256 errorCode=U_ZERO_ERROR;
3257 }
3258 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3259 strstr(buffer, "SMALL CAPITAL")==NULL
3260 ) {
3261 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3262 }
3263 ++start;
3264 }
3265 }
3266 } else {
3267 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3268 }
3269 uset_close(set1);
3270
3271 /* verify that all assigned characters in Math blocks are exactly Math characters */
3272 errorCode=U_ZERO_ERROR;
3273 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3274 set2=uset_openPattern(mathPattern, 8, &errorCode);
3275 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3276 if(U_SUCCESS(errorCode)) {
3277 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3278 uset_complement(set3); /* assigned characters */
3279 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3280 compareUSets(set1, set2,
3281 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3282 TRUE);
3283 } else {
3284 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3285 }
3286 uset_close(set1);
3287 uset_close(set2);
3288 uset_close(set3);
3289
3290 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3291 errorCode=U_ZERO_ERROR;
3292 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3293 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3294 if(U_SUCCESS(errorCode)) {
3295 compareUSets(set1, set2,
3296 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3297 TRUE);
3298 } else {
3299 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3300 }
3301 uset_close(set1);
3302 uset_close(set2);
3303 }
3304
/*
 * Starting with ICU4C 3.4, the core Unicode properties files
 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu)
 * are hardcoded in the common DLL and therefore not included
 * in the data package any more.
 * Tests requiring these files are disabled so that
 * we need not jump through hoops (like adding snapshots of these files
 * to testdata).
 * See Jitterbug 4497.
 */
#define HARDCODED_DATA_4497 1

/*
 * API coverage for ubidi_props.c.
 *
 * The whole body is compiled only when HARDCODED_DATA_4497 is 0; with the
 * definition above, this function does nothing.
 * When enabled, it opens the bidi properties data with udata_open(), wraps it
 * with ubidi_openBinary(), spot-checks one mirroring value, and then checks
 * the class of U+0020 in the dummy properties from ubidi_getDummy().
 */
static void TestUBiDiProps(void) {
#if !HARDCODED_DATA_4497
    UDataMemory *pData;
    UBiDiProps *bdp;
    const UBiDiProps *cbdp;
    UErrorCode errorCode;

    /* coverage for ubidi_openBinary() */
    errorCode=U_ZERO_ERROR;
    pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
                     u_errorName(errorCode));
        return;
    }

    bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
                u_errorName(errorCode));
        udata_close(pData);
        return;
    }

    if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */
        log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n");
    }

    ubidi_closeProps(bdp);
    udata_close(pData);

    /* coverage for ubidi_getDummy() */
    errorCode=U_ZERO_ERROR;
    cbdp=ubidi_getDummy(&errorCode);
    if(U_FAILURE(errorCode)) {
        /* do not query properties through a result that was not returned successfully */
        log_err("ubidi_getDummy() failed: %s\n", u_errorName(errorCode));
        return;
    }
    if(ubidi_getClass(cbdp, 0x20)!=0) {
        log_err("ubidi_getClass(dummy, space)!=0\n");
    }
#endif
}
3357
3358 /* test case folding, compare return values with CaseFolding.txt ------------ */
3359
/* bit flags recording which kinds of case folding have already been tested for a character */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL=1<<1,
    CF_TURKIC=1<<2,
    /* union of all flags above */
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3367
3368 static void
testFold(UChar32 c,int which,UChar32 simple,UChar32 turkic,const UChar * full,int32_t fullLength,const UChar * turkicFull,int32_t turkicFullLength)3369 testFold(UChar32 c, int which,
3370 UChar32 simple, UChar32 turkic,
3371 const UChar *full, int32_t fullLength,
3372 const UChar *turkicFull, int32_t turkicFullLength) {
3373 UChar s[2], t[32];
3374 UChar32 c2;
3375 int32_t length, length2;
3376
3377 UErrorCode errorCode=U_ZERO_ERROR;
3378
3379 length=0;
3380 U16_APPEND_UNSAFE(s, length, c);
3381
3382 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3383 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3384 }
3385 if((which&CF_FULL)!=0) {
3386 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3387 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3388 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3389 }
3390 }
3391 if((which&CF_TURKIC)!=0) {
3392 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3393 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3394 }
3395
3396 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3397 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3398 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3399 }
3400 }
3401 }
3402
3403 /* test that c case-folds to itself */
3404 static void
testFoldToSelf(UChar32 c,int which)3405 testFoldToSelf(UChar32 c, int which) {
3406 UChar s[2];
3407 int32_t length;
3408
3409 length=0;
3410 U16_APPEND_UNSAFE(s, length, c);
3411 testFold(c, which, c, c, s, length, s, length);
3412 }
3413
3414 struct CaseFoldingData {
3415 USet *notSeen;
3416 UChar32 prev, prevSimple;
3417 UChar prevFull[32];
3418 int32_t prevFullLength;
3419 int which;
3420 };
3421 typedef struct CaseFoldingData CaseFoldingData;
3422
3423 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)3424 caseFoldingLineFn(void *context,
3425 char *fields[][2], int32_t fieldCount,
3426 UErrorCode *pErrorCode) {
3427 CaseFoldingData *pData=(CaseFoldingData *)context;
3428 char *end;
3429 UChar full[32];
3430 UChar32 c, prev, simple;
3431 int32_t count;
3432 int which;
3433 char status;
3434
3435 /* get code point */
3436 const char *s=u_skipWhitespace(fields[0][0]);
3437 if(0==strncmp(s, "0000..10FFFF", 12)) {
3438 /*
3439 * Ignore the line
3440 * # @missing: 0000..10FFFF; C; <code point>
3441 * because maps-to-self is already our default, and this line breaks this parser.
3442 */
3443 return;
3444 }
3445 c=(UChar32)strtoul(s, &end, 16);
3446 end=(char *)u_skipWhitespace(end);
3447 if(end<=fields[0][0] || end!=fields[0][1]) {
3448 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3449 *pErrorCode=U_PARSE_ERROR;
3450 return;
3451 }
3452
3453 /* get the status of this mapping */
3454 status=*u_skipWhitespace(fields[1][0]);
3455 if(status!='C' && status!='S' && status!='F' && status!='T') {
3456 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3457 *pErrorCode=U_PARSE_ERROR;
3458 return;
3459 }
3460
3461 /* get the mapping */
3462 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3463 if(U_FAILURE(*pErrorCode)) {
3464 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3465 return;
3466 }
3467
3468 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3469 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3470 simple=c;
3471 }
3472
3473 if(c!=(prev=pData->prev)) {
3474 /*
3475 * Test remaining mappings for the previous code point.
3476 * If a turkic folding was not mentioned, then it should fold the same
3477 * as the regular simple case folding.
3478 */
3479 UChar prevString[2];
3480 int32_t length;
3481
3482 length=0;
3483 U16_APPEND_UNSAFE(prevString, length, prev);
3484 testFold(prev, (~pData->which)&CF_ALL,
3485 prev, pData->prevSimple,
3486 prevString, length,
3487 pData->prevFull, pData->prevFullLength);
3488 pData->prev=pData->prevSimple=c;
3489 length=0;
3490 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3491 pData->prevFullLength=length;
3492 pData->which=0;
3493 }
3494
3495 /*
3496 * Turn the status into a bit set of case foldings to test.
3497 * Remember non-Turkic case foldings as defaults for Turkic mode.
3498 */
3499 switch(status) {
3500 case 'C':
3501 which=CF_SIMPLE|CF_FULL;
3502 pData->prevSimple=simple;
3503 u_memcpy(pData->prevFull, full, count);
3504 pData->prevFullLength=count;
3505 break;
3506 case 'S':
3507 which=CF_SIMPLE;
3508 pData->prevSimple=simple;
3509 break;
3510 case 'F':
3511 which=CF_FULL;
3512 u_memcpy(pData->prevFull, full, count);
3513 pData->prevFullLength=count;
3514 break;
3515 case 'T':
3516 which=CF_TURKIC;
3517 break;
3518 default:
3519 which=0;
3520 break; /* won't happen because of test above */
3521 }
3522
3523 testFold(c, which, simple, simple, full, count, full, count);
3524
3525 /* remember which case foldings of c have been tested */
3526 pData->which|=which;
3527
3528 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3529 uset_remove(pData->notSeen, c);
3530 }
3531
3532 static void
TestCaseFolding()3533 TestCaseFolding() {
3534 CaseFoldingData data={ NULL };
3535 char *fields[3][2];
3536 UErrorCode errorCode;
3537
3538 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3539
3540 errorCode=U_ZERO_ERROR;
3541 /* test BMP & plane 1 - nothing interesting above */
3542 data.notSeen=uset_open(0, 0x1ffff);
3543 data.prevFullLength=1; /* length of full case folding of U+0000 */
3544
3545 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3546 if(U_SUCCESS(errorCode)) {
3547 int32_t i, start, end;
3548
3549 /* add a pseudo-last line to finish testing of the actual last one */
3550 fields[0][0]=lastLine;
3551 fields[0][1]=lastLine+6;
3552 fields[1][0]=lastLine+7;
3553 fields[1][1]=lastLine+9;
3554 fields[2][0]=lastLine+10;
3555 fields[2][1]=lastLine+17;
3556 caseFoldingLineFn(&data, fields, 3, &errorCode);
3557
3558 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3559 for(i=0;
3560 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3561 U_SUCCESS(errorCode);
3562 ++i
3563 ) {
3564 do {
3565 testFoldToSelf(start, CF_ALL);
3566 } while(++start<=end);
3567 }
3568 }
3569
3570 uset_close(data.notSeen);
3571 }
3572