1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 * Name Description
14 * Madhu Katragadda Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28 #include "unicode/utf16.h"
29 #include "unicode/utf_old.h"
30 #include "cintltst.h"
31 #include "putilimp.h"
32 #include "uparse.h"
33 #include "ucase.h"
34 #include "ubidi_props.h"
35 #include "uprops.h"
36 #include "uset_imp.h"
37 #include "usc_impl.h"
38 #include "udatamem.h"
39 #include "cucdapi.h"
40 #include "cmemory.h"
41
42 /* prototypes --------------------------------------------------------------- */
43
44 static void TestUpperLower(void);
45 static void TestLetterNumber(void);
46 static void TestMisc(void);
47 static void TestPOSIX(void);
48 static void TestControlPrint(void);
49 static void TestIdentifier(void);
50 static void TestUnicodeData(void);
51 static void TestCodeUnit(void);
52 static void TestCodePoint(void);
53 static void TestCharLength(void);
54 static void TestCharNames(void);
55 static void TestUCharFromNameUnderflow(void);
56 static void TestMirroring(void);
57 static void TestUScriptRunAPI(void);
58 static void TestAdditionalProperties(void);
59 static void TestNumericProperties(void);
60 static void TestPropertyNames(void);
61 static void TestPropertyValues(void);
62 static void TestConsistency(void);
63 static void TestCaseFolding(void);
64 static void TestBinaryCharacterPropertiesAPI(void);
65 static void TestIntCharacterPropertiesAPI(void);
66
67 /* internal methods used */
68 static int32_t MakeProp(char* str);
69 static int32_t MakeDir(char* str);
70
71 /* helpers ------------------------------------------------------------------ */
72
73 static void
parseUCDFile(const char * filename,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)74 parseUCDFile(const char *filename,
75 char *fields[][2], int32_t fieldCount,
76 UParseLineFn *lineFn, void *context,
77 UErrorCode *pErrorCode) {
78 char path[256];
79 char backupPath[256];
80
81 if(U_FAILURE(*pErrorCode)) {
82 return;
83 }
84
85 /* Look inside ICU_DATA first */
86 strcpy(path, u_getDataDirectory());
87 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
88 strcat(path, filename);
89
90 /* As a fallback, try to guess where the source data was located
91 * at the time ICU was built, and look there.
92 */
93 strcpy(backupPath, ctest_dataSrcDir());
94 strcat(backupPath, U_FILE_SEP_STRING);
95 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
96 strcat(backupPath, filename);
97
98 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
99 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
100 *pErrorCode=U_ZERO_ERROR;
101 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
102 }
103 if(U_FAILURE(*pErrorCode)) {
104 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
105 }
106 }
107
108 /* test data ---------------------------------------------------------------- */
109
/*
 * Two-letter general category abbreviations, concatenated without separators.
 * They are listed in the same order as the entries of tagValues[].
 */
static const char tagStrings[] =
    "MnMcMe"        /* marks */
    "NdNlNo"        /* numbers */
    "ZsZlZp"        /* separators */
    "CcCfCsCoCn"    /* other */
    "LuLlLtLmLo"    /* letters */
    "PcPdPsPePo"    /* punctuation */
    "SmScSkSo"      /* symbols */
    "PiPf";         /* initial/final punctuation */
111 static const int32_t tagValues[] =
112 {
113 /* Mn */ U_NON_SPACING_MARK,
114 /* Mc */ U_COMBINING_SPACING_MARK,
115 /* Me */ U_ENCLOSING_MARK,
116 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
117 /* Nl */ U_LETTER_NUMBER,
118 /* No */ U_OTHER_NUMBER,
119 /* Zs */ U_SPACE_SEPARATOR,
120 /* Zl */ U_LINE_SEPARATOR,
121 /* Zp */ U_PARAGRAPH_SEPARATOR,
122 /* Cc */ U_CONTROL_CHAR,
123 /* Cf */ U_FORMAT_CHAR,
124 /* Cs */ U_SURROGATE,
125 /* Co */ U_PRIVATE_USE_CHAR,
126 /* Cn */ U_UNASSIGNED,
127 /* Lu */ U_UPPERCASE_LETTER,
128 /* Ll */ U_LOWERCASE_LETTER,
129 /* Lt */ U_TITLECASE_LETTER,
130 /* Lm */ U_MODIFIER_LETTER,
131 /* Lo */ U_OTHER_LETTER,
132 /* Pc */ U_CONNECTOR_PUNCTUATION,
133 /* Pd */ U_DASH_PUNCTUATION,
134 /* Ps */ U_START_PUNCTUATION,
135 /* Pe */ U_END_PUNCTUATION,
136 /* Po */ U_OTHER_PUNCTUATION,
137 /* Sm */ U_MATH_SYMBOL,
138 /* Sc */ U_CURRENCY_SYMBOL,
139 /* Sk */ U_MODIFIER_SYMBOL,
140 /* So */ U_OTHER_SYMBOL,
141 /* Pi */ U_INITIAL_PUNCTUATION,
142 /* Pf */ U_FINAL_PUNCTUATION
143 };
144
/*
 * Short names of bidirectional categories.
 * NOTE(review): presumably ordered by UCharDirection value - confirm against the code that indexes this table.
 */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI", "LRI", "RLI", "PDI"
};
171
172 void addUnicodeTest(TestNode** root);
173
/*
 * Registers the Unicode character property tests in the test tree at root.
 * Each test is added under the path "tsutil/cucdtst/<test function name>".
 */
void addUnicodeTest(TestNode** root)
{
/* Adds one test; the path suffix is the stringified function name. */
#define ADD_CUCD_TEST(testFn) addTest(root, &testFn, "tsutil/cucdtst/" #testFn)

    ADD_CUCD_TEST(TestCodeUnit);
    ADD_CUCD_TEST(TestCodePoint);
    ADD_CUCD_TEST(TestCharLength);
    ADD_CUCD_TEST(TestBinaryValues);
    ADD_CUCD_TEST(TestUnicodeData);
    ADD_CUCD_TEST(TestAdditionalProperties);
    ADD_CUCD_TEST(TestNumericProperties);
    ADD_CUCD_TEST(TestUpperLower);
    ADD_CUCD_TEST(TestLetterNumber);
    ADD_CUCD_TEST(TestMisc);
    ADD_CUCD_TEST(TestPOSIX);
    ADD_CUCD_TEST(TestControlPrint);
    ADD_CUCD_TEST(TestIdentifier);
    ADD_CUCD_TEST(TestCharNames);
    ADD_CUCD_TEST(TestUCharFromNameUnderflow);
    ADD_CUCD_TEST(TestMirroring);
    ADD_CUCD_TEST(TestUScriptCodeAPI);
    ADD_CUCD_TEST(TestHasScript);
    ADD_CUCD_TEST(TestGetScriptExtensions);
    ADD_CUCD_TEST(TestScriptMetadataAPI);
    ADD_CUCD_TEST(TestUScriptRunAPI);
    ADD_CUCD_TEST(TestPropertyNames);
    ADD_CUCD_TEST(TestPropertyValues);
    ADD_CUCD_TEST(TestConsistency);
    ADD_CUCD_TEST(TestCaseFolding);
    ADD_CUCD_TEST(TestBinaryCharacterPropertiesAPI);
    ADD_CUCD_TEST(TestIntCharacterPropertiesAPI);

#undef ADD_CUCD_TEST
}
206
207 /*==================================================== */
208 /* test u_toupper() and u_tolower() */
209 /*==================================================== */
TestUpperLower()210 static void TestUpperLower()
211 {
212 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
213 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
214 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
215 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
216 int32_t i;
217
218 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
219 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
220
221 /*
222 Checks LetterLike Symbols which were previously a source of confusion
223 [Bertrand A. D. 02/04/98]
224 */
225 for (i=0x2100;i<0x2138;i++)
226 {
227 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
228 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
229 {
230 if (i != (int)u_tolower(i)) /* itself */
231 log_err("Failed case conversion with itself: U+%04x\n", i);
232 if (i != (int)u_toupper(i))
233 log_err("Failed case conversion with itself: U+%04x\n", i);
234 }
235 }
236
237 for(i=0; i < u_strlen(upper); i++){
238 if(u_tolower(upper[i]) != lower[i]){
239 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
240 }
241 }
242
243 log_verbose("testing upper lower\n");
244 for (i = 0; i < 21; i++) {
245
246 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
247 {
248 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
249 }
250 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
251 {
252 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
253 }
254 else if (upperTest[i] != u_tolower(lowerTest[i]))
255 {
256 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
257 }
258 else if (lowerTest[i] != u_toupper(upperTest[i]))
259 {
260 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
261 }
262 else if (upperTest[i] != u_tolower(upperTest[i]))
263 {
264 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
265 }
266 else if (lowerTest[i] != u_toupper(lowerTest[i]))
267 {
268 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
269 }
270 }
271 log_verbose("done testing upper lower\n");
272
273 log_verbose("testing u_istitle\n");
274 {
275 static const UChar expected[] = {
276 0x1F88,
277 0x1F89,
278 0x1F8A,
279 0x1F8B,
280 0x1F8C,
281 0x1F8D,
282 0x1F8E,
283 0x1F8F,
284 0x1F88,
285 0x1F89,
286 0x1F8A,
287 0x1F8B,
288 0x1F8C,
289 0x1F8D,
290 0x1F8E,
291 0x1F8F,
292 0x1F98,
293 0x1F99,
294 0x1F9A,
295 0x1F9B,
296 0x1F9C,
297 0x1F9D,
298 0x1F9E,
299 0x1F9F,
300 0x1F98,
301 0x1F99,
302 0x1F9A,
303 0x1F9B,
304 0x1F9C,
305 0x1F9D,
306 0x1F9E,
307 0x1F9F,
308 0x1FA8,
309 0x1FA9,
310 0x1FAA,
311 0x1FAB,
312 0x1FAC,
313 0x1FAD,
314 0x1FAE,
315 0x1FAF,
316 0x1FA8,
317 0x1FA9,
318 0x1FAA,
319 0x1FAB,
320 0x1FAC,
321 0x1FAD,
322 0x1FAE,
323 0x1FAF,
324 0x1FBC,
325 0x1FBC,
326 0x1FCC,
327 0x1FCC,
328 0x1FFC,
329 0x1FFC,
330 };
331 int32_t num = UPRV_LENGTHOF(expected);
332 for(i=0; i<num; i++){
333 if(!u_istitle(expected[i])){
334 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
335 }
336 }
337
338 }
339 }
340
341 /* compare two sets and verify that their difference or intersection is empty */
342 static UBool
showADiffB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool expect,UBool diffIsError)343 showADiffB(const USet *a, const USet *b,
344 const char *a_name, const char *b_name,
345 UBool expect, UBool diffIsError) {
346 USet *aa;
347 int32_t i, start, end, length;
348 UErrorCode errorCode;
349
350 /*
351 * expect:
352 * TRUE -> a-b should be empty, that is, b should contain all of a
353 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
354 */
355 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
356 return TRUE;
357 }
358
359 /* clone a to aa because a is const */
360 aa=uset_open(1, 0);
361 if(aa==NULL) {
362 /* unusual problem - out of memory? */
363 return FALSE;
364 }
365 uset_addAll(aa, a);
366
367 /* compute the set in question */
368 if(expect) {
369 /* a-b */
370 uset_removeAll(aa, b);
371 } else {
372 /* a&b */
373 uset_retainAll(aa, b);
374 }
375
376 /* aa is not empty because of the initial tests above; show its contents */
377 errorCode=U_ZERO_ERROR;
378 i=0;
379 for(;;) {
380 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
381 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
382 break; /* done */
383 }
384 if(U_FAILURE(errorCode)) {
385 log_err("error comparing %s with %s at difference item %d: %s\n",
386 a_name, b_name, i, u_errorName(errorCode));
387 break;
388 }
389 if(length!=0) {
390 break; /* done with code points, got a string or -1 */
391 }
392
393 if(diffIsError) {
394 if(expect) {
395 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
396 } else {
397 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
398 }
399 } else {
400 if(expect) {
401 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
402 } else {
403 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
404 }
405 }
406
407 ++i;
408 }
409
410 uset_close(aa);
411 return FALSE;
412 }
413
414 static UBool
showAMinusB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)415 showAMinusB(const USet *a, const USet *b,
416 const char *a_name, const char *b_name,
417 UBool diffIsError) {
418 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
419 }
420
421 static UBool
showAIntersectB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)422 showAIntersectB(const USet *a, const USet *b,
423 const char *a_name, const char *b_name,
424 UBool diffIsError) {
425 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
426 }
427
428 static UBool
compareUSets(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)429 compareUSets(const USet *a, const USet *b,
430 const char *a_name, const char *b_name,
431 UBool diffIsError) {
432 /*
433 * Use an arithmetic & not a logical && so that both branches
434 * are always taken and all differences are shown.
435 */
436 return
437 showAMinusB(a, b, a_name, b_name, diffIsError) &
438 showAMinusB(b, a, b_name, a_name, diffIsError);
439 }
440
441 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
TestLetterNumber()442 static void TestLetterNumber()
443 {
444 UChar i = 0x0000;
445
446 log_verbose("Testing for isalpha\n");
447 for (i = 0x0041; i < 0x005B; i++) {
448 if (!u_isalpha(i))
449 {
450 log_err("Failed isLetter test at %.4X\n", i);
451 }
452 }
453 for (i = 0x0660; i < 0x066A; i++) {
454 if (u_isalpha(i))
455 {
456 log_err("Failed isLetter test with numbers at %.4X\n", i);
457 }
458 }
459
460 log_verbose("Testing for isdigit\n");
461 for (i = 0x0660; i < 0x066A; i++) {
462 if (!u_isdigit(i))
463 {
464 log_verbose("Failed isNumber test at %.4X\n", i);
465 }
466 }
467
468 log_verbose("Testing for isalnum\n");
469 for (i = 0x0041; i < 0x005B; i++) {
470 if (!u_isalnum(i))
471 {
472 log_err("Failed isAlNum test at %.4X\n", i);
473 }
474 }
475 for (i = 0x0660; i < 0x066A; i++) {
476 if (!u_isalnum(i))
477 {
478 log_err("Failed isAlNum test at %.4X\n", i);
479 }
480 }
481
482 {
483 /*
484 * The following checks work only starting from Unicode 4.0.
485 * Check the version number here.
486 */
487 static UVersionInfo u401={ 4, 0, 1, 0 };
488 UVersionInfo version;
489 u_getUnicodeVersion(version);
490 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
491 return;
492 }
493 }
494
495 {
496 /*
497 * Sanity check:
498 * Verify that exactly the digit characters have decimal digit values.
499 * This assumption is used in the implementation of u_digit()
500 * (which checks nt=de)
501 * compared with the parallel java.lang.Character.digit()
502 * (which checks Nd).
503 *
504 * This was not true in Unicode 3.2 and earlier.
505 * Unicode 4.0 fixed discrepancies.
506 * Unicode 4.0.1 re-introduced problems in this area due to an
507 * unintentionally incomplete last-minute change.
508 */
509 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
510 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
511
512 USet *digits, *decimalValues;
513 UErrorCode errorCode;
514
515 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
516 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
517 errorCode=U_ZERO_ERROR;
518 digits=uset_openPattern(digitsPattern, 6, &errorCode);
519 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
520
521 if(U_SUCCESS(errorCode)) {
522 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
523 }
524
525 uset_close(digits);
526 uset_close(decimalValues);
527 }
528 }
529
/*
 * Calls propFn for each of the sampleCharsLength code points in sampleChars
 * and logs an error for each one whose result differs from expected.
 * propName is used only for the error message.
 */
static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
                                const UChar32 *sampleChars, int32_t sampleCharsLength,
                                UBool expected) {
    const UChar32 *sample=sampleChars;
    int32_t remaining;

    for (remaining = sampleCharsLength; remaining > 0; --remaining, ++sample) {
        UBool actual = propFn(*sample);
        if (actual == expected) {
            continue;
        }
        log_err("error: character property function %s(U+%04x)=%d is wrong\n",
                propName, *sample, actual);
    }
}
542
543 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
TestMisc()544 static void TestMisc()
545 {
546 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
547 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
548 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
549 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
550 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
551 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
552 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
553 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
554 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
555 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
556 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
557
558 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
559
560 uint32_t mask;
561
562 int32_t i;
563 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
564 UVersionInfo realVersion;
565
566 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
567
568 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
569 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
570
571 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
572 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
573 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
574 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
575
576 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
577 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
578 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
579 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
580
581 testSampleCharProps(u_isdefined, "u_isdefined",
582 sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
583 testSampleCharProps(u_isdefined, "u_isdefined",
584 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
585
586 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
587 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
588
589 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
590 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
591
592 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
593 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
594 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
595 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
596 }
597 }
598
599 /* Tests the ICU version #*/
600 u_getVersion(realVersion);
601 u_versionToString(realVersion, icuVersion);
602 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
603 {
604 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
605 }
606 #if defined(ICU_VERSION)
607 /* test only happens where we have configure.in with VERSION - sanity check. */
608 if(strcmp(U_ICU_VERSION, ICU_VERSION))
609 {
610 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
611 }
612 #endif
613
614 /* test U_GC_... */
615 if(
616 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
617 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
618 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
619 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
620 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
621 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
622 ) {
623 log_err("error: U_GET_GC_MASK does not work properly\n");
624 }
625
626 mask=0;
627 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
628
629 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
630 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
631 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
632 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
633 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
634
635 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
636 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
637 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
638
639 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
640 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
641 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
642
643 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
644 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
645 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
646
647 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
648 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
649 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
650 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
651
652 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
653 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
654 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
655 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
656 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
657
658 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
659 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
660 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
661 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
662
663 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
664 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
665
666 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
667 log_err("error: problems with U_GC_XX_MASK constants\n");
668 }
669
670 mask=0;
671 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
672 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
673 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
674 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
675 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
676 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
677 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
678
679 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
680 log_err("error: problems with U_GC_Y_MASK constants\n");
681 }
682 {
683 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
684 for(i=0; i<10; i++){
685 if(digit[i]!=u_forDigit(i,10)){
686 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
687 }
688 }
689 }
690
691 /* test u_digit() */
692 {
693 static const struct {
694 UChar32 c;
695 int8_t radix, value;
696 } data[]={
697 /* base 16 */
698 { 0x0031, 16, 1 },
699 { 0x0038, 16, 8 },
700 { 0x0043, 16, 12 },
701 { 0x0066, 16, 15 },
702 { 0x00e4, 16, -1 },
703 { 0x0662, 16, 2 },
704 { 0x06f5, 16, 5 },
705 { 0xff13, 16, 3 },
706 { 0xff41, 16, 10 },
707
708 /* base 8 */
709 { 0x0031, 8, 1 },
710 { 0x0038, 8, -1 },
711 { 0x0043, 8, -1 },
712 { 0x0066, 8, -1 },
713 { 0x00e4, 8, -1 },
714 { 0x0662, 8, 2 },
715 { 0x06f5, 8, 5 },
716 { 0xff13, 8, 3 },
717 { 0xff41, 8, -1 },
718
719 /* base 36 */
720 { 0x5a, 36, 35 },
721 { 0x7a, 36, 35 },
722 { 0xff3a, 36, 35 },
723 { 0xff5a, 36, 35 },
724
725 /* wrong radix values */
726 { 0x0031, 1, -1 },
727 { 0xff3a, 37, -1 }
728 };
729
730 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
731 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
732 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
733 data[i].c,
734 data[i].radix,
735 u_digit(data[i].c, data[i].radix),
736 data[i].value);
737 }
738 }
739 }
740 }
741
742 /* test C/POSIX-style functions --------------------------------------------- */
743
/*
 * bit flags, one per C/POSIX-style classification function,
 * in the same order as the entries of posixClasses[]
 */
#define ISAL 0x001  /* isalpha */
#define ISLO 0x002  /* islower */
#define ISUP 0x004  /* isupper */

#define ISDI 0x008  /* isdigit */
#define ISXD 0x010  /* isxdigit */

#define ISAN 0x020  /* isalnum */

#define ISPU 0x040  /* ispunct */
#define ISGR 0x080  /* isgraph */
#define ISPR 0x100  /* isprint */

#define ISSP 0x200  /* isspace */
#define ISBL 0x400  /* isblank */
#define ISCN 0x800  /* iscntrl */
761
762 /* C/POSIX-style functions, in the same order as the bit flags */
763 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
764
765 static const struct {
766 IsPOSIXClass *fn;
767 const char *name;
768 } posixClasses[]={
769 { u_isalpha, "isalpha" },
770 { u_islower, "islower" },
771 { u_isupper, "isupper" },
772 { u_isdigit, "isdigit" },
773 { u_isxdigit, "isxdigit" },
774 { u_isalnum, "isalnum" },
775 { u_ispunct, "ispunct" },
776 { u_isgraph, "isgraph" },
777 { u_isprint, "isprint" },
778 { u_isspace, "isspace" },
779 { u_isblank, "isblank" },
780 { u_iscntrl, "iscntrl" }
781 };
782
783 static const struct {
784 UChar32 c;
785 uint32_t posixResults;
786 } posixData[]={
787 { 0x0008, ISCN }, /* backspace */
788 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
789 { 0x000a, ISSP| ISCN }, /* LF */
790 { 0x000c, ISSP| ISCN }, /* FF */
791 { 0x000d, ISSP| ISCN }, /* CR */
792 { 0x0020, ISPR|ISSP|ISBL }, /* space */
793 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
794 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
795 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
796 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
797 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
798 { 0x007b, ISPU|ISGR|ISPR }, /* { */
799 { 0x0085, ISSP| ISCN }, /* NEL */
800 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
801 { 0x00a4, ISGR|ISPR }, /* currency sign */
802 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
803 { 0x0300, ISGR|ISPR }, /* combining grave */
804 { 0x0600, ISCN }, /* arabic number sign */
805 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
806 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
807 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
808 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
809 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
810 { 0x200b, ISCN }, /* ZWSP */
811 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
812 { 0x200e, ISCN }, /* LRM */
813 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
814 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
815 { 0x20ac, ISGR|ISPR }, /* Euro */
816 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
817 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
818 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
819 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
820 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
821 };
822
823 static void
TestPOSIX()824 TestPOSIX() {
825 uint32_t mask;
826 int32_t cl, i;
827 UBool expect;
828
829 mask=1;
830 for(cl=0; cl<12; ++cl) {
831 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
832 expect=(UBool)((posixData[i].posixResults&mask)!=0);
833 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
834 log_err("u_%s(U+%04x)=%s is wrong\n",
835 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
836 }
837 }
838 mask<<=1;
839 }
840 }
841
842 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
TestControlPrint()843 static void TestControlPrint()
844 {
845 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
846 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
847 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
848 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
849 UChar32 c;
850
851 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
852 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
853
854 testSampleCharProps(u_isprint, "u_isprint",
855 samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
856 testSampleCharProps(u_isprint, "u_isprint",
857 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
858
859 /* test all ISO 8 controls */
860 for(c=0; c<=0x9f; ++c) {
861 if(c==0x20) {
862 /* skip ASCII graphic characters and continue with DEL */
863 c=0x7f;
864 }
865 if(!u_iscntrl(c)) {
866 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
867 }
868 if(!u_isISOControl(c)) {
869 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
870 }
871 if(u_isprint(c)) {
872 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
873 }
874 }
875
876 /* test all Latin-1 graphic characters */
877 for(c=0x20; c<=0xff; ++c) {
878 if(c==0x7f) {
879 c=0xa0;
880 } else if(c==0xad) {
881 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
882 ++c;
883 }
884 if(!u_isprint(c)) {
885 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
886 }
887 }
888 }
889
890 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
TestIdentifier()891 static void TestIdentifier()
892 {
893 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
894 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
895 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
896 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
897 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
898 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
899 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
900 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
901 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
902 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
903
904 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
905 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
906 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
907 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
908
909 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
910 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
911 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
912 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
913
914 /* IDPart should imply IDStart */
915 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
916 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
917
918 testSampleCharProps(u_isIDStart, "u_isIDStart",
919 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
920 testSampleCharProps(u_isIDStart, "u_isIDStart",
921 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
922
923 testSampleCharProps(u_isIDPart, "u_isIDPart",
924 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
925 testSampleCharProps(u_isIDPart, "u_isIDPart",
926 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
927
928 /* IDPart should imply IDStart */
929 testSampleCharProps(u_isIDPart, "u_isIDPart",
930 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
931
932 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
933 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
934 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
935 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
936 }
937
938 /* for each line of UnicodeData.txt, check some of the properties */
939 typedef struct UnicodeDataContext {
940 #if UCONFIG_NO_NORMALIZATION
941 const void *dummy;
942 #else
943 const UNormalizer2 *nfc;
944 const UNormalizer2 *nfkc;
945 #endif
946 } UnicodeDataContext;
947
948 /*
949 * ### TODO
950 * This test fails incorrectly if the First or Last code point of a repetitive area
951 * is overridden, which is allowed and is encouraged for the PUAs.
952 * Currently, this means that both area First/Last and override lines are
953 * tested against the properties from the API,
954 * and the area boundary will not match and cause an error.
955 *
956 * This function should detect area boundaries and skip them for the test of individual
957 * code points' properties.
958 * Then it should check that the areas contain all the same properties except where overridden.
 * To do that, it would need to set a flag for each code point that is listed explicitly.
960 */
961 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)962 unicodeDataLineFn(void *context,
963 char *fields[][2], int32_t fieldCount,
964 UErrorCode *pErrorCode)
965 {
966 char buffer[100];
967 const char *d;
968 char *end;
969 uint32_t value;
970 UChar32 c;
971 int32_t i;
972 int8_t type;
973 int32_t dt;
974 UChar dm[32], s[32];
975 int32_t dmLength, length;
976
977 #if !UCONFIG_NO_NORMALIZATION
978 const UNormalizer2 *nfc, *nfkc;
979 #endif
980
981 /* get the character code, field 0 */
982 c=strtoul(fields[0][0], &end, 16);
983 if(end<=fields[0][0] || end!=fields[0][1]) {
984 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
985 return;
986 }
987 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
988 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
989 return;
990 }
991
992 /* get general category, field 2 */
993 *fields[2][1]=0;
994 type = (int8_t)tagValues[MakeProp(fields[2][0])];
995 if(u_charType(c)!=type) {
996 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
997 }
998 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
999 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1000 }
1001
1002 /* get canonical combining class, field 3 */
1003 value=strtoul(fields[3][0], &end, 10);
1004 if(end<=fields[3][0] || end!=fields[3][1]) {
1005 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1006 return;
1007 }
1008 if(value>255) {
1009 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1010 return;
1011 }
1012 #if !UCONFIG_NO_NORMALIZATION
1013 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1014 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1015 }
1016 nfkc=((UnicodeDataContext *)context)->nfkc;
1017 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1018 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1019 }
1020 #endif
1021
1022 /* get BiDi category, field 4 */
1023 *fields[4][1]=0;
1024 i=MakeDir(fields[4][0]);
1025 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1026 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1027 }
1028
1029 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1030 d=NULL;
1031 if(fields[5][0]==fields[5][1]) {
1032 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1033 if(c==0xac00 || c==0xd7a3) {
1034 dt=U_DT_CANONICAL;
1035 } else {
1036 dt=U_DT_NONE;
1037 }
1038 } else {
1039 d=fields[5][0];
1040 *fields[5][1]=0;
1041 dt=UCHAR_INVALID_CODE;
1042 if(*d=='<') {
1043 end=strchr(++d, '>');
1044 if(end!=NULL) {
1045 *end=0;
1046 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1047 d=u_skipWhitespace(end+1);
1048 }
1049 } else {
1050 dt=U_DT_CANONICAL;
1051 }
1052 }
1053 if(dt>U_DT_NONE) {
1054 if(c==0xac00) {
1055 dm[0]=0x1100;
1056 dm[1]=0x1161;
1057 dm[2]=0;
1058 dmLength=2;
1059 } else if(c==0xd7a3) {
1060 dm[0]=0xd788;
1061 dm[1]=0x11c2;
1062 dm[2]=0;
1063 dmLength=2;
1064 } else {
1065 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1066 }
1067 } else {
1068 dmLength=-1;
1069 }
1070 if(dt<0 || U_FAILURE(*pErrorCode)) {
1071 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1072 return;
1073 }
1074 #if !UCONFIG_NO_NORMALIZATION
1075 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1076 if(i!=dt) {
1077 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1078 }
1079 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1080 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1081 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1082 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1083 "or the Decomposition_Mapping is different (%s)\n",
1084 c, length, dmLength, u_errorName(*pErrorCode));
1085 return;
1086 }
1087 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1088 if(dt!=U_DT_CANONICAL) {
1089 dmLength=-1;
1090 }
1091 nfc=((UnicodeDataContext *)context)->nfc;
1092 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1093 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1094 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1095 "or the Decomposition_Mapping is different (%s)\n",
1096 c, length, dmLength, u_errorName(*pErrorCode));
1097 return;
1098 }
1099 /* recompose */
1100 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1101 UChar32 a, b, composite;
1102 i=0;
1103 U16_NEXT(dm, i, dmLength, a);
1104 U16_NEXT(dm, i, dmLength, b);
1105 /* i==dmLength */
1106 composite=unorm2_composePair(nfc, a, b);
1107 if(composite!=c) {
1108 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1109 (long)c, (long)a, (long)b, (long)composite);
1110 }
1111 /*
1112 * Note: NFKC has fewer round-trip mappings than NFC,
1113 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1114 */
1115 }
1116 #endif
1117
1118 /* get ISO Comment, field 11 */
1119 *fields[11][1]=0;
1120 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1121 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1122 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1123 c, u_errorName(*pErrorCode),
1124 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1125 fields[11][0]);
1126 }
1127
1128 /* get uppercase mapping, field 12 */
1129 if(fields[12][0]!=fields[12][1]) {
1130 value=strtoul(fields[12][0], &end, 16);
1131 if(end!=fields[12][1]) {
1132 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1133 return;
1134 }
1135 if((UChar32)value!=u_toupper(c)) {
1136 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1137 }
1138 } else {
1139 /* no case mapping: the API must map the code point to itself */
1140 if(c!=u_toupper(c)) {
1141 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1142 }
1143 }
1144
1145 /* get lowercase mapping, field 13 */
1146 if(fields[13][0]!=fields[13][1]) {
1147 value=strtoul(fields[13][0], &end, 16);
1148 if(end!=fields[13][1]) {
1149 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1150 return;
1151 }
1152 if((UChar32)value!=u_tolower(c)) {
1153 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1154 }
1155 } else {
1156 /* no case mapping: the API must map the code point to itself */
1157 if(c!=u_tolower(c)) {
1158 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1159 }
1160 }
1161
1162 /* get titlecase mapping, field 14 */
1163 if(fields[14][0]!=fields[14][1]) {
1164 value=strtoul(fields[14][0], &end, 16);
1165 if(end!=fields[14][1]) {
1166 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1167 return;
1168 }
1169 if((UChar32)value!=u_totitle(c)) {
1170 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1171 }
1172 } else {
1173 /* no case mapping: the API must map the code point to itself */
1174 if(c!=u_totitle(c)) {
1175 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1176 }
1177 }
1178 }
1179
1180 static UBool U_CALLCONV
enumTypeRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1181 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1182 static const UChar32 test[][2]={
1183 {0x41, U_UPPERCASE_LETTER},
1184 {0x308, U_NON_SPACING_MARK},
1185 {0xfffe, U_GENERAL_OTHER_TYPES},
1186 {0xe0041, U_FORMAT_CHAR},
1187 {0xeffff, U_UNASSIGNED}
1188 };
1189
1190 int32_t i, count;
1191
1192 if(0!=strcmp((const char *)context, "a1")) {
1193 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1194 return FALSE;
1195 }
1196
1197 count=UPRV_LENGTHOF(test);
1198 for(i=0; i<count; ++i) {
1199 if(start<=test[i][0] && test[i][0]<limit) {
1200 if(type!=(UCharCategory)test[i][1]) {
1201 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1202 start, limit, (long)type, test[i][0], test[i][1]);
1203 }
1204 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1205 return i==(count-1) ? FALSE : TRUE;
1206 }
1207 }
1208
1209 if(start>test[count-1][0]) {
1210 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1211 start, limit, (long)type);
1212 return FALSE;
1213 }
1214
1215 return TRUE;
1216 }
1217
1218 static UBool U_CALLCONV
enumDefaultsRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1219 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1220 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1221 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1222 { 0x0590, U_LEFT_TO_RIGHT },
1223 { 0x0600, U_RIGHT_TO_LEFT },
1224 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1225 { 0x0860, U_RIGHT_TO_LEFT },
1226 { 0x0870, U_RIGHT_TO_LEFT_ARABIC }, // Unicode 10 changes U+0860..U+086F from R to AL.
1227 { 0x08A0, U_RIGHT_TO_LEFT },
1228 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1229 { 0x20A0, U_LEFT_TO_RIGHT },
1230 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1231 { 0xFB1D, U_LEFT_TO_RIGHT },
1232 { 0xFB50, U_RIGHT_TO_LEFT },
1233 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1234 { 0xFE70, U_LEFT_TO_RIGHT },
1235 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1236
1237 { 0x10800, U_LEFT_TO_RIGHT },
1238 { 0x10D00, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10D00..U+10D3F from R to AL.
1239 { 0x10D40, U_RIGHT_TO_LEFT_ARABIC },
1240 { 0x10F30, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10F30..U+10F6F from R to AL.
1241 { 0x10F70, U_RIGHT_TO_LEFT_ARABIC },
1242 { 0x11000, U_RIGHT_TO_LEFT },
1243
1244 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1245 { 0x1EC70, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+1EC70..U+1ECBF from R to AL.
1246 { 0x1ECC0, U_RIGHT_TO_LEFT_ARABIC },
1247 { 0x1EE00, U_RIGHT_TO_LEFT },
1248 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1249 { 0x1F000, U_RIGHT_TO_LEFT },
1250 { 0x110000, U_LEFT_TO_RIGHT }
1251 };
1252
1253 UChar32 c;
1254 int32_t i;
1255 UCharDirection shouldBeDir;
1256
1257 /*
1258 * LineBreak.txt specifies:
1259 * # - Assigned characters that are not listed explicitly are given the value
1260 * # "AL".
1261 * # - Unassigned characters are given the value "XX".
1262 *
1263 * PUA characters are listed explicitly with "XX".
1264 * Verify that no assigned character has "XX".
1265 */
1266 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1267 c=start;
1268 while(c<limit) {
1269 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1270 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1271 }
1272 ++c;
1273 }
1274 }
1275
1276 /*
1277 * Verify default Bidi classes.
1278 * See DerivedBidiClass.txt, especially for unassigned code points.
1279 */
1280 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1281 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1282 c=start;
1283 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1284 if((int32_t)c<defaultBidi[i][0]) {
1285 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1286 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1287 shouldBeDir=U_BOUNDARY_NEUTRAL;
1288 } else {
1289 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1290 }
1291
1292 if( u_charDirection(c)!=shouldBeDir ||
1293 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1294 ) {
1295 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1296 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1297 }
1298 ++c;
1299 }
1300 }
1301 }
1302 }
1303
1304 return TRUE;
1305 }
1306
1307 /* tests for several properties */
TestUnicodeData()1308 static void TestUnicodeData()
1309 {
1310 UVersionInfo expectVersionArray;
1311 UVersionInfo versionArray;
1312 char *fields[15][2];
1313 UErrorCode errorCode;
1314 UChar32 c;
1315 int8_t type;
1316
1317 UnicodeDataContext context;
1318
1319 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1320 u_getUnicodeVersion(versionArray);
1321 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1322 {
1323 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1324 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1325 }
1326
1327 #if defined(ICU_UNICODE_VERSION)
1328 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1329 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1330 {
1331 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1332 }
1333 #endif
1334
1335 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1336 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1337 }
1338
1339 errorCode=U_ZERO_ERROR;
1340 #if !UCONFIG_NO_NORMALIZATION
1341 context.nfc=unorm2_getNFCInstance(&errorCode);
1342 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1343 if(U_FAILURE(errorCode)) {
1344 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1345 return;
1346 }
1347 #endif
1348 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1349 if(U_FAILURE(errorCode)) {
1350 return; /* if we couldn't parse UnicodeData.txt, we should return */
1351 }
1352
1353 /* sanity check on repeated properties */
1354 for(c=0xfffe; c<=0x10ffff;) {
1355 type=u_charType(c);
1356 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1357 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1358 }
1359 if(type!=U_UNASSIGNED) {
1360 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1361 }
1362 if((c&0xffff)==0xfffe) {
1363 ++c;
1364 } else {
1365 c+=0xffff;
1366 }
1367 }
1368
1369 /* test that PUA is not "unassigned" */
1370 for(c=0xe000; c<=0x10fffd;) {
1371 type=u_charType(c);
1372 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1373 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1374 }
1375 if(type==U_UNASSIGNED) {
1376 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1377 } else if(type!=U_PRIVATE_USE_CHAR) {
1378 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1379 }
1380 if(c==0xf8ff) {
1381 c=0xf0000;
1382 } else if(c==0xffffd) {
1383 c=0x100000;
1384 } else {
1385 ++c;
1386 }
1387 }
1388
1389 /* test u_enumCharTypes() */
1390 u_enumCharTypes(enumTypeRange, "a1");
1391
1392 /* check default properties */
1393 u_enumCharTypes(enumDefaultsRange, NULL);
1394 }
1395
TestCodeUnit()1396 static void TestCodeUnit(){
1397 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1398
1399 int32_t i;
1400
1401 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1402 UChar c=codeunit[i];
1403 if(i<4){
1404 if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1405 U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1406 log_err("ERROR: U+%04x is a single", c);
1407 }
1408
1409 }
1410 if(i >= 4 && i< 8){
1411 if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1412 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1413 log_err("ERROR: U+%04x is a first surrogate", c);
1414 }
1415 }
1416 if(i >= 8 && i< 12){
1417 if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1418 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1419 log_err("ERROR: U+%04x is a second surrogate", c);
1420 }
1421 }
1422 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1423 if(i<4){
1424 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1425 log_err("ERROR: U+%04x is a single", c);
1426 }
1427
1428 }
1429 if(i >= 4 && i< 8){
1430 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1431 log_err("ERROR: U+%04x is a first surrogate", c);
1432 }
1433 }
1434 if(i >= 8 && i< 12){
1435 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1436 log_err("ERROR: U+%04x is a second surrogate", c);
1437 }
1438 }
1439 #endif
1440 }
1441 }
1442
TestCodePoint()1443 static void TestCodePoint(){
1444 const UChar32 codePoint[]={
1445 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1446 0xd800,
1447 0xdbff,
1448 0xdc00,
1449 0xdfff,
1450 0xdc04,
1451 0xd821,
1452 /*not a surrogate, valid, isUnicodeChar , not Error*/
1453 0x20ac,
1454 0xd7ff,
1455 0xe000,
1456 0xe123,
1457 0x0061,
1458 0xe065,
1459 0x20402,
1460 0x24506,
1461 0x23456,
1462 0x20402,
1463 0x10402,
1464 0x23456,
1465 /*not a surrogate, not valid, isUnicodeChar, isError */
1466 0x0015,
1467 0x009f,
1468 /*not a surrogate, not valid, not isUnicodeChar, isError */
1469 0xffff,
1470 0xfffe,
1471 };
1472 int32_t i;
1473 for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
1474 UChar32 c=codePoint[i];
1475 if(i<6) {
1476 if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1477 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1478 }
1479 if(U_IS_UNICODE_CHAR(c)) {
1480 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1481 }
1482 } else if(i >=6 && i<18) {
1483 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1484 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1485 }
1486 if(!U_IS_UNICODE_CHAR(c)) {
1487 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1488 }
1489 } else if(i >=18 && i<20) {
1490 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1491 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1492 }
1493 if(!U_IS_UNICODE_CHAR(c)) {
1494 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1495 }
1496 } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1497 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1498 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1499 }
1500 if(U_IS_UNICODE_CHAR(c)) {
1501 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1502 }
1503 }
1504 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1505 if(i<6){
1506 if(!UTF_IS_SURROGATE(c)){
1507 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1508 }
1509 if(UTF_IS_VALID(c)){
1510 log_err("ERROR: isValid() failed for U+%04x\n", c);
1511 }
1512 if(UTF_IS_UNICODE_CHAR(c)){
1513 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1514 }
1515 if(UTF_IS_ERROR(c)){
1516 log_err("ERROR: isError() failed for U+%04x\n", c);
1517 }
1518 }else if(i >=6 && i<18){
1519 if(UTF_IS_SURROGATE(c)){
1520 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1521 }
1522 if(!UTF_IS_VALID(c)){
1523 log_err("ERROR: isValid() failed for U+%04x\n", c);
1524 }
1525 if(!UTF_IS_UNICODE_CHAR(c)){
1526 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1527 }
1528 if(UTF_IS_ERROR(c)){
1529 log_err("ERROR: isError() failed for U+%04x\n", c);
1530 }
1531 }else if(i >=18 && i<20){
1532 if(UTF_IS_SURROGATE(c)){
1533 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1534 }
1535 if(UTF_IS_VALID(c)){
1536 log_err("ERROR: isValid() failed for U+%04x\n", c);
1537 }
1538 if(!UTF_IS_UNICODE_CHAR(c)){
1539 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1540 }
1541 if(!UTF_IS_ERROR(c)){
1542 log_err("ERROR: isError() failed for U+%04x\n", c);
1543 }
1544 }
1545 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1546 if(UTF_IS_SURROGATE(c)){
1547 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1548 }
1549 if(UTF_IS_VALID(c)){
1550 log_err("ERROR: isValid() failed for U+%04x\n", c);
1551 }
1552 if(UTF_IS_UNICODE_CHAR(c)){
1553 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1554 }
1555 if(!UTF_IS_ERROR(c)){
1556 log_err("ERROR: isError() failed for U+%04x\n", c);
1557 }
1558 }
1559 #endif
1560 }
1561
1562 if(
1563 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1564 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1565 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1566 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1567 ) {
1568 log_err("error with U_IS_BMP()\n");
1569 }
1570
1571 if(
1572 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1573 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1574 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1575 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1576 ) {
1577 log_err("error with U_IS_SUPPLEMENTARY()\n");
1578 }
1579 }
1580
TestCharLength()1581 static void TestCharLength()
1582 {
1583 const int32_t codepoint[]={
1584 1, 0x0061,
1585 1, 0xe065,
1586 1, 0x20ac,
1587 2, 0x20402,
1588 2, 0x23456,
1589 2, 0x24506,
1590 2, 0x20402,
1591 2, 0x10402,
1592 1, 0xd7ff,
1593 1, 0xe000
1594 };
1595
1596 int32_t i;
1597 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1598 UBool multiple;
1599 #endif
1600 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1601 UChar32 c=codepoint[i+1];
1602 if(
1603 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1604 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1605 #endif
1606 U16_LENGTH(c) != codepoint[i]) {
1607 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1608 }
1609 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1610 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1611 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1612 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1613 }
1614 #endif
1615 }
1616 }
1617
1618 /*internal functions ----*/
MakeProp(char * str)1619 static int32_t MakeProp(char* str)
1620 {
1621 int32_t result = 0;
1622 char* matchPosition =0;
1623
1624 matchPosition = strstr(tagStrings, str);
1625 if (matchPosition == 0)
1626 {
1627 log_err("unrecognized type letter ");
1628 log_err(str);
1629 }
1630 else
1631 result = (int32_t)((matchPosition - tagStrings) / 2);
1632 return result;
1633 }
1634
MakeDir(char * str)1635 static int32_t MakeDir(char* str)
1636 {
1637 int32_t pos = 0;
1638 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1639 if (strcmp(str, dirStrings[pos]) == 0) {
1640 return pos;
1641 }
1642 }
1643 return -1;
1644 }
1645
1646 /* test u_charName() -------------------------------------------------------- */
1647
/*
 * Sample code points with the character names expected for each name choice.
 * An empty string means that no name of that kind is expected;
 * alias is NULL where the entry has no name alias.
 */
static const struct {
    uint32_t code;          /* code point */
    const char *name;       /* modern Unicode character name */
    const char *oldName;    /* Unicode 1.0 name */
    const char *extName;    /* extended name */
    const char *alias;      /* name alias, or NULL */
} names[]={
    {0x0061, "LATIN SMALL LETTER A", "",
             "LATIN SMALL LETTER A",
             NULL},
    {0x01a2, "LATIN CAPITAL LETTER OI", "",
             "LATIN CAPITAL LETTER OI",
             "LATIN CAPITAL LETTER GHA"},
    {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
             "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
             NULL},
    {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
             "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
             "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
    {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "",
             "CJK UNIFIED IDEOGRAPH-3401",
             NULL},
    {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "",
             "CJK UNIFIED IDEOGRAPH-7FED",
             NULL},
    {0xac00, "HANGUL SYLLABLE GA", "",
             "HANGUL SYLLABLE GA",
             NULL},
    {0xd7a3, "HANGUL SYLLABLE HIH", "",
             "HANGUL SYLLABLE HIH",
             NULL},
    {0xd800, "", "",
             "<lead surrogate-D800>",
             NULL},
    {0xdc00, "", "",
             "<trail surrogate-DC00>",
             NULL},
    {0xff08, "FULLWIDTH LEFT PARENTHESIS", "",
             "FULLWIDTH LEFT PARENTHESIS",
             NULL},
    {0xffe5, "FULLWIDTH YEN SIGN", "",
             "FULLWIDTH YEN SIGN",
             NULL},
    {0xffff, "", "",
             "<noncharacter-FFFF>",
             NULL},
    {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
              "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
              "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
    {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "",
              "CJK UNIFIED IDEOGRAPH-23456",
              NULL}
};
1675
1676 static UBool
enumCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1677 enumCharNamesFn(void *context,
1678 UChar32 code, UCharNameChoice nameChoice,
1679 const char *name, int32_t length) {
1680 int32_t *pCount=(int32_t *)context;
1681 const char *expected;
1682 int i;
1683
1684 if(length<=0 || length!=(int32_t)strlen(name)) {
1685 /* should not be called with an empty string or invalid length */
1686 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1687 return TRUE;
1688 }
1689
1690 ++*pCount;
1691 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1692 if(code==(UChar32)names[i].code) {
1693 switch (nameChoice) {
1694 case U_EXTENDED_CHAR_NAME:
1695 if(0!=strcmp(name, names[i].extName)) {
1696 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1697 }
1698 break;
1699 case U_UNICODE_CHAR_NAME:
1700 if(0!=strcmp(name, names[i].name)) {
1701 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1702 }
1703 break;
1704 case U_UNICODE_10_CHAR_NAME:
1705 expected=names[i].oldName;
1706 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1707 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1708 }
1709 break;
1710 case U_CHAR_NAME_ALIAS:
1711 expected=names[i].alias;
1712 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1713 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1714 }
1715 break;
1716 case U_CHAR_NAME_CHOICE_COUNT:
1717 break;
1718 }
1719 break;
1720 }
1721 }
1722 return TRUE;
1723 }
1724
/* State kept across calls of enumExtCharNamesFn(). */
struct enumExtCharNamesContext {
    uint32_t length;    /* name counter; its address is passed on to enumCharNamesFn(), which increments it */
    int32_t last;       /* code point of the previous callback; the next one is expected to be last+1 */
};
1729
1730 static UBool
enumExtCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1731 enumExtCharNamesFn(void *context,
1732 UChar32 code, UCharNameChoice nameChoice,
1733 const char *name, int32_t length) {
1734 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1735
1736 if (ecncp->last != (int32_t) code - 1) {
1737 if (ecncp->last < 0) {
1738 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1739 } else {
1740 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1741 }
1742 }
1743 ecncp->last = (int32_t) code;
1744
1745 if (!*name) {
1746 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1747 }
1748
1749 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1750 }
1751
1752 /**
1753 * This can be made more efficient by moving it into putil.c and having
1754 * it directly access the ebcdic translation tables.
1755 * TODO: If we get this method in putil.c, then delete it from here.
1756 */
1757 static UChar
u_charToUChar(char c)1758 u_charToUChar(char c) {
1759 UChar uc;
1760 u_charsToUChars(&c, &uc, 1);
1761 return uc;
1762 }
1763
1764 static void
TestCharNames()1765 TestCharNames() {
1766 static char name[80];
1767 UErrorCode errorCode=U_ZERO_ERROR;
1768 struct enumExtCharNamesContext extContext;
1769 const char *expected;
1770 int32_t length;
1771 UChar32 c;
1772 int32_t i;
1773
1774 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1775 length=uprv_getMaxCharNameLength();
1776 if(length==0) {
1777 /* no names data available */
1778 return;
1779 }
1780 if(length<83) { /* Unicode 3.2 max char name length */
1781 log_err("uprv_getMaxCharNameLength()=%d is too short");
1782 }
1783 /* ### TODO same tests for max ISO comment length as for max name length */
1784
1785 log_verbose("Testing u_charName()\n");
1786 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1787 /* modern Unicode character name */
1788 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1789 if(U_FAILURE(errorCode)) {
1790 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1791 return;
1792 }
1793 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1794 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1795 }
1796
1797 /* find the modern name */
1798 if (*names[i].name) {
1799 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1800 if(U_FAILURE(errorCode)) {
1801 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1802 return;
1803 }
1804 if(c!=(UChar32)names[i].code) {
1805 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1806 }
1807 }
1808
1809 /* Unicode 1.0 character name */
1810 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1811 if(U_FAILURE(errorCode)) {
1812 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1813 return;
1814 }
1815 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1816 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1817 }
1818
1819 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1820 if(names[i].oldName[0]!=0 /* && length>0 */) {
1821 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1822 if(U_FAILURE(errorCode)) {
1823 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1824 return;
1825 }
1826 if(c!=(UChar32)names[i].code) {
1827 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1828 }
1829 }
1830
1831 /* Unicode character name alias */
1832 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1833 if(U_FAILURE(errorCode)) {
1834 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1835 return;
1836 }
1837 expected=names[i].alias;
1838 if(expected==NULL) {
1839 expected="";
1840 }
1841 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1842 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1843 names[i].code, name, length, expected);
1844 }
1845
1846 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1847 if(expected[0]!=0 /* && length>0 */) {
1848 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1849 if(U_FAILURE(errorCode)) {
1850 log_err("u_charFromName(%s - alias) error %s\n",
1851 expected, u_errorName(errorCode));
1852 return;
1853 }
1854 if(c!=(UChar32)names[i].code) {
1855 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1856 expected, c, names[i].code);
1857 }
1858 }
1859 }
1860
1861 /* test u_enumCharNames() */
1862 length=0;
1863 errorCode=U_ZERO_ERROR;
1864 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1865 if(U_FAILURE(errorCode) || length<94140) {
1866 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1867 }
1868
1869 extContext.length = 0;
1870 extContext.last = -1;
1871 errorCode=U_ZERO_ERROR;
1872 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1873 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1874 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1875 }
1876
1877 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1878 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1879 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1880 }
1881
1882 /* Test getCharNameCharacters */
1883 if(!getTestOption(QUICK_OPTION)) {
1884 enum { BUFSIZE = 256 };
1885 UErrorCode ec = U_ZERO_ERROR;
1886 char buf[BUFSIZE];
1887 int32_t maxLength;
1888 UChar32 cp;
1889 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1890 int32_t l1, l2;
1891 UBool map[256];
1892 UBool ok;
1893
1894 USet* set = uset_open(1, 0); /* empty set */
1895 USet* dumb = uset_open(1, 0); /* empty set */
1896
1897 /*
1898 * uprv_getCharNameCharacters() will likely return more lowercase
1899 * letters than actual character names contain because
1900 * it includes all the characters in lowercased names of
1901 * general categories, for the full possible set of extended names.
1902 */
1903 {
1904 USetAdder sa={
1905 NULL,
1906 uset_add,
1907 uset_addRange,
1908 uset_addString,
1909 NULL /* don't need remove() */
1910 };
1911 sa.set=set;
1912 uprv_getCharNameCharacters(&sa);
1913 }
1914
1915 /* build set the dumb (but sure-fire) way */
1916 for (i=0; i<256; ++i) {
1917 map[i] = FALSE;
1918 }
1919
1920 maxLength=0;
1921 for (cp=0; cp<0x110000; ++cp) {
1922 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1923 buf, BUFSIZE, &ec);
1924 if (U_FAILURE(ec)) {
1925 log_err("FAIL: u_charName failed when it shouldn't\n");
1926 uset_close(set);
1927 uset_close(dumb);
1928 return;
1929 }
1930 if(len>maxLength) {
1931 maxLength=len;
1932 }
1933
1934 for (i=0; i<len; ++i) {
1935 if (!map[(uint8_t) buf[i]]) {
1936 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1937 map[(uint8_t) buf[i]] = TRUE;
1938 }
1939 }
1940
1941 /* test for leading/trailing whitespace */
1942 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1943 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1944 }
1945 }
1946
1947 if(map[(uint8_t)'\t']) {
1948 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1949 }
1950
1951 length=uprv_getMaxCharNameLength();
1952 if(length!=maxLength) {
1953 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1954 length, maxLength);
1955 }
1956
1957 /* compare the sets. Where is my uset_equals?!! */
1958 ok=TRUE;
1959 for(i=0; i<256; ++i) {
1960 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1961 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1962 /* ignore lowercase a-z that are in set but not in dumb */
1963 ok=TRUE;
1964 } else {
1965 ok=FALSE;
1966 break;
1967 }
1968 }
1969 }
1970
1971 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1972 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1973 if (U_FAILURE(ec)) {
1974 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1975 uset_close(set);
1976 uset_close(dumb);
1977 return;
1978 }
1979
1980 if (l1 >= BUFSIZE) {
1981 l1 = BUFSIZE-1;
1982 pat[l1] = 0;
1983 }
1984 if (l2 >= BUFSIZE) {
1985 l2 = BUFSIZE-1;
1986 dumbPat[l2] = 0;
1987 }
1988
1989 if (!ok) {
1990 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1991 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1992 } else if(getTestOption(VERBOSITY_OPTION)) {
1993 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1994 }
1995
1996 uset_close(set);
1997 uset_close(dumb);
1998 }
1999
2000 /* ### TODO: test error cases and other interesting things */
2001 }
2002
2003 static void
TestUCharFromNameUnderflow()2004 TestUCharFromNameUnderflow() {
2005 // Ticket #10889: Underflow crash when there is no dash.
2006 UErrorCode errorCode=U_ZERO_ERROR;
2007 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<NO BREAK SPACE>", &errorCode);
2008 if(U_SUCCESS(errorCode)) {
2009 log_err("u_charFromName(<NO BREAK SPACE>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2010 }
2011
2012 // Test related edge cases.
2013 errorCode=U_ZERO_ERROR;
2014 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<-00a0>", &errorCode);
2015 if(U_SUCCESS(errorCode)) {
2016 log_err("u_charFromName(<-00a0>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2017 }
2018
2019 errorCode=U_ZERO_ERROR;
2020 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control->", &errorCode);
2021 if(U_SUCCESS(errorCode)) {
2022 log_err("u_charFromName(<control->) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2023 }
2024
2025 errorCode=U_ZERO_ERROR;
2026 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control-111111>", &errorCode);
2027 if(U_SUCCESS(errorCode)) {
2028 log_err("u_charFromName(<control-111111>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
2029 }
2030 }
2031
2032 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
2033
2034 static void
TestMirroring()2035 TestMirroring() {
2036 USet *set;
2037 UErrorCode errorCode;
2038
2039 UChar32 start, end, c2, c3;
2040 int32_t i;
2041
2042 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2043
2044 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2045
2046 log_verbose("Testing u_isMirrored()\n");
2047 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2048 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2049 )
2050 ) {
2051 log_err("u_isMirrored() does not work correctly\n");
2052 }
2053
2054 log_verbose("Testing u_charMirror()\n");
2055 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
2056 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
2057 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2058 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2059 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2060 )
2061 ) {
2062 log_err("u_charMirror() does not work correctly\n");
2063 }
2064
2065 /* verify that Bidi_Mirroring_Glyph roundtrips */
2066 errorCode=U_ZERO_ERROR;
2067 set=uset_openPattern(mirroredPattern, 17, &errorCode);
2068
2069 if (U_FAILURE(errorCode)) {
2070 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2071 } else {
2072 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2073 do {
2074 c2=u_charMirror(start);
2075 c3=u_charMirror(c2);
2076 if(c3!=start) {
2077 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2078 }
2079 c3=u_getBidiPairedBracket(start);
2080 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2081 if(c3!=start) {
2082 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2083 (long)start);
2084 }
2085 } else {
2086 if(c3!=c2) {
2087 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2088 (long)start, (long)c2);
2089 }
2090 }
2091 } while(++start<=end);
2092 }
2093 }
2094
2095 uset_close(set);
2096 }
2097
2098
2099 struct RunTestData
2100 {
2101 const char *runText;
2102 UScriptCode runCode;
2103 };
2104
2105 typedef struct RunTestData RunTestData;
2106
2107 static void
CheckScriptRuns(UScriptRun * scriptRun,int32_t * runStarts,const RunTestData * testData,int32_t nRuns,const char * prefix)2108 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2109 const char *prefix)
2110 {
2111 int32_t run, runStart, runLimit;
2112 UScriptCode runCode;
2113
2114 /* iterate over all the runs */
2115 run = 0;
2116 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2117 if (runStart != runStarts[run]) {
2118 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2119 prefix, run, runStarts[run], runStart);
2120 }
2121
2122 if (runLimit != runStarts[run + 1]) {
2123 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2124 prefix, run, runStarts[run + 1], runLimit);
2125 }
2126
2127 if (runCode != testData[run].runCode) {
2128 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2129 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2130 }
2131
2132 run += 1;
2133
2134 /* stop when we've seen all the runs we expect to see */
2135 if (run >= nRuns) {
2136 break;
2137 }
2138 }
2139
2140 /* Complain if we didn't see then number of runs we expected */
2141 if (run != nRuns) {
2142 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2143 }
2144 }
2145
2146 static void
TestUScriptRunAPI()2147 TestUScriptRunAPI()
2148 {
2149 static const RunTestData testData1[] = {
2150 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2151 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2152 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2153 {"English (", USCRIPT_LATIN},
2154 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2155 {") ", USCRIPT_LATIN},
2156 {"\\u6F22\\u5B75", USCRIPT_HAN},
2157 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2158 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2159 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2160 };
2161
2162 static const RunTestData testData2[] = {
2163 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2164 };
2165
2166 static const struct {
2167 const RunTestData *testData;
2168 int32_t nRuns;
2169 } testDataEntries[] = {
2170 {testData1, UPRV_LENGTHOF(testData1)},
2171 {testData2, UPRV_LENGTHOF(testData2)}
2172 };
2173
2174 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2175 int32_t testEntry;
2176
2177 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2178 UChar testString[1024];
2179 int32_t runStarts[256];
2180 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2181 const RunTestData *testData = testDataEntries[testEntry].testData;
2182
2183 int32_t run, stringLimit;
2184 UScriptRun *scriptRun = NULL;
2185 UErrorCode err;
2186
2187 /*
2188 * Fill in the test string and the runStarts array.
2189 */
2190 stringLimit = 0;
2191 for (run = 0; run < nTestRuns; run += 1) {
2192 runStarts[run] = stringLimit;
2193 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2194 /*stringLimit -= 1;*/
2195 }
2196
2197 /* The limit of the last run */
2198 runStarts[nTestRuns] = stringLimit;
2199
2200 /*
2201 * Make sure that calling uscript_OpenRun with a NULL text pointer
2202 * and a non-zero text length returns the correct error.
2203 */
2204 err = U_ZERO_ERROR;
2205 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2206
2207 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2208 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2209 }
2210
2211 if (scriptRun != NULL) {
2212 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2213 uscript_closeRun(scriptRun);
2214 }
2215
2216 /*
2217 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2218 * and a zero text length returns the correct error.
2219 */
2220 err = U_ZERO_ERROR;
2221 scriptRun = uscript_openRun(testString, 0, &err);
2222
2223 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2224 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2225 }
2226
2227 if (scriptRun != NULL) {
2228 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2229 uscript_closeRun(scriptRun);
2230 }
2231
2232 /*
2233 * Make sure that calling uscript_openRun with a NULL text pointer
2234 * and a zero text length doesn't return an error.
2235 */
2236 err = U_ZERO_ERROR;
2237 scriptRun = uscript_openRun(NULL, 0, &err);
2238
2239 if (U_FAILURE(err)) {
2240 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2241 }
2242
2243 /* Make sure that the empty iterator doesn't find any runs */
2244 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2245 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2246 }
2247
2248 /*
2249 * Make sure that calling uscript_setRunText with a NULL text pointer
2250 * and a non-zero text length returns the correct error.
2251 */
2252 err = U_ZERO_ERROR;
2253 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2254
2255 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2256 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2257 }
2258
2259 /*
2260 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2261 * and a zero text length returns the correct error.
2262 */
2263 err = U_ZERO_ERROR;
2264 uscript_setRunText(scriptRun, testString, 0, &err);
2265
2266 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2267 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2268 }
2269
2270 /*
2271 * Now call uscript_setRunText on the empty iterator
2272 * and make sure that it works.
2273 */
2274 err = U_ZERO_ERROR;
2275 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2276
2277 if (U_FAILURE(err)) {
2278 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2279 } else {
2280 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2281 }
2282
2283 uscript_closeRun(scriptRun);
2284
2285 /*
2286 * Now open an interator over the testString
2287 * using uscript_openRun and make sure that it works
2288 */
2289 scriptRun = uscript_openRun(testString, stringLimit, &err);
2290
2291 if (U_FAILURE(err)) {
2292 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2293 } else {
2294 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2295 }
2296
2297 /* Now reset the iterator, and make sure
2298 * that it still works.
2299 */
2300 uscript_resetRun(scriptRun);
2301
2302 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2303
2304 /* Close the iterator */
2305 uscript_closeRun(scriptRun);
2306 }
2307 }
2308
2309 /* test additional, non-core properties */
2310 static void
TestAdditionalProperties()2311 TestAdditionalProperties() {
2312 /* test data for u_charAge() */
2313 static const struct {
2314 UChar32 c;
2315 UVersionInfo version;
2316 } charAges[]={
2317 {0x41, { 1, 1, 0, 0 }},
2318 {0xffff, { 1, 1, 0, 0 }},
2319 {0x20ab, { 2, 0, 0, 0 }},
2320 {0x2fffe, { 2, 0, 0, 0 }},
2321 {0x20ac, { 2, 1, 0, 0 }},
2322 {0xfb1d, { 3, 0, 0, 0 }},
2323 {0x3f4, { 3, 1, 0, 0 }},
2324 {0x10300, { 3, 1, 0, 0 }},
2325 {0x220, { 3, 2, 0, 0 }},
2326 {0xff60, { 3, 2, 0, 0 }}
2327 };
2328
2329 /* test data for u_hasBinaryProperty() */
2330 static const int32_t
2331 props[][3]={ /* code point, property, value */
2332 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2333 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2334 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2335
2336 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2337 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2338
2339 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2340 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2341
2342 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2343 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2344
2345 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2346 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2347 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2348 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2349 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2350
2351 { 0x058a, UCHAR_DASH, TRUE },
2352 { 0x007e, UCHAR_DASH, FALSE },
2353
2354 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2355 { 0x3000, UCHAR_DIACRITIC, FALSE },
2356
2357 { 0x0e46, UCHAR_EXTENDER, TRUE },
2358 { 0x0020, UCHAR_EXTENDER, FALSE },
2359
2360 #if !UCONFIG_NO_NORMALIZATION
2361 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2362 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2363 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2364
2365 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2366 { 0x0308, UCHAR_NFD_INERT, FALSE },
2367
2368 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2369 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2370
2371 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2372 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2373 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2374 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2375 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2376 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2377
2378 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2379 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2380
2381 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2382 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2383 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2384 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2385 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2386 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2387 #endif
2388
2389 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2390 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2391 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2392
2393 { 0x30fb, UCHAR_HYPHEN, TRUE },
2394 { 0xfe58, UCHAR_HYPHEN, FALSE },
2395
2396 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2397 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2398 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2399
2400 { 0x2172, UCHAR_ID_START, TRUE },
2401 { 0x007a, UCHAR_ID_START, TRUE },
2402 { 0x0039, UCHAR_ID_START, FALSE },
2403
2404 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2405 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2406 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2407
2408 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2409 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2410
2411 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2412 { 0x0345, UCHAR_LOWERCASE, TRUE },
2413 { 0x0030, UCHAR_LOWERCASE, FALSE },
2414
2415 { 0x1d7a9, UCHAR_MATH, TRUE },
2416 { 0x2135, UCHAR_MATH, TRUE },
2417 { 0x0062, UCHAR_MATH, FALSE },
2418
2419 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2420 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2421 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2422
2423 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2424 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2425 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2426
2427 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2428 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2429
2430 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2431 { 0x2162, UCHAR_UPPERCASE, TRUE },
2432 { 0x0345, UCHAR_UPPERCASE, FALSE },
2433
2434 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2435 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2436 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2437
2438 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2439 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2440 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2441
2442 { 0x16ee, UCHAR_XID_START, TRUE },
2443 { 0x23456, UCHAR_XID_START, TRUE },
2444 { 0x1d1aa, UCHAR_XID_START, FALSE },
2445
2446 /*
2447 * Version break:
2448 * The following properties are only supported starting with the
2449 * Unicode version indicated in the second field.
2450 */
2451 { -1, 0x320, 0 },
2452
2453 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2454 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2455 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2456
2457 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2458 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2459 { 0xe0001, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2460 { 0xe0100, UCHAR_DEPRECATED, FALSE },
2461
2462 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2463 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2464 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2465 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2466
2467 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2468 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2469 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2470 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2471
2472 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2473 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2474
2475 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2476 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2477
2478 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2479 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2480
2481 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2482 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2483
2484 { 0x2e9b, UCHAR_RADICAL, TRUE },
2485 { 0x4e00, UCHAR_RADICAL, FALSE },
2486
2487 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2488 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2489
2490 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2491 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2492
2493 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2494
2495 { 0x002e, UCHAR_S_TERM, TRUE },
2496 { 0x0061, UCHAR_S_TERM, FALSE },
2497
2498 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2499 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2500 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2501 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2502
2503 /* enum/integer type properties */
2504
2505 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2506 /* test default Bidi classes for unassigned code points */
2507 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2508 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2509 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2510 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2511 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2512 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2513 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2514 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2515 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2516 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2517 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2518
2519 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2520 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2521 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2522 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2523 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2524 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2525 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2526
2527 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2528 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2529 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2530 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2531 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2532 { 0x0870, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2533 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2534 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2535 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2536 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2537 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2538
2539 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2540 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2541
2542 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2543 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2544 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2545 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2546 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2547 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2548 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2549 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2550 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2551
2552 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2553 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2554 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2555 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2556 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2557 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2558 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2559 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2560 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2561 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2562 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2563 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2564 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2565 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2566 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2567 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2568 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2569
2570 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2571 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2572 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2573
2574 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2575 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2576 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2577 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2578 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2579
2580 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2581 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2582 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2583 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2584 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2585 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2586 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2587 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2588
2589 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2590 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2591 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2592 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2593 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2594 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2595 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2596 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2597 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2598 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2599 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2600 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2601 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2602 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2603 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2604 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2605
2606 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2607
2608 /* UCHAR_SCRIPT tested in cucdapi.c TestUScriptCodeAPI() */
2609
2610 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2611 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2612 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2613 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2614 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2615 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2616 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2617
2618 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2619 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2620 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2621 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2622
2623 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2624 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2625 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2626 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2627 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2628 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2629
2630 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2631 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2632 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2633 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2634
2635 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2636 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2637 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2638 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2639 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2640 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2641 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2642
2643 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2644 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2645 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2646 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2647
2648 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2649 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2650 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2651 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2652
2653 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2654 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2655 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2656 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2657 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2658
2659 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2660
2661 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2662
2663 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2664 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2665 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2666
2667 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2668 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2669 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2670 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2671 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2672
2673 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2674 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2675 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2676
2677 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2678 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2679 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2680 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2681
2682 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2683 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2684 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2685 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2686 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2687 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2688
2689 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2690 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2691 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2692 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2693
2694 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2695 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2696 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2697 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2698
2699 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2700 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2701 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2702 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2703
2704 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2705
2706 /* unassigned code points in new default Bidi R blocks */
2707 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2708 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2709
2710 /* test some script codes >127 */
2711 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2712 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2713 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2714
2715 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2716
2717 /* value changed in Unicode 6.0 */
2718 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2719
2720 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2721
2722 /* unassigned code points in new/changed default Bidi AL blocks */
2723 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2724 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2725
2726 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2727
2728 /* unassigned code points in the currency symbols block now default to ET */
2729 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2730 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2731
2732 /* new property in Unicode 6.3 */
2733 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2734 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2735 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2736 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2737 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2738 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2739
2740 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2741
2742 /* new character range with Joining_Group values */
2743 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2744 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2745 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2746 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2747 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2748
2749 { -1, 0xa00, 0 }, // version break for Unicode 10
2750
2751 { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2752 { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2753 { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2754 { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2755
2756 { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2757 { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2758 { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2759
2760 /* undefined UProperty values */
2761 { 0x61, 0x4a7, 0 },
2762 { 0x234bc, 0x15ed, 0 }
2763 };
2764
2765 UVersionInfo version;
2766 UChar32 c;
2767 int32_t i, result, uVersion;
2768 UProperty which;
2769
2770 /* what is our Unicode version? */
2771 u_getUnicodeVersion(version);
2772 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2773
2774 u_charAge(0x20, version);
2775 if(version[0]==0) {
2776 /* no additional properties available */
2777 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2778 return;
2779 }
2780
2781 /* test u_charAge() */
2782 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2783 u_charAge(charAges[i].c, version);
2784 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2785 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2786 charAges[i].c,
2787 version[0], version[1], version[2], version[3],
2788 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2789 }
2790 }
2791
2792 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2793 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2794 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2795 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2796 u_getIntPropertyMinValue(0x2345)!=0
2797 ) {
2798 log_err("error: u_getIntPropertyMinValue() wrong\n");
2799 }
2800 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2801 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2802 }
2803 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2804 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2805 }
2806 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2807 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2808 }
2809 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2810 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2811 }
2812 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2813 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2814 }
2815 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2816 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2817 }
2818 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2819 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2820 }
2821 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2822 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2823 }
2824 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2825 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2826 }
2827 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2828 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2829 }
2830 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2831 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2832 }
2833 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2834 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2835 }
2836 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2837 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2838 }
2839 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2840 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2841 }
2842 /*JB#2410*/
2843 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2844 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2845 }
2846 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2847 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2848 }
2849 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2850 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2851 }
2852 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2853 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2854 }
2855 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2856 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2857 }
2858
2859 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2860 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2861 const char *whichName;
2862
2863 if(props[i][0]<0) {
2864 /* Unicode version break */
2865 if(uVersion<props[i][1]) {
2866 break; /* do not test properties that are not yet supported */
2867 } else {
2868 continue; /* skip this row */
2869 }
2870 }
2871
2872 c=(UChar32)props[i][0];
2873 which=(UProperty)props[i][1];
2874 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2875
2876 if(which<UCHAR_INT_START) {
2877 result=u_hasBinaryProperty(c, which);
2878 if(result!=props[i][2]) {
2879 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2880 c, whichName, result, i);
2881 }
2882 }
2883
2884 result=u_getIntPropertyValue(c, which);
2885 if(result!=props[i][2]) {
2886 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2887 c, whichName, result, props[i][2], i);
2888 }
2889
2890 /* test separate functions, too */
2891 switch((UProperty)props[i][1]) {
2892 case UCHAR_ALPHABETIC:
2893 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2894 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2895 props[i][0], result, i);
2896 }
2897 break;
2898 case UCHAR_LOWERCASE:
2899 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2900 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2901 props[i][0], result, i);
2902 }
2903 break;
2904 case UCHAR_UPPERCASE:
2905 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2906 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2907 props[i][0], result, i);
2908 }
2909 break;
2910 case UCHAR_WHITE_SPACE:
2911 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2912 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2913 props[i][0], result, i);
2914 }
2915 break;
2916 default:
2917 break;
2918 }
2919 }
2920 }
2921
2922 static void
TestNumericProperties(void)2923 TestNumericProperties(void) {
2924 /* see UnicodeData.txt, DerivedNumericValues.txt */
2925 static const struct {
2926 UChar32 c;
2927 int32_t type;
2928 double numValue;
2929 } values[]={
2930 { 0x0F33, U_NT_NUMERIC, -1./2. },
2931 { 0x0C66, U_NT_DECIMAL, 0 },
2932 { 0x96f6, U_NT_NUMERIC, 0 },
2933 { 0xa833, U_NT_NUMERIC, 1./16. },
2934 { 0x2152, U_NT_NUMERIC, 1./10. },
2935 { 0x2151, U_NT_NUMERIC, 1./9. },
2936 { 0x1245f, U_NT_NUMERIC, 1./8. },
2937 { 0x2150, U_NT_NUMERIC, 1./7. },
2938 { 0x2159, U_NT_NUMERIC, 1./6. },
2939 { 0x09f6, U_NT_NUMERIC, 3./16. },
2940 { 0x2155, U_NT_NUMERIC, 1./5. },
2941 { 0x00BD, U_NT_NUMERIC, 1./2. },
2942 { 0x0031, U_NT_DECIMAL, 1. },
2943 { 0x4e00, U_NT_NUMERIC, 1. },
2944 { 0x58f1, U_NT_NUMERIC, 1. },
2945 { 0x10320, U_NT_NUMERIC, 1. },
2946 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2947 { 0x00B2, U_NT_DIGIT, 2. },
2948 { 0x5f10, U_NT_NUMERIC, 2. },
2949 { 0x1813, U_NT_DECIMAL, 3. },
2950 { 0x5f0e, U_NT_NUMERIC, 3. },
2951 { 0x2173, U_NT_NUMERIC, 4. },
2952 { 0x8086, U_NT_NUMERIC, 4. },
2953 { 0x278E, U_NT_DIGIT, 5. },
2954 { 0x1D7F2, U_NT_DECIMAL, 6. },
2955 { 0x247A, U_NT_DIGIT, 7. },
2956 { 0x7396, U_NT_NUMERIC, 9. },
2957 { 0x1372, U_NT_NUMERIC, 10. },
2958 { 0x216B, U_NT_NUMERIC, 12. },
2959 { 0x16EE, U_NT_NUMERIC, 17. },
2960 { 0x249A, U_NT_NUMERIC, 19. },
2961 { 0x303A, U_NT_NUMERIC, 30. },
2962 { 0x5345, U_NT_NUMERIC, 30. },
2963 { 0x32B2, U_NT_NUMERIC, 37. },
2964 { 0x1375, U_NT_NUMERIC, 40. },
2965 { 0x10323, U_NT_NUMERIC, 50. },
2966 { 0x0BF1, U_NT_NUMERIC, 100. },
2967 { 0x964c, U_NT_NUMERIC, 100. },
2968 { 0x217E, U_NT_NUMERIC, 500. },
2969 { 0x2180, U_NT_NUMERIC, 1000. },
2970 { 0x4edf, U_NT_NUMERIC, 1000. },
2971 { 0x2181, U_NT_NUMERIC, 5000. },
2972 { 0x137C, U_NT_NUMERIC, 10000. },
2973 { 0x4e07, U_NT_NUMERIC, 10000. },
2974 { 0x12432, U_NT_NUMERIC, 216000. },
2975 { 0x12433, U_NT_NUMERIC, 432000. },
2976 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2977 { 0x5146, U_NT_NUMERIC, 1000000000000. },
2978 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
2979 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2980 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2981 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2982 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2983 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2984 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2985 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
2986 };
2987
2988 double nv;
2989 UChar32 c;
2990 int32_t i, type;
2991
2992 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
2993 c=values[i].c;
2994 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2995 nv=u_getNumericValue(c);
2996
2997 if(type!=values[i].type) {
2998 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2999 }
3000 if(0.000001 <= fabs(nv - values[i].numValue)) {
3001 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
3002 }
3003 }
3004 }
3005
3006 /**
3007 * Test the property names and property value names API.
3008 */
3009 static void
TestPropertyNames(void)3010 TestPropertyNames(void) {
3011 int32_t p, v, choice=0, rev;
3012 UBool atLeastSomething = FALSE;
3013
3014 for (p=0; ; ++p) {
3015 UProperty propEnum = (UProperty)p;
3016 UBool sawProp = FALSE;
3017 if(p > 10 && !atLeastSomething) {
3018 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3019 return;
3020 }
3021
3022 for (choice=0; ; ++choice) {
3023 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
3024 if (name) {
3025 if (!sawProp)
3026 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
3027 log_verbose("%d=\"%s\"", choice, name);
3028 sawProp = TRUE;
3029 atLeastSomething = TRUE;
3030
3031 /* test reverse mapping */
3032 rev = u_getPropertyEnum(name);
3033 if (rev != p) {
3034 log_err("Property round-trip failure: %d -> %s -> %d\n",
3035 p, name, rev);
3036 }
3037 }
3038 if (!name && choice>0) break;
3039 }
3040 if (sawProp) {
3041 /* looks like a valid property; check the values */
3042 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3043 int32_t max = 0;
3044 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3045 max = 255;
3046 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3047 /* it's far too slow to iterate all the way up to
3048 the real max, U_GC_P_MASK */
3049 max = U_GC_NL_MASK;
3050 } else if (p == UCHAR_BLOCK) {
3051 /* UBlockCodes, unlike other values, start at 1 */
3052 max = 1;
3053 }
3054 log_verbose("\n");
3055 for (v=-1; ; ++v) {
3056 UBool sawValue = FALSE;
3057 for (choice=0; ; ++choice) {
3058 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
3059 if (vname) {
3060 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3061 log_verbose("%d=\"%s\"", choice, vname);
3062 sawValue = TRUE;
3063
3064 /* test reverse mapping */
3065 rev = u_getPropertyValueEnum(propEnum, vname);
3066 if (rev != v) {
3067 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3068 pname, v, vname, rev);
3069 }
3070 }
3071 if (!vname && choice>0) break;
3072 }
3073 if (sawValue) {
3074 log_verbose("\n");
3075 }
3076 if (!sawValue && v>=max) break;
3077 }
3078 }
3079 if (!sawProp) {
3080 if (p>=UCHAR_STRING_LIMIT) {
3081 break;
3082 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3083 p = UCHAR_STRING_START - 1;
3084 } else if (p>=UCHAR_MASK_LIMIT) {
3085 p = UCHAR_DOUBLE_START - 1;
3086 } else if (p>=UCHAR_INT_LIMIT) {
3087 p = UCHAR_MASK_START - 1;
3088 } else if (p>=UCHAR_BINARY_LIMIT) {
3089 p = UCHAR_INT_START - 1;
3090 }
3091 }
3092 }
3093 }
3094
3095 /**
3096 * Test the property values API. See JB#2410.
3097 */
3098 static void
TestPropertyValues(void)3099 TestPropertyValues(void) {
3100 int32_t i, p, min, max;
3101 UErrorCode ec;
3102
3103 /* Min should be 0 for everything. */
3104 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3105 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3106 UProperty propEnum = (UProperty)p;
3107 min = u_getIntPropertyMinValue(propEnum);
3108 if (min != 0) {
3109 if (p == UCHAR_BLOCK) {
3110 /* This is okay...for now. See JB#2487.
3111 TODO Update this for JB#2487. */
3112 } else {
3113 const char* name;
3114 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3115 if (name == NULL)
3116 name = "<ERROR>";
3117 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3118 name, min);
3119 }
3120 }
3121 }
3122
3123 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3124 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3125 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3126 }
3127
3128 /* Max should be -1 for invalid properties. */
3129 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3130 if (max != -1) {
3131 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3132 max);
3133 }
3134
3135 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3136 for (i=0; i<2; ++i) {
3137 int32_t script;
3138 const char* desc;
3139 ec = U_ZERO_ERROR;
3140 switch (i) {
3141 case 0:
3142 script = uscript_getScript(-1, &ec);
3143 desc = "uscript_getScript(-1)";
3144 break;
3145 case 1:
3146 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3147 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3148 break;
3149 default:
3150 log_err("Internal test error. Too many scripts\n");
3151 return;
3152 }
3153 /* We don't explicitly test ec. It should be U_FAILURE but it
3154 isn't documented as such. */
3155 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3156 log_err("FAIL: %s = %d, exp. 0\n",
3157 desc, script);
3158 }
3159 }
3160 }
3161
3162 /* various tests for consistency of UCD data and API behavior */
3163 static void
TestConsistency()3164 TestConsistency() {
3165 char buffer[300];
3166 USet *set1, *set2, *set3, *set4;
3167 UErrorCode errorCode;
3168
3169 UChar32 start, end;
3170 int32_t i, length;
3171
3172 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3173 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3174 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3175 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3176 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3177
3178 U_STRING_DECL(mathBlocksPattern,
3179 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3180 214);
3181 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3182 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3183 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3184 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3185
3186 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3187 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3188 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3189 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3190 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3191
3192 U_STRING_INIT(mathBlocksPattern,
3193 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3194 214);
3195 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3196 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3197 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3198 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3199
3200 /*
3201 * It used to be that UCD.html and its precursors said
3202 * "Those dashes used to mark connections between pieces of words,
3203 * plus the Katakana middle dot."
3204 *
3205 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3206 * but not from Hyphen.
3207 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3208 * Therefore, do not show errors when testing the Hyphen property.
3209 */
3210 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3211 "known to the UTC and not considered errors.\n");
3212
3213 errorCode=U_ZERO_ERROR;
3214 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3215 set2=uset_openPattern(dashPattern, 8, &errorCode);
3216 if(U_SUCCESS(errorCode)) {
3217 /* remove the Katakana middle dot(s) from set1 */
3218 uset_remove(set1, 0x30fb);
3219 uset_remove(set1, 0xff65); /* halfwidth variant */
3220 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3221 } else {
3222 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3223 }
3224
3225 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3226 set3=uset_openPattern(formatPattern, 6, &errorCode);
3227 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3228 if(U_SUCCESS(errorCode)) {
3229 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3230 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3231 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3232 } else {
3233 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3234 }
3235
3236 uset_close(set1);
3237 uset_close(set2);
3238 uset_close(set3);
3239 uset_close(set4);
3240
3241 /*
3242 * Check that each lowercase character has "small" in its name
3243 * and not "capital".
3244 * There are some such characters, some of which seem odd.
3245 * Use the verbose flag to see these notices.
3246 */
3247 errorCode=U_ZERO_ERROR;
3248 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3249 if(U_SUCCESS(errorCode)) {
3250 for(i=0;; ++i) {
3251 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3252 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3253 break; /* done */
3254 }
3255 if(U_FAILURE(errorCode)) {
3256 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3257 i, u_errorName(errorCode));
3258 break;
3259 }
3260 if(length!=0) {
3261 break; /* done with code points, got a string or -1 */
3262 }
3263
3264 while(start<=end) {
3265 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3266 if(U_FAILURE(errorCode)) {
3267 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3268 errorCode=U_ZERO_ERROR;
3269 }
3270 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3271 strstr(buffer, "SMALL CAPITAL")==NULL
3272 ) {
3273 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3274 }
3275 ++start;
3276 }
3277 }
3278 } else {
3279 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3280 }
3281 uset_close(set1);
3282
3283 /* verify that all assigned characters in Math blocks are exactly Math characters */
3284 errorCode=U_ZERO_ERROR;
3285 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3286 set2=uset_openPattern(mathPattern, 8, &errorCode);
3287 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3288 if(U_SUCCESS(errorCode)) {
3289 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3290 uset_complement(set3); /* assigned characters */
3291 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3292 compareUSets(set1, set2,
3293 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3294 TRUE);
3295 } else {
3296 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3297 }
3298 uset_close(set1);
3299 uset_close(set2);
3300 uset_close(set3);
3301
3302 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3303 errorCode=U_ZERO_ERROR;
3304 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3305 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3306 if(U_SUCCESS(errorCode)) {
3307 compareUSets(set1, set2,
3308 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3309 TRUE);
3310 } else {
3311 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3312 }
3313 uset_close(set1);
3314 uset_close(set2);
3315 }
3316
3317 /* test case folding, compare return values with CaseFolding.txt ------------ */
3318
/*
 * Bit flags naming the kinds of case folding; a combination of them records
 * which case foldings for a character have been tested already.
 */
enum {
    CF_SIMPLE=1<<0,                         /* simple (single code point) folding */
    CF_FULL=1<<1,                           /* full (string) folding */
    CF_TURKIC=1<<2,                         /* folding with the Turkic option */
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC      /* all of the above */
};
3326
3327 static void
testFold(UChar32 c,int which,UChar32 simple,UChar32 turkic,const UChar * full,int32_t fullLength,const UChar * turkicFull,int32_t turkicFullLength)3328 testFold(UChar32 c, int which,
3329 UChar32 simple, UChar32 turkic,
3330 const UChar *full, int32_t fullLength,
3331 const UChar *turkicFull, int32_t turkicFullLength) {
3332 UChar s[2], t[32];
3333 UChar32 c2;
3334 int32_t length, length2;
3335
3336 UErrorCode errorCode=U_ZERO_ERROR;
3337
3338 length=0;
3339 U16_APPEND_UNSAFE(s, length, c);
3340
3341 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3342 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3343 }
3344 if((which&CF_FULL)!=0) {
3345 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3346 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3347 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3348 }
3349 }
3350 if((which&CF_TURKIC)!=0) {
3351 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3352 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3353 }
3354
3355 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3356 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3357 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3358 }
3359 }
3360 }
3361
3362 /* test that c case-folds to itself */
3363 static void
testFoldToSelf(UChar32 c,int which)3364 testFoldToSelf(UChar32 c, int which) {
3365 UChar s[2];
3366 int32_t length;
3367
3368 length=0;
3369 U16_APPEND_UNSAFE(s, length, c);
3370 testFold(c, which, c, c, s, length, s, length);
3371 }
3372
3373 struct CaseFoldingData {
3374 USet *notSeen;
3375 UChar32 prev, prevSimple;
3376 UChar prevFull[32];
3377 int32_t prevFullLength;
3378 int which;
3379 };
3380 typedef struct CaseFoldingData CaseFoldingData;
3381
3382 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)3383 caseFoldingLineFn(void *context,
3384 char *fields[][2], int32_t fieldCount,
3385 UErrorCode *pErrorCode) {
3386 CaseFoldingData *pData=(CaseFoldingData *)context;
3387 char *end;
3388 UChar full[32];
3389 UChar32 c, prev, simple;
3390 int32_t count;
3391 int which;
3392 char status;
3393
3394 /* get code point */
3395 const char *s=u_skipWhitespace(fields[0][0]);
3396 if(0==strncmp(s, "0000..10FFFF", 12)) {
3397 /*
3398 * Ignore the line
3399 * # @missing: 0000..10FFFF; C; <code point>
3400 * because maps-to-self is already our default, and this line breaks this parser.
3401 */
3402 return;
3403 }
3404 c=(UChar32)strtoul(s, &end, 16);
3405 end=(char *)u_skipWhitespace(end);
3406 if(end<=fields[0][0] || end!=fields[0][1]) {
3407 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3408 *pErrorCode=U_PARSE_ERROR;
3409 return;
3410 }
3411
3412 /* get the status of this mapping */
3413 status=*u_skipWhitespace(fields[1][0]);
3414 if(status!='C' && status!='S' && status!='F' && status!='T') {
3415 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3416 *pErrorCode=U_PARSE_ERROR;
3417 return;
3418 }
3419
3420 /* get the mapping */
3421 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3422 if(U_FAILURE(*pErrorCode)) {
3423 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3424 return;
3425 }
3426
3427 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3428 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3429 simple=c;
3430 }
3431
3432 if(c!=(prev=pData->prev)) {
3433 /*
3434 * Test remaining mappings for the previous code point.
3435 * If a turkic folding was not mentioned, then it should fold the same
3436 * as the regular simple case folding.
3437 */
3438 UChar prevString[2];
3439 int32_t length;
3440
3441 length=0;
3442 U16_APPEND_UNSAFE(prevString, length, prev);
3443 testFold(prev, (~pData->which)&CF_ALL,
3444 prev, pData->prevSimple,
3445 prevString, length,
3446 pData->prevFull, pData->prevFullLength);
3447 pData->prev=pData->prevSimple=c;
3448 length=0;
3449 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3450 pData->prevFullLength=length;
3451 pData->which=0;
3452 }
3453
3454 /*
3455 * Turn the status into a bit set of case foldings to test.
3456 * Remember non-Turkic case foldings as defaults for Turkic mode.
3457 */
3458 switch(status) {
3459 case 'C':
3460 which=CF_SIMPLE|CF_FULL;
3461 pData->prevSimple=simple;
3462 u_memcpy(pData->prevFull, full, count);
3463 pData->prevFullLength=count;
3464 break;
3465 case 'S':
3466 which=CF_SIMPLE;
3467 pData->prevSimple=simple;
3468 break;
3469 case 'F':
3470 which=CF_FULL;
3471 u_memcpy(pData->prevFull, full, count);
3472 pData->prevFullLength=count;
3473 break;
3474 case 'T':
3475 which=CF_TURKIC;
3476 break;
3477 default:
3478 which=0;
3479 break; /* won't happen because of test above */
3480 }
3481
3482 testFold(c, which, simple, simple, full, count, full, count);
3483
3484 /* remember which case foldings of c have been tested */
3485 pData->which|=which;
3486
3487 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3488 uset_remove(pData->notSeen, c);
3489 }
3490
3491 static void
TestCaseFolding()3492 TestCaseFolding() {
3493 CaseFoldingData data={ NULL };
3494 char *fields[3][2];
3495 UErrorCode errorCode;
3496
3497 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3498
3499 errorCode=U_ZERO_ERROR;
3500 /* test BMP & plane 1 - nothing interesting above */
3501 data.notSeen=uset_open(0, 0x1ffff);
3502 data.prevFullLength=1; /* length of full case folding of U+0000 */
3503
3504 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3505 if(U_SUCCESS(errorCode)) {
3506 int32_t i, start, end;
3507
3508 /* add a pseudo-last line to finish testing of the actual last one */
3509 fields[0][0]=lastLine;
3510 fields[0][1]=lastLine+6;
3511 fields[1][0]=lastLine+7;
3512 fields[1][1]=lastLine+9;
3513 fields[2][0]=lastLine+10;
3514 fields[2][1]=lastLine+17;
3515 caseFoldingLineFn(&data, fields, 3, &errorCode);
3516
3517 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3518 for(i=0;
3519 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3520 U_SUCCESS(errorCode);
3521 ++i
3522 ) {
3523 do {
3524 testFoldToSelf(start, CF_ALL);
3525 } while(++start<=end);
3526 }
3527 }
3528
3529 uset_close(data.notSeen);
3530 }
3531
TestBinaryCharacterPropertiesAPI()3532 static void TestBinaryCharacterPropertiesAPI() {
3533 // API test only. See intltest/ucdtest.cpp for functional test.
3534 UErrorCode errorCode = U_ZERO_ERROR;
3535 const USet *set = u_getBinaryPropertySet(-1, &errorCode);
3536 if (U_SUCCESS(errorCode)) {
3537 log_err("u_getBinaryPropertySet(-1) did not fail\n");
3538 }
3539 errorCode = U_ZERO_ERROR;
3540 set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);
3541 if (U_SUCCESS(errorCode)) {
3542 log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
3543 }
3544 errorCode = U_ZERO_ERROR;
3545 set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);
3546 if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
3547 log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n");
3548 }
3549 }
3550
TestIntCharacterPropertiesAPI()3551 static void TestIntCharacterPropertiesAPI() {
3552 // API test only. See intltest/ucdtest.cpp for functional test.
3553 UErrorCode errorCode = U_ZERO_ERROR;
3554 const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);
3555 if (U_SUCCESS(errorCode)) {
3556 log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
3557 }
3558 errorCode = U_ZERO_ERROR;
3559 map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);
3560 if (U_SUCCESS(errorCode)) {
3561 log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
3562 }
3563 errorCode = U_ZERO_ERROR;
3564 map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);
3565 if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
3566 log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n");
3567 }
3568 }
3569