1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 * Name Description
14 * Madhu Katragadda Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28 #include "unicode/utf16.h"
29 #include "unicode/utf_old.h"
30 #include "cintltst.h"
31 #include "putilimp.h"
32 #include "uparse.h"
33 #include "ucase.h"
34 #include "ubidi_props.h"
35 #include "uprops.h"
36 #include "uset_imp.h"
37 #include "usc_impl.h"
38 #include "udatamem.h"
39 #include "cucdapi.h"
40 #include "cmemory.h"
41
42 /* prototypes --------------------------------------------------------------- */
43
44 static void TestUpperLower(void);
45 static void TestLetterNumber(void);
46 static void TestMisc(void);
47 static void TestPOSIX(void);
48 static void TestControlPrint(void);
49 static void TestIdentifier(void);
50 static void TestUnicodeData(void);
51 static void TestCodeUnit(void);
52 static void TestCodePoint(void);
53 static void TestCharLength(void);
54 static void TestCharNames(void);
55 static void TestUCharFromNameUnderflow(void);
56 static void TestMirroring(void);
57 static void TestUScriptRunAPI(void);
58 static void TestAdditionalProperties(void);
59 static void TestNumericProperties(void);
60 static void TestPropertyNames(void);
61 static void TestPropertyValues(void);
62 static void TestConsistency(void);
63 static void TestCaseFolding(void);
64 static void TestBinaryCharacterPropertiesAPI(void);
65 static void TestIntCharacterPropertiesAPI(void);
66
67 /* internal methods used */
68 static int32_t MakeProp(char* str);
69 static int32_t MakeDir(char* str);
70
71 /* helpers ------------------------------------------------------------------ */
72
73 static void
parseUCDFile(const char * filename,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)74 parseUCDFile(const char *filename,
75 char *fields[][2], int32_t fieldCount,
76 UParseLineFn *lineFn, void *context,
77 UErrorCode *pErrorCode) {
78 char path[256];
79 char backupPath[256];
80
81 if(U_FAILURE(*pErrorCode)) {
82 return;
83 }
84
85 /* Look inside ICU_DATA first */
86 strcpy(path, u_getDataDirectory());
87 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
88 strcat(path, filename);
89
90 /* As a fallback, try to guess where the source data was located
91 * at the time ICU was built, and look there.
92 */
93 strcpy(backupPath, ctest_dataSrcDir());
94 strcat(backupPath, U_FILE_SEP_STRING);
95 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
96 strcat(backupPath, filename);
97
98 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
99 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
100 *pErrorCode=U_ZERO_ERROR;
101 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
102 }
103 if(U_FAILURE(*pErrorCode)) {
104 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
105 }
106 }
107
108 /* test data ---------------------------------------------------------------- */
109
/*
 * Two-letter general category abbreviations, concatenated without separators.
 * The abbreviation at character offset 2*k corresponds to tagValues[k].
 */
static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
/*
 * General category constants, listed in the same order as the abbreviations
 * in tagStrings. unicodeDataLineFn() indexes this table with the result of
 * MakeProp() -- presumably the position of the abbreviation in tagStrings;
 * MakeProp() is defined elsewhere in this file, confirm there.
 */
static const int32_t tagValues[] =
{
    /* Mn */ U_NON_SPACING_MARK,
    /* Mc */ U_COMBINING_SPACING_MARK,
    /* Me */ U_ENCLOSING_MARK,
    /* Nd */ U_DECIMAL_DIGIT_NUMBER,
    /* Nl */ U_LETTER_NUMBER,
    /* No */ U_OTHER_NUMBER,
    /* Zs */ U_SPACE_SEPARATOR,
    /* Zl */ U_LINE_SEPARATOR,
    /* Zp */ U_PARAGRAPH_SEPARATOR,
    /* Cc */ U_CONTROL_CHAR,
    /* Cf */ U_FORMAT_CHAR,
    /* Cs */ U_SURROGATE,
    /* Co */ U_PRIVATE_USE_CHAR,
    /* Cn */ U_UNASSIGNED,
    /* Lu */ U_UPPERCASE_LETTER,
    /* Ll */ U_LOWERCASE_LETTER,
    /* Lt */ U_TITLECASE_LETTER,
    /* Lm */ U_MODIFIER_LETTER,
    /* Lo */ U_OTHER_LETTER,
    /* Pc */ U_CONNECTOR_PUNCTUATION,
    /* Pd */ U_DASH_PUNCTUATION,
    /* Ps */ U_START_PUNCTUATION,
    /* Pe */ U_END_PUNCTUATION,
    /* Po */ U_OTHER_PUNCTUATION,
    /* Sm */ U_MATH_SYMBOL,
    /* Sc */ U_CURRENCY_SYMBOL,
    /* Sk */ U_MODIFIER_SYMBOL,
    /* So */ U_OTHER_SYMBOL,
    /* Pi */ U_INITIAL_PUNCTUATION,
    /* Pf */ U_FINAL_PUNCTUATION
};
144
/*
 * Short names of bidirectional classes, one NUL-terminated string of at most
 * four characters per row.
 * NOTE(review): the row index presumably equals the matching UCharDirection
 * value and is looked up by MakeDir() -- confirm against MakeDir(), which is
 * defined elsewhere in this file.
 */
static const char dirStrings[][5] = {
    "L",
    "R",
    "EN",
    "ES",
    "ET",
    "AN",
    "CS",
    "B",
    "S",
    "WS",
    "ON",
    "LRE",
    "LRO",
    "AL",
    "RLE",
    "RLO",
    "PDF",
    "NSM",
    "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI",
    "LRI",
    "RLI",
    "PDI"
};
171
172 void addUnicodeTest(TestNode** root);
173
addUnicodeTest(TestNode ** root)174 void addUnicodeTest(TestNode** root)
175 {
176 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
177 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
178 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
179 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
180 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
181 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
182 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
183 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
184 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
185 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
186 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
187 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
188 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
189 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
190 addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
191 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
192 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
193 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
194 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
195 addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
196 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
197 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
198 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
199 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
200 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
201 addTest(root, &TestBinaryCharacterPropertiesAPI,
202 "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI");
203 addTest(root, &TestIntCharacterPropertiesAPI,
204 "tsutil/cucdtst/TestIntCharacterPropertiesAPI");
205 }
206
207 /*==================================================== */
208 /* test u_toupper() and u_tolower() */
209 /*==================================================== */
TestUpperLower()210 static void TestUpperLower()
211 {
212 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
213 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
214 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
215 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
216 int32_t i;
217
218 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
219 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
220
221 /*
222 Checks LetterLike Symbols which were previously a source of confusion
223 [Bertrand A. D. 02/04/98]
224 */
225 for (i=0x2100;i<0x2138;i++)
226 {
227 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
228 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
229 {
230 if (i != (int)u_tolower(i)) /* itself */
231 log_err("Failed case conversion with itself: U+%04x\n", i);
232 if (i != (int)u_toupper(i))
233 log_err("Failed case conversion with itself: U+%04x\n", i);
234 }
235 }
236
237 for(i=0; i < u_strlen(upper); i++){
238 if(u_tolower(upper[i]) != lower[i]){
239 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
240 }
241 }
242
243 log_verbose("testing upper lower\n");
244 for (i = 0; i < 21; i++) {
245
246 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
247 {
248 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
249 }
250 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
251 {
252 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
253 }
254 else if (upperTest[i] != u_tolower(lowerTest[i]))
255 {
256 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
257 }
258 else if (lowerTest[i] != u_toupper(upperTest[i]))
259 {
260 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
261 }
262 else if (upperTest[i] != u_tolower(upperTest[i]))
263 {
264 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
265 }
266 else if (lowerTest[i] != u_toupper(lowerTest[i]))
267 {
268 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
269 }
270 }
271 log_verbose("done testing upper lower\n");
272
273 log_verbose("testing u_istitle\n");
274 {
275 static const UChar expected[] = {
276 0x1F88,
277 0x1F89,
278 0x1F8A,
279 0x1F8B,
280 0x1F8C,
281 0x1F8D,
282 0x1F8E,
283 0x1F8F,
284 0x1F88,
285 0x1F89,
286 0x1F8A,
287 0x1F8B,
288 0x1F8C,
289 0x1F8D,
290 0x1F8E,
291 0x1F8F,
292 0x1F98,
293 0x1F99,
294 0x1F9A,
295 0x1F9B,
296 0x1F9C,
297 0x1F9D,
298 0x1F9E,
299 0x1F9F,
300 0x1F98,
301 0x1F99,
302 0x1F9A,
303 0x1F9B,
304 0x1F9C,
305 0x1F9D,
306 0x1F9E,
307 0x1F9F,
308 0x1FA8,
309 0x1FA9,
310 0x1FAA,
311 0x1FAB,
312 0x1FAC,
313 0x1FAD,
314 0x1FAE,
315 0x1FAF,
316 0x1FA8,
317 0x1FA9,
318 0x1FAA,
319 0x1FAB,
320 0x1FAC,
321 0x1FAD,
322 0x1FAE,
323 0x1FAF,
324 0x1FBC,
325 0x1FBC,
326 0x1FCC,
327 0x1FCC,
328 0x1FFC,
329 0x1FFC,
330 };
331 int32_t num = UPRV_LENGTHOF(expected);
332 for(i=0; i<num; i++){
333 if(!u_istitle(expected[i])){
334 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
335 }
336 }
337
338 }
339 }
340
341 /* compare two sets and verify that their difference or intersection is empty */
342 static UBool
showADiffB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool expect,UBool diffIsError)343 showADiffB(const USet *a, const USet *b,
344 const char *a_name, const char *b_name,
345 UBool expect, UBool diffIsError) {
346 USet *aa;
347 int32_t i, start, end, length;
348 UErrorCode errorCode;
349
350 /*
351 * expect:
352 * TRUE -> a-b should be empty, that is, b should contain all of a
353 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
354 */
355 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
356 return TRUE;
357 }
358
359 /* clone a to aa because a is const */
360 aa=uset_open(1, 0);
361 if(aa==NULL) {
362 /* unusual problem - out of memory? */
363 return FALSE;
364 }
365 uset_addAll(aa, a);
366
367 /* compute the set in question */
368 if(expect) {
369 /* a-b */
370 uset_removeAll(aa, b);
371 } else {
372 /* a&b */
373 uset_retainAll(aa, b);
374 }
375
376 /* aa is not empty because of the initial tests above; show its contents */
377 errorCode=U_ZERO_ERROR;
378 i=0;
379 for(;;) {
380 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
381 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
382 break; /* done */
383 }
384 if(U_FAILURE(errorCode)) {
385 log_err("error comparing %s with %s at difference item %d: %s\n",
386 a_name, b_name, i, u_errorName(errorCode));
387 break;
388 }
389 if(length!=0) {
390 break; /* done with code points, got a string or -1 */
391 }
392
393 if(diffIsError) {
394 if(expect) {
395 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
396 } else {
397 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
398 }
399 } else {
400 if(expect) {
401 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
402 } else {
403 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
404 }
405 }
406
407 ++i;
408 }
409
410 uset_close(aa);
411 return FALSE;
412 }
413
414 static UBool
showAMinusB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)415 showAMinusB(const USet *a, const USet *b,
416 const char *a_name, const char *b_name,
417 UBool diffIsError) {
418 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
419 }
420
421 static UBool
showAIntersectB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)422 showAIntersectB(const USet *a, const USet *b,
423 const char *a_name, const char *b_name,
424 UBool diffIsError) {
425 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
426 }
427
428 static UBool
compareUSets(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)429 compareUSets(const USet *a, const USet *b,
430 const char *a_name, const char *b_name,
431 UBool diffIsError) {
432 /*
433 * Use an arithmetic & not a logical && so that both branches
434 * are always taken and all differences are shown.
435 */
436 return
437 showAMinusB(a, b, a_name, b_name, diffIsError) &
438 showAMinusB(b, a, b_name, a_name, diffIsError);
439 }
440
441 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
TestLetterNumber()442 static void TestLetterNumber()
443 {
444 UChar i = 0x0000;
445
446 log_verbose("Testing for isalpha\n");
447 for (i = 0x0041; i < 0x005B; i++) {
448 if (!u_isalpha(i))
449 {
450 log_err("Failed isLetter test at %.4X\n", i);
451 }
452 }
453 for (i = 0x0660; i < 0x066A; i++) {
454 if (u_isalpha(i))
455 {
456 log_err("Failed isLetter test with numbers at %.4X\n", i);
457 }
458 }
459
460 log_verbose("Testing for isdigit\n");
461 for (i = 0x0660; i < 0x066A; i++) {
462 if (!u_isdigit(i))
463 {
464 log_verbose("Failed isNumber test at %.4X\n", i);
465 }
466 }
467
468 log_verbose("Testing for isalnum\n");
469 for (i = 0x0041; i < 0x005B; i++) {
470 if (!u_isalnum(i))
471 {
472 log_err("Failed isAlNum test at %.4X\n", i);
473 }
474 }
475 for (i = 0x0660; i < 0x066A; i++) {
476 if (!u_isalnum(i))
477 {
478 log_err("Failed isAlNum test at %.4X\n", i);
479 }
480 }
481
482 {
483 /*
484 * The following checks work only starting from Unicode 4.0.
485 * Check the version number here.
486 */
487 static UVersionInfo u401={ 4, 0, 1, 0 };
488 UVersionInfo version;
489 u_getUnicodeVersion(version);
490 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
491 return;
492 }
493 }
494
495 {
496 /*
497 * Sanity check:
498 * Verify that exactly the digit characters have decimal digit values.
499 * This assumption is used in the implementation of u_digit()
500 * (which checks nt=de)
501 * compared with the parallel java.lang.Character.digit()
502 * (which checks Nd).
503 *
504 * This was not true in Unicode 3.2 and earlier.
505 * Unicode 4.0 fixed discrepancies.
506 * Unicode 4.0.1 re-introduced problems in this area due to an
507 * unintentionally incomplete last-minute change.
508 */
509 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
510 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
511
512 USet *digits, *decimalValues;
513 UErrorCode errorCode;
514
515 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
516 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
517 errorCode=U_ZERO_ERROR;
518 digits=uset_openPattern(digitsPattern, 6, &errorCode);
519 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
520
521 if(U_SUCCESS(errorCode)) {
522 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
523 }
524
525 uset_close(digits);
526 uset_close(decimalValues);
527 }
528 }
529
/*
 * Calls propFn for each of the sampleCharsLength code points in sampleChars
 * and reports, via log_err(), every code point whose result differs from
 * expected. propName is used only in the error message.
 */
static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
                                const UChar32 *sampleChars, int32_t sampleCharsLength,
                                UBool expected) {
    int32_t idx = 0;
    while (idx < sampleCharsLength) {
        UChar32 sample = sampleChars[idx++];
        UBool actual = propFn(sample);
        if (actual == expected) {
            continue;
        }
        log_err("error: character property function %s(U+%04x)=%d is wrong\n",
                propName, sample, actual);
    }
}
542
543 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
TestMisc()544 static void TestMisc()
545 {
546 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
547 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
548 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
549 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
550 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
551 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
552 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
553 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
554 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
555 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
556 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
557
558 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
559
560 uint32_t mask;
561
562 int32_t i;
563 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
564 UVersionInfo realVersion;
565
566 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
567
568 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
569 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
570
571 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
572 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
573 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
574 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
575
576 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
577 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
578 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
579 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
580
581 testSampleCharProps(u_isdefined, "u_isdefined",
582 sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
583 testSampleCharProps(u_isdefined, "u_isdefined",
584 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
585
586 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
587 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
588
589 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
590 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
591
592 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
593 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
594 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
595 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
596 }
597 }
598
599 /* Tests the ICU version #*/
600 u_getVersion(realVersion);
601 u_versionToString(realVersion, icuVersion);
602 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
603 {
604 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
605 }
606 #if defined(ICU_VERSION)
607 /* test only happens where we have configure.in with VERSION - sanity check. */
608 if(strcmp(U_ICU_VERSION, ICU_VERSION))
609 {
610 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
611 }
612 #endif
613
614 /* test U_GC_... */
615 if(
616 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
617 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
618 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
619 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
620 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
621 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
622 ) {
623 log_err("error: U_GET_GC_MASK does not work properly\n");
624 }
625
626 mask=0;
627 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
628
629 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
630 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
631 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
632 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
633 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
634
635 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
636 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
637 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
638
639 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
640 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
641 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
642
643 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
644 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
645 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
646
647 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
648 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
649 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
650 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
651
652 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
653 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
654 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
655 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
656 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
657
658 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
659 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
660 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
661 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
662
663 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
664 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
665
666 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
667 log_err("error: problems with U_GC_XX_MASK constants\n");
668 }
669
670 mask=0;
671 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
672 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
673 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
674 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
675 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
676 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
677 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
678
679 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
680 log_err("error: problems with U_GC_Y_MASK constants\n");
681 }
682 {
683 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
684 for(i=0; i<10; i++){
685 if(digit[i]!=u_forDigit(i,10)){
686 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
687 }
688 }
689 }
690
691 /* test u_digit() */
692 {
693 static const struct {
694 UChar32 c;
695 int8_t radix, value;
696 } data[]={
697 /* base 16 */
698 { 0x0031, 16, 1 },
699 { 0x0038, 16, 8 },
700 { 0x0043, 16, 12 },
701 { 0x0066, 16, 15 },
702 { 0x00e4, 16, -1 },
703 { 0x0662, 16, 2 },
704 { 0x06f5, 16, 5 },
705 { 0xff13, 16, 3 },
706 { 0xff41, 16, 10 },
707
708 /* base 8 */
709 { 0x0031, 8, 1 },
710 { 0x0038, 8, -1 },
711 { 0x0043, 8, -1 },
712 { 0x0066, 8, -1 },
713 { 0x00e4, 8, -1 },
714 { 0x0662, 8, 2 },
715 { 0x06f5, 8, 5 },
716 { 0xff13, 8, 3 },
717 { 0xff41, 8, -1 },
718
719 /* base 36 */
720 { 0x5a, 36, 35 },
721 { 0x7a, 36, 35 },
722 { 0xff3a, 36, 35 },
723 { 0xff5a, 36, 35 },
724
725 /* wrong radix values */
726 { 0x0031, 1, -1 },
727 { 0xff3a, 37, -1 }
728 };
729
730 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
731 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
732 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
733 data[i].c,
734 data[i].radix,
735 u_digit(data[i].c, data[i].radix),
736 data[i].value);
737 }
738 }
739 }
740 }
741
742 /* test C/POSIX-style functions --------------------------------------------- */
743
/*
 * bit flags
 *
 * One flag per C/POSIX-style classification function. TestPOSIX() starts with
 * mask 1 for posixClasses[0] and shifts it left once per table entry, so flag
 * bit k selects posixClasses[k].
 */
#define ISAL 1      /* u_isalpha */
#define ISLO 2      /* u_islower */
#define ISUP 4      /* u_isupper */

#define ISDI 8      /* u_isdigit */
#define ISXD 0x10   /* u_isxdigit */

#define ISAN 0x20   /* u_isalnum */

#define ISPU 0x40   /* u_ispunct */
#define ISGR 0x80   /* u_isgraph */
#define ISPR 0x100  /* u_isprint */

#define ISSP 0x200  /* u_isspace */
#define ISBL 0x400  /* u_isblank */
#define ISCN 0x800  /* u_iscntrl */
761
/* C/POSIX-style functions, in the same order as the bit flags */
/* Signature shared by all classification functions in the table below. */
typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);

/*
 * Table of the functions exercised by TestPOSIX().
 * fn is the function to call; name is its name without the "u_" prefix,
 * which TestPOSIX() adds back when it prints an error message.
 */
static const struct {
    IsPOSIXClass *fn;
    const char *name;
} posixClasses[]={
    { u_isalpha, "isalpha" },
    { u_islower, "islower" },
    { u_isupper, "isupper" },
    { u_isdigit, "isdigit" },
    { u_isxdigit, "isxdigit" },
    { u_isalnum, "isalnum" },
    { u_ispunct, "ispunct" },
    { u_isgraph, "isgraph" },
    { u_isprint, "isprint" },
    { u_isspace, "isspace" },
    { u_isblank, "isblank" },
    { u_iscntrl, "iscntrl" }
};
782
/*
 * Expected results for TestPOSIX().
 * c is the code point to classify; posixResults is the OR of the IS.. bit
 * flags of exactly those classification functions that must return TRUE for c
 * (a function whose flag is absent must return FALSE).
 */
static const struct {
    UChar32 c;
    uint32_t posixResults;
} posixData[]={
    { 0x0008,                                                        ISCN },    /* backspace */
    { 0x0009,                                              ISSP|ISBL|ISCN },    /* TAB */
    { 0x000a,                                              ISSP|     ISCN },    /* LF */
    { 0x000c,                                              ISSP|     ISCN },    /* FF */
    { 0x000d,                                              ISSP|     ISCN },    /* CR */
    { 0x0020,                                         ISPR|ISSP|ISBL      },    /* space */
    { 0x0021,                               ISPU|ISGR|ISPR                },    /* ! */
    { 0x0033,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* 3 */
    { 0x0040,                               ISPU|ISGR|ISPR                },    /* @ */
    { 0x0041, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* A */
    { 0x007a, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* z */
    { 0x007b,                               ISPU|ISGR|ISPR                },    /* { */
    { 0x0085,                                              ISSP|     ISCN },    /* NEL */
    { 0x00a0,                                         ISPR|ISSP|ISBL      },    /* NBSP */
    { 0x00a4,                                    ISGR|ISPR                },    /* currency sign */
    { 0x00e4, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* a-umlaut */
    { 0x0300,                                    ISGR|ISPR                },    /* combining grave */
    { 0x0600,                                                        ISCN },    /* arabic number sign */
    { 0x0627, ISAL|                    ISAN|     ISGR|ISPR                },    /* alef */
    { 0x0663,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* arabic 3 */
    { 0x2002,                                         ISPR|ISSP|ISBL      },    /* en space */
    { 0x2007,                                         ISPR|ISSP|ISBL      },    /* figure space */
    { 0x2009,                                         ISPR|ISSP|ISBL      },    /* thin space */
    { 0x200b,                                                        ISCN },    /* ZWSP */
    /*{ 0x200b,                                       ISPR|ISSP           },*/    /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
    { 0x200e,                                                        ISCN },    /* LRM */
    { 0x2028,                                         ISPR|ISSP|     ISCN },    /* LS */
    { 0x2029,                                         ISPR|ISSP|     ISCN },    /* PS */
    { 0x20ac,                                    ISGR|ISPR                },    /* Euro */
    { 0xff15,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth 5 */
    { 0xff25, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth E */
    { 0xff35, ISAL|     ISUP|          ISAN|     ISGR|ISPR                },    /* fullwidth U */
    { 0xff45, ISAL|ISLO|          ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth e */
    { 0xff55, ISAL|ISLO|               ISAN|     ISGR|ISPR                }     /* fullwidth u */
};
822
823 static void
TestPOSIX()824 TestPOSIX() {
825 uint32_t mask;
826 int32_t cl, i;
827 UBool expect;
828
829 mask=1;
830 for(cl=0; cl<12; ++cl) {
831 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
832 expect=(UBool)((posixData[i].posixResults&mask)!=0);
833 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
834 log_err("u_%s(U+%04x)=%s is wrong\n",
835 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
836 }
837 }
838 mask<<=1;
839 }
840 }
841
842 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
TestControlPrint()843 static void TestControlPrint()
844 {
845 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
846 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
847 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
848 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
849 UChar32 c;
850
851 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
852 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
853
854 testSampleCharProps(u_isprint, "u_isprint",
855 samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
856 testSampleCharProps(u_isprint, "u_isprint",
857 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
858
859 /* test all ISO 8 controls */
860 for(c=0; c<=0x9f; ++c) {
861 if(c==0x20) {
862 /* skip ASCII graphic characters and continue with DEL */
863 c=0x7f;
864 }
865 if(!u_iscntrl(c)) {
866 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
867 }
868 if(!u_isISOControl(c)) {
869 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
870 }
871 if(u_isprint(c)) {
872 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
873 }
874 }
875
876 /* test all Latin-1 graphic characters */
877 for(c=0x20; c<=0xff; ++c) {
878 if(c==0x7f) {
879 c=0xa0;
880 } else if(c==0xad) {
881 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
882 ++c;
883 }
884 if(!u_isprint(c)) {
885 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
886 }
887 }
888 }
889
890 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
TestIdentifier()891 static void TestIdentifier()
892 {
893 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
894 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
895 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
896 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
897 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
898 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
899 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
900 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
901 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
902 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
903
904 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
905 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
906 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
907 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
908
909 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
910 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
911 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
912 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
913
914 /* IDPart should imply IDStart */
915 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
916 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
917
918 testSampleCharProps(u_isIDStart, "u_isIDStart",
919 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
920 testSampleCharProps(u_isIDStart, "u_isIDStart",
921 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
922
923 testSampleCharProps(u_isIDPart, "u_isIDPart",
924 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
925 testSampleCharProps(u_isIDPart, "u_isIDPart",
926 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
927
928 /* IDPart should imply IDStart */
929 testSampleCharProps(u_isIDPart, "u_isIDPart",
930 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
931
932 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
933 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
934 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
935 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
936 }
937
938 /* for each line of UnicodeData.txt, check some of the properties */
/*
 * Context structure for the UnicodeData.txt line callback.
 * With normalization compiled in it carries two normalizer pointers (nfc and
 * nfkc); with UCONFIG_NO_NORMALIZATION it has only a placeholder member so
 * that the struct is not empty.
 */
typedef struct UnicodeDataContext {
#if UCONFIG_NO_NORMALIZATION
    const void *dummy;          /* placeholder, no normalizers available */
#else
    const UNormalizer2 *nfc;    /* NOTE(review): presumably the NFC instance -- confirm where the context is filled in */
    const UNormalizer2 *nfkc;   /* NOTE(review): presumably the NFKC instance -- confirm where the context is filled in */
#endif
} UnicodeDataContext;
947
948 /*
949 * ### TODO
950 * This test fails incorrectly if the First or Last code point of a repetitive area
951 * is overridden, which is allowed and is encouraged for the PUAs.
952 * Currently, this means that both area First/Last and override lines are
953 * tested against the properties from the API,
954 * and the area boundary will not match and cause an error.
955 *
956 * This function should detect area boundaries and skip them for the test of individual
957 * code points' properties.
958 * Then it should check that the areas contain all the same properties except where overridden.
959 * For this, it would have had to set a flag for which code points were listed explicitly.
960 */
961 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)962 unicodeDataLineFn(void *context,
963 char *fields[][2], int32_t fieldCount,
964 UErrorCode *pErrorCode)
965 {
966 (void)fieldCount; // suppress compiler warnings about unused variable
967 char buffer[100];
968 const char *d;
969 char *end;
970 uint32_t value;
971 UChar32 c;
972 int32_t i;
973 int8_t type;
974 int32_t dt;
975 UChar dm[32], s[32];
976 int32_t dmLength, length;
977
978 #if !UCONFIG_NO_NORMALIZATION
979 const UNormalizer2 *nfc, *nfkc;
980 #endif
981
982 /* get the character code, field 0 */
983 c=strtoul(fields[0][0], &end, 16);
984 if(end<=fields[0][0] || end!=fields[0][1]) {
985 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
986 return;
987 }
988 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
989 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
990 return;
991 }
992
993 /* get general category, field 2 */
994 *fields[2][1]=0;
995 type = (int8_t)tagValues[MakeProp(fields[2][0])];
996 if(u_charType(c)!=type) {
997 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
998 }
999 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1000 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1001 }
1002
1003 /* get canonical combining class, field 3 */
1004 value=strtoul(fields[3][0], &end, 10);
1005 if(end<=fields[3][0] || end!=fields[3][1]) {
1006 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1007 return;
1008 }
1009 if(value>255) {
1010 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1011 return;
1012 }
1013 #if !UCONFIG_NO_NORMALIZATION
1014 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1015 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1016 }
1017 nfkc=((UnicodeDataContext *)context)->nfkc;
1018 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1019 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1020 }
1021 #endif
1022
1023 /* get BiDi category, field 4 */
1024 *fields[4][1]=0;
1025 i=MakeDir(fields[4][0]);
1026 if(i!=(int32_t)u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1027 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1028 }
1029
1030 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1031 d=NULL;
1032 if(fields[5][0]==fields[5][1]) {
1033 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1034 if(c==0xac00 || c==0xd7a3) {
1035 dt=U_DT_CANONICAL;
1036 } else {
1037 dt=U_DT_NONE;
1038 }
1039 } else {
1040 d=fields[5][0];
1041 *fields[5][1]=0;
1042 dt=UCHAR_INVALID_CODE;
1043 if(*d=='<') {
1044 end=strchr(++d, '>');
1045 if(end!=NULL) {
1046 *end=0;
1047 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1048 d=u_skipWhitespace(end+1);
1049 }
1050 } else {
1051 dt=U_DT_CANONICAL;
1052 }
1053 }
1054 if(dt>U_DT_NONE) {
1055 if(c==0xac00) {
1056 dm[0]=0x1100;
1057 dm[1]=0x1161;
1058 dm[2]=0;
1059 dmLength=2;
1060 } else if(c==0xd7a3) {
1061 dm[0]=0xd788;
1062 dm[1]=0x11c2;
1063 dm[2]=0;
1064 dmLength=2;
1065 } else {
1066 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1067 }
1068 } else {
1069 dmLength=-1;
1070 }
1071 if(dt<0 || U_FAILURE(*pErrorCode)) {
1072 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1073 return;
1074 }
1075 #if !UCONFIG_NO_NORMALIZATION
1076 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1077 if(i!=dt) {
1078 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1079 }
1080 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1081 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1082 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1083 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1084 "or the Decomposition_Mapping is different (%s)\n",
1085 c, length, dmLength, u_errorName(*pErrorCode));
1086 return;
1087 }
1088 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1089 if(dt!=U_DT_CANONICAL) {
1090 dmLength=-1;
1091 }
1092 nfc=((UnicodeDataContext *)context)->nfc;
1093 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1094 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1095 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1096 "or the Decomposition_Mapping is different (%s)\n",
1097 c, length, dmLength, u_errorName(*pErrorCode));
1098 return;
1099 }
1100 /* recompose */
1101 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1102 UChar32 a, b, composite;
1103 i=0;
1104 U16_NEXT(dm, i, dmLength, a);
1105 U16_NEXT(dm, i, dmLength, b);
1106 /* i==dmLength */
1107 composite=unorm2_composePair(nfc, a, b);
1108 if(composite!=c) {
1109 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1110 (long)c, (long)a, (long)b, (long)composite);
1111 }
1112 /*
1113 * Note: NFKC has fewer round-trip mappings than NFC,
1114 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1115 */
1116 }
1117 #endif
1118
1119 /* get ISO Comment, field 11 */
1120 *fields[11][1]=0;
1121 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1122 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1123 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1124 c, u_errorName(*pErrorCode),
1125 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1126 fields[11][0]);
1127 }
1128
1129 /* get uppercase mapping, field 12 */
1130 if(fields[12][0]!=fields[12][1]) {
1131 value=strtoul(fields[12][0], &end, 16);
1132 if(end!=fields[12][1]) {
1133 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1134 return;
1135 }
1136 if((UChar32)value!=u_toupper(c)) {
1137 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1138 }
1139 } else {
1140 /* no case mapping: the API must map the code point to itself */
1141 if(c!=u_toupper(c)) {
1142 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1143 }
1144 }
1145
1146 /* get lowercase mapping, field 13 */
1147 if(fields[13][0]!=fields[13][1]) {
1148 value=strtoul(fields[13][0], &end, 16);
1149 if(end!=fields[13][1]) {
1150 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1151 return;
1152 }
1153 if((UChar32)value!=u_tolower(c)) {
1154 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1155 }
1156 } else {
1157 /* no case mapping: the API must map the code point to itself */
1158 if(c!=u_tolower(c)) {
1159 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1160 }
1161 }
1162
1163 /* get titlecase mapping, field 14 */
1164 if(fields[14][0]!=fields[14][1]) {
1165 value=strtoul(fields[14][0], &end, 16);
1166 if(end!=fields[14][1]) {
1167 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1168 return;
1169 }
1170 if((UChar32)value!=u_totitle(c)) {
1171 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1172 }
1173 } else {
1174 /* no case mapping: the API must map the code point to itself */
1175 if(c!=u_totitle(c)) {
1176 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1177 }
1178 }
1179 }
1180
1181 static UBool U_CALLCONV
enumTypeRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1182 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1183 static const UChar32 test[][2]={
1184 {0x41, U_UPPERCASE_LETTER},
1185 {0x308, U_NON_SPACING_MARK},
1186 {0xfffe, U_GENERAL_OTHER_TYPES},
1187 {0xe0041, U_FORMAT_CHAR},
1188 {0xeffff, U_UNASSIGNED}
1189 };
1190
1191 int32_t i, count;
1192
1193 if(0!=strcmp((const char *)context, "a1")) {
1194 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1195 return FALSE;
1196 }
1197
1198 count=UPRV_LENGTHOF(test);
1199 for(i=0; i<count; ++i) {
1200 if(start<=test[i][0] && test[i][0]<limit) {
1201 if(type!=(UCharCategory)test[i][1]) {
1202 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1203 start, limit, (long)type, test[i][0], test[i][1]);
1204 }
1205 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1206 return i==(count-1) ? FALSE : TRUE;
1207 }
1208 }
1209
1210 if(start>test[count-1][0]) {
1211 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1212 start, limit, (long)type);
1213 return FALSE;
1214 }
1215
1216 return TRUE;
1217 }
1218
1219 static UBool U_CALLCONV
enumDefaultsRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1220 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1221 (void)context; // suppress compiler warnings about unused variable
1222
1223 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1224 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1225 { 0x0590, U_LEFT_TO_RIGHT },
1226 { 0x0600, U_RIGHT_TO_LEFT },
1227 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1228 { 0x0860, U_RIGHT_TO_LEFT },
1229 { 0x0870, U_RIGHT_TO_LEFT_ARABIC }, // Unicode 10 changes U+0860..U+086F from R to AL.
1230 { 0x08A0, U_RIGHT_TO_LEFT },
1231 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1232 { 0x20A0, U_LEFT_TO_RIGHT },
1233 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1234 { 0xFB1D, U_LEFT_TO_RIGHT },
1235 { 0xFB50, U_RIGHT_TO_LEFT },
1236 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1237 { 0xFE70, U_LEFT_TO_RIGHT },
1238 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1239
1240 { 0x10800, U_LEFT_TO_RIGHT },
1241 { 0x10D00, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10D00..U+10D3F from R to AL.
1242 { 0x10D40, U_RIGHT_TO_LEFT_ARABIC },
1243 { 0x10F30, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10F30..U+10F6F from R to AL.
1244 { 0x10F70, U_RIGHT_TO_LEFT_ARABIC },
1245 { 0x11000, U_RIGHT_TO_LEFT },
1246
1247 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1248 { 0x1EC70, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+1EC70..U+1ECBF from R to AL.
1249 { 0x1ECC0, U_RIGHT_TO_LEFT_ARABIC },
1250 { 0x1ED00, U_RIGHT_TO_LEFT }, // Unicode 12 changes U+1ED00..U+1ED4F from R to AL.
1251 { 0x1ED50, U_RIGHT_TO_LEFT_ARABIC },
1252 { 0x1EE00, U_RIGHT_TO_LEFT },
1253 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1254 { 0x1F000, U_RIGHT_TO_LEFT },
1255 { 0x110000, U_LEFT_TO_RIGHT }
1256 };
1257
1258 UChar32 c;
1259 int32_t i;
1260 UCharDirection shouldBeDir;
1261
1262 /*
1263 * LineBreak.txt specifies:
1264 * # - Assigned characters that are not listed explicitly are given the value
1265 * # "AL".
1266 * # - Unassigned characters are given the value "XX".
1267 *
1268 * PUA characters are listed explicitly with "XX".
1269 * Verify that no assigned character has "XX".
1270 */
1271 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1272 c=start;
1273 while(c<limit) {
1274 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1275 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1276 }
1277 ++c;
1278 }
1279 }
1280
1281 /*
1282 * Verify default Bidi classes.
1283 * See DerivedBidiClass.txt, especially for unassigned code points.
1284 */
1285 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1286 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1287 c=start;
1288 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1289 if((int32_t)c<defaultBidi[i][0]) {
1290 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1291 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1292 shouldBeDir=U_BOUNDARY_NEUTRAL;
1293 } else {
1294 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1295 }
1296
1297 if( u_charDirection(c)!=shouldBeDir ||
1298 (UCharDirection)u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1299 ) {
1300 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1301 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1302 }
1303 ++c;
1304 }
1305 }
1306 }
1307 }
1308
1309 return TRUE;
1310 }
1311
1312 /* tests for several properties */
TestUnicodeData()1313 static void TestUnicodeData()
1314 {
1315 UVersionInfo expectVersionArray;
1316 UVersionInfo versionArray;
1317 char *fields[15][2];
1318 UErrorCode errorCode;
1319 UChar32 c;
1320 int8_t type;
1321
1322 UnicodeDataContext context;
1323
1324 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1325 u_getUnicodeVersion(versionArray);
1326 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1327 {
1328 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1329 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1330 }
1331
1332 #if defined(ICU_UNICODE_VERSION)
1333 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1334 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1335 {
1336 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1337 }
1338 #endif
1339
1340 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1341 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1342 }
1343
1344 errorCode=U_ZERO_ERROR;
1345 #if !UCONFIG_NO_NORMALIZATION
1346 context.nfc=unorm2_getNFCInstance(&errorCode);
1347 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1348 if(U_FAILURE(errorCode)) {
1349 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1350 return;
1351 }
1352 #endif
1353 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1354 if(U_FAILURE(errorCode)) {
1355 return; /* if we couldn't parse UnicodeData.txt, we should return */
1356 }
1357
1358 /* sanity check on repeated properties */
1359 for(c=0xfffe; c<=0x10ffff;) {
1360 type=u_charType(c);
1361 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1362 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1363 }
1364 if(type!=U_UNASSIGNED) {
1365 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1366 }
1367 if((c&0xffff)==0xfffe) {
1368 ++c;
1369 } else {
1370 c+=0xffff;
1371 }
1372 }
1373
1374 /* test that PUA is not "unassigned" */
1375 for(c=0xe000; c<=0x10fffd;) {
1376 type=u_charType(c);
1377 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1378 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1379 }
1380 if(type==U_UNASSIGNED) {
1381 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1382 } else if(type!=U_PRIVATE_USE_CHAR) {
1383 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1384 }
1385 if(c==0xf8ff) {
1386 c=0xf0000;
1387 } else if(c==0xffffd) {
1388 c=0x100000;
1389 } else {
1390 ++c;
1391 }
1392 }
1393
1394 /* test u_enumCharTypes() */
1395 u_enumCharTypes(enumTypeRange, "a1");
1396
1397 /* check default properties */
1398 u_enumCharTypes(enumDefaultsRange, NULL);
1399 }
1400
TestCodeUnit()1401 static void TestCodeUnit(){
1402 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1403
1404 int32_t i;
1405
1406 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1407 UChar c=codeunit[i];
1408 if(i<4){
1409 if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1410 U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1411 log_err("ERROR: U+%04x is a single", c);
1412 }
1413
1414 }
1415 if(i >= 4 && i< 8){
1416 if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1417 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1418 log_err("ERROR: U+%04x is a first surrogate", c);
1419 }
1420 }
1421 if(i >= 8 && i< 12){
1422 if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1423 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1424 log_err("ERROR: U+%04x is a second surrogate", c);
1425 }
1426 }
1427 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1428 if(i<4){
1429 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1430 log_err("ERROR: U+%04x is a single", c);
1431 }
1432
1433 }
1434 if(i >= 4 && i< 8){
1435 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1436 log_err("ERROR: U+%04x is a first surrogate", c);
1437 }
1438 }
1439 if(i >= 8 && i< 12){
1440 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1441 log_err("ERROR: U+%04x is a second surrogate", c);
1442 }
1443 }
1444 #endif
1445 }
1446 }
1447
TestCodePoint()1448 static void TestCodePoint(){
1449 const UChar32 codePoint[]={
1450 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1451 0xd800,
1452 0xdbff,
1453 0xdc00,
1454 0xdfff,
1455 0xdc04,
1456 0xd821,
1457 /*not a surrogate, valid, isUnicodeChar , not Error*/
1458 0x20ac,
1459 0xd7ff,
1460 0xe000,
1461 0xe123,
1462 0x0061,
1463 0xe065,
1464 0x20402,
1465 0x24506,
1466 0x23456,
1467 0x20402,
1468 0x10402,
1469 0x23456,
1470 /*not a surrogate, not valid, isUnicodeChar, isError */
1471 0x0015,
1472 0x009f,
1473 /*not a surrogate, not valid, not isUnicodeChar, isError */
1474 0xffff,
1475 0xfffe,
1476 };
1477 int32_t i;
1478 for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
1479 UChar32 c=codePoint[i];
1480 if(i<6) {
1481 if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1482 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1483 }
1484 if(U_IS_UNICODE_CHAR(c)) {
1485 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1486 }
1487 } else if(i >=6 && i<18) {
1488 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1489 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1490 }
1491 if(!U_IS_UNICODE_CHAR(c)) {
1492 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1493 }
1494 } else if(i >=18 && i<20) {
1495 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1496 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1497 }
1498 if(!U_IS_UNICODE_CHAR(c)) {
1499 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1500 }
1501 } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1502 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1503 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1504 }
1505 if(U_IS_UNICODE_CHAR(c)) {
1506 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1507 }
1508 }
1509 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1510 if(i<6){
1511 if(!UTF_IS_SURROGATE(c)){
1512 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1513 }
1514 if(UTF_IS_VALID(c)){
1515 log_err("ERROR: isValid() failed for U+%04x\n", c);
1516 }
1517 if(UTF_IS_UNICODE_CHAR(c)){
1518 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1519 }
1520 if(UTF_IS_ERROR(c)){
1521 log_err("ERROR: isError() failed for U+%04x\n", c);
1522 }
1523 }else if(i >=6 && i<18){
1524 if(UTF_IS_SURROGATE(c)){
1525 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1526 }
1527 if(!UTF_IS_VALID(c)){
1528 log_err("ERROR: isValid() failed for U+%04x\n", c);
1529 }
1530 if(!UTF_IS_UNICODE_CHAR(c)){
1531 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1532 }
1533 if(UTF_IS_ERROR(c)){
1534 log_err("ERROR: isError() failed for U+%04x\n", c);
1535 }
1536 }else if(i >=18 && i<20){
1537 if(UTF_IS_SURROGATE(c)){
1538 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1539 }
1540 if(UTF_IS_VALID(c)){
1541 log_err("ERROR: isValid() failed for U+%04x\n", c);
1542 }
1543 if(!UTF_IS_UNICODE_CHAR(c)){
1544 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1545 }
1546 if(!UTF_IS_ERROR(c)){
1547 log_err("ERROR: isError() failed for U+%04x\n", c);
1548 }
1549 }
1550 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1551 if(UTF_IS_SURROGATE(c)){
1552 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1553 }
1554 if(UTF_IS_VALID(c)){
1555 log_err("ERROR: isValid() failed for U+%04x\n", c);
1556 }
1557 if(UTF_IS_UNICODE_CHAR(c)){
1558 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1559 }
1560 if(!UTF_IS_ERROR(c)){
1561 log_err("ERROR: isError() failed for U+%04x\n", c);
1562 }
1563 }
1564 #endif
1565 }
1566
1567 if(
1568 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1569 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1570 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1571 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1572 ) {
1573 log_err("error with U_IS_BMP()\n");
1574 }
1575
1576 if(
1577 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1578 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1579 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1580 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1581 ) {
1582 log_err("error with U_IS_SUPPLEMENTARY()\n");
1583 }
1584 }
1585
TestCharLength()1586 static void TestCharLength()
1587 {
1588 const int32_t codepoint[]={
1589 1, 0x0061,
1590 1, 0xe065,
1591 1, 0x20ac,
1592 2, 0x20402,
1593 2, 0x23456,
1594 2, 0x24506,
1595 2, 0x20402,
1596 2, 0x10402,
1597 1, 0xd7ff,
1598 1, 0xe000
1599 };
1600
1601 int32_t i;
1602 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1603 UBool multiple;
1604 #endif
1605 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1606 UChar32 c=codepoint[i+1];
1607 if(
1608 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1609 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1610 #endif
1611 U16_LENGTH(c) != codepoint[i]) {
1612 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1613 }
1614 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1615 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1616 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1617 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1618 }
1619 #endif
1620 }
1621 }
1622
1623 /*internal functions ----*/
MakeProp(char * str)1624 static int32_t MakeProp(char* str)
1625 {
1626 int32_t result = 0;
1627 char* matchPosition =0;
1628
1629 matchPosition = strstr(tagStrings, str);
1630 if (matchPosition == 0)
1631 {
1632 log_err("unrecognized type letter ");
1633 log_err(str);
1634 }
1635 else
1636 result = (int32_t)((matchPosition - tagStrings) / 2);
1637 return result;
1638 }
1639
MakeDir(char * str)1640 static int32_t MakeDir(char* str)
1641 {
1642 int32_t pos = 0;
1643 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1644 if (strcmp(str, dirStrings[pos]) == 0) {
1645 return pos;
1646 }
1647 }
1648 return -1;
1649 }
1650
1651 /* test u_charName() -------------------------------------------------------- */
1652
1653 static const struct {
1654 uint32_t code;
1655 const char *name, *oldName, *extName, *alias;
1656 } names[]={
1657 {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A", NULL},
1658 {0x01a2, "LATIN CAPITAL LETTER OI", "",
1659 "LATIN CAPITAL LETTER OI",
1660 "LATIN CAPITAL LETTER GHA"},
1661 {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
1662 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", NULL},
1663 {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
1664 "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
1665 "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
1666 {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401", NULL},
1667 {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED", NULL},
1668 {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA", NULL},
1669 {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH", NULL},
1670 {0xd800, "", "", "<lead surrogate-D800>", NULL},
1671 {0xdc00, "", "", "<trail surrogate-DC00>", NULL},
1672 {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS", NULL},
1673 {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN", NULL},
1674 {0xffff, "", "", "<noncharacter-FFFF>", NULL},
1675 {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
1676 "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
1677 "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
1678 {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456", NULL}
1679 };
1680
1681 static UBool
enumCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1682 enumCharNamesFn(void *context,
1683 UChar32 code, UCharNameChoice nameChoice,
1684 const char *name, int32_t length) {
1685 int32_t *pCount=(int32_t *)context;
1686 const char *expected;
1687 int i;
1688
1689 if(length<=0 || length!=(int32_t)strlen(name)) {
1690 /* should not be called with an empty string or invalid length */
1691 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1692 return TRUE;
1693 }
1694
1695 ++*pCount;
1696 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1697 if(code==(UChar32)names[i].code) {
1698 switch (nameChoice) {
1699 case U_EXTENDED_CHAR_NAME:
1700 if(0!=strcmp(name, names[i].extName)) {
1701 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1702 }
1703 break;
1704 case U_UNICODE_CHAR_NAME:
1705 if(0!=strcmp(name, names[i].name)) {
1706 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1707 }
1708 break;
1709 case U_UNICODE_10_CHAR_NAME:
1710 expected=names[i].oldName;
1711 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1712 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1713 }
1714 break;
1715 case U_CHAR_NAME_ALIAS:
1716 expected=names[i].alias;
1717 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1718 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1719 }
1720 break;
1721 case U_CHAR_NAME_CHOICE_COUNT:
1722 break;
1723 }
1724 break;
1725 }
1726 }
1727 return TRUE;
1728 }
1729
1730 struct enumExtCharNamesContext {
1731 uint32_t length;
1732 int32_t last;
1733 };
1734
1735 static UBool
enumExtCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1736 enumExtCharNamesFn(void *context,
1737 UChar32 code, UCharNameChoice nameChoice,
1738 const char *name, int32_t length) {
1739 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1740
1741 if (ecncp->last != (int32_t) code - 1) {
1742 if (ecncp->last < 0) {
1743 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1744 } else {
1745 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1746 }
1747 }
1748 ecncp->last = (int32_t) code;
1749
1750 if (!*name) {
1751 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1752 }
1753
1754 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1755 }
1756
1757 /**
1758 * This can be made more efficient by moving it into putil.c and having
1759 * it directly access the ebcdic translation tables.
1760 * TODO: If we get this method in putil.c, then delete it from here.
1761 */
1762 static UChar
u_charToUChar(char c)1763 u_charToUChar(char c) {
1764 UChar uc;
1765 u_charsToUChars(&c, &uc, 1);
1766 return uc;
1767 }
1768
1769 static void
TestCharNames()1770 TestCharNames() {
1771 static char name[80];
1772 UErrorCode errorCode=U_ZERO_ERROR;
1773 struct enumExtCharNamesContext extContext;
1774 const char *expected;
1775 int32_t length;
1776 UChar32 c;
1777 int32_t i;
1778
1779 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1780 length=uprv_getMaxCharNameLength();
1781 if(length==0) {
1782 /* no names data available */
1783 return;
1784 }
1785 if(length<83) { /* Unicode 3.2 max char name length */
1786 log_err("uprv_getMaxCharNameLength()=%d is too short");
1787 }
1788 /* ### TODO same tests for max ISO comment length as for max name length */
1789
1790 log_verbose("Testing u_charName()\n");
1791 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1792 /* modern Unicode character name */
1793 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1794 if(U_FAILURE(errorCode)) {
1795 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1796 return;
1797 }
1798 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1799 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1800 }
1801
1802 /* find the modern name */
1803 if (*names[i].name) {
1804 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1805 if(U_FAILURE(errorCode)) {
1806 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1807 return;
1808 }
1809 if(c!=(UChar32)names[i].code) {
1810 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1811 }
1812 }
1813
1814 /* Unicode 1.0 character name */
1815 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1816 if(U_FAILURE(errorCode)) {
1817 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1818 return;
1819 }
1820 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1821 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1822 }
1823
1824 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1825 if(names[i].oldName[0]!=0 /* && length>0 */) {
1826 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1827 if(U_FAILURE(errorCode)) {
1828 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1829 return;
1830 }
1831 if(c!=(UChar32)names[i].code) {
1832 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1833 }
1834 }
1835
1836 /* Unicode character name alias */
1837 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1838 if(U_FAILURE(errorCode)) {
1839 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1840 return;
1841 }
1842 expected=names[i].alias;
1843 if(expected==NULL) {
1844 expected="";
1845 }
1846 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1847 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1848 names[i].code, name, length, expected);
1849 }
1850
1851 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1852 if(expected[0]!=0 /* && length>0 */) {
1853 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1854 if(U_FAILURE(errorCode)) {
1855 log_err("u_charFromName(%s - alias) error %s\n",
1856 expected, u_errorName(errorCode));
1857 return;
1858 }
1859 if(c!=(UChar32)names[i].code) {
1860 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1861 expected, c, names[i].code);
1862 }
1863 }
1864 }
1865
1866 /* test u_enumCharNames() */
1867 length=0;
1868 errorCode=U_ZERO_ERROR;
1869 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1870 if(U_FAILURE(errorCode) || length<94140) {
1871 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1872 }
1873
1874 extContext.length = 0;
1875 extContext.last = -1;
1876 errorCode=U_ZERO_ERROR;
1877 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1878 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1879 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1880 }
1881
1882 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1883 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1884 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1885 }
1886
1887 /* Test getCharNameCharacters */
1888 if(!getTestOption(QUICK_OPTION)) {
1889 enum { BUFSIZE = 256 };
1890 UErrorCode ec = U_ZERO_ERROR;
1891 char buf[BUFSIZE];
1892 int32_t maxLength;
1893 UChar32 cp;
1894 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1895 int32_t l1, l2;
1896 UBool map[256];
1897 UBool ok;
1898
1899 USet* set = uset_open(1, 0); /* empty set */
1900 USet* dumb = uset_open(1, 0); /* empty set */
1901
1902 /*
1903 * uprv_getCharNameCharacters() will likely return more lowercase
1904 * letters than actual character names contain because
1905 * it includes all the characters in lowercased names of
1906 * general categories, for the full possible set of extended names.
1907 */
1908 {
1909 USetAdder sa={
1910 NULL,
1911 uset_add,
1912 uset_addRange,
1913 uset_addString,
1914 NULL, /* don't need remove() */
1915 NULL /* don't need removeRange() */
1916 };
1917 sa.set=set;
1918 uprv_getCharNameCharacters(&sa);
1919 }
1920
1921 /* build set the dumb (but sure-fire) way */
1922 for (i=0; i<256; ++i) {
1923 map[i] = FALSE;
1924 }
1925
1926 maxLength=0;
1927 for (cp=0; cp<0x110000; ++cp) {
1928 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1929 buf, BUFSIZE, &ec);
1930 if (U_FAILURE(ec)) {
1931 log_err("FAIL: u_charName failed when it shouldn't\n");
1932 uset_close(set);
1933 uset_close(dumb);
1934 return;
1935 }
1936 if(len>maxLength) {
1937 maxLength=len;
1938 }
1939
1940 for (i=0; i<len; ++i) {
1941 if (!map[(uint8_t) buf[i]]) {
1942 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1943 map[(uint8_t) buf[i]] = TRUE;
1944 }
1945 }
1946
1947 /* test for leading/trailing whitespace */
1948 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1949 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1950 }
1951 }
1952
1953 if(map[(uint8_t)'\t']) {
1954 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1955 }
1956
1957 length=uprv_getMaxCharNameLength();
1958 if(length!=maxLength) {
1959 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1960 length, maxLength);
1961 }
1962
1963 /* compare the sets. Where is my uset_equals?!! */
1964 ok=TRUE;
1965 for(i=0; i<256; ++i) {
1966 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1967 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1968 /* ignore lowercase a-z that are in set but not in dumb */
1969 ok=TRUE;
1970 } else {
1971 ok=FALSE;
1972 break;
1973 }
1974 }
1975 }
1976
1977 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1978 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1979 if (U_FAILURE(ec)) {
1980 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1981 uset_close(set);
1982 uset_close(dumb);
1983 return;
1984 }
1985
1986 if (l1 >= BUFSIZE) {
1987 l1 = BUFSIZE-1;
1988 pat[l1] = 0;
1989 }
1990 if (l2 >= BUFSIZE) {
1991 l2 = BUFSIZE-1;
1992 dumbPat[l2] = 0;
1993 }
1994
1995 if (!ok) {
1996 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1997 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1998 } else if(getTestOption(VERBOSITY_OPTION)) {
1999 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
2000 }
2001
2002 uset_close(set);
2003 uset_close(dumb);
2004 }
2005
2006 /* ### TODO: test error cases and other interesting things */
2007 }
2008
2009 static void
TestUCharFromNameUnderflow()2010 TestUCharFromNameUnderflow() {
2011 // Ticket #10889: Underflow crash when there is no dash.
2012 const char *name="<NO BREAK SPACE>";
2013 UErrorCode errorCode=U_ZERO_ERROR;
2014 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2015 if(U_SUCCESS(errorCode)) {
2016 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2017 name, c, u_errorName(errorCode));
2018 }
2019
2020 // Test related edge cases.
2021 name="<-00a0>";
2022 errorCode=U_ZERO_ERROR;
2023 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2024 if(U_SUCCESS(errorCode)) {
2025 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2026 name, c, u_errorName(errorCode));
2027 }
2028
2029 errorCode=U_ZERO_ERROR;
2030 name="<control->";
2031 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2032 if(U_SUCCESS(errorCode)) {
2033 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2034 name, c, u_errorName(errorCode));
2035 }
2036
2037 errorCode=U_ZERO_ERROR;
2038 name="<control-111111>";
2039 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2040 if(U_SUCCESS(errorCode)) {
2041 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2042 name, c, u_errorName(errorCode));
2043 }
2044
2045 // ICU-20292: integer overflow
2046 errorCode=U_ZERO_ERROR;
2047 name="<noncharacter-10010FFFF>";
2048 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2049 if(U_SUCCESS(errorCode)) {
2050 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2051 name, c, u_errorName(errorCode));
2052 }
2053
2054 errorCode=U_ZERO_ERROR;
2055 name="<noncharacter-00010FFFF>"; // too many digits even if only leading 0s
2056 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2057 if(U_SUCCESS(errorCode)) {
2058 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2059 name, c, u_errorName(errorCode));
2060 }
2061
2062 errorCode=U_ZERO_ERROR;
2063 name="<noncharacter-fFFf>>";
2064 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2065 if(U_SUCCESS(errorCode)) {
2066 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2067 name, c, u_errorName(errorCode));
2068 }
2069 }
2070
2071 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
2072
2073 static void
TestMirroring()2074 TestMirroring() {
2075 USet *set;
2076 UErrorCode errorCode;
2077
2078 UChar32 start, end, c2, c3;
2079 int32_t i;
2080
2081 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2082
2083 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2084
2085 log_verbose("Testing u_isMirrored()\n");
2086 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2087 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2088 )
2089 ) {
2090 log_err("u_isMirrored() does not work correctly\n");
2091 }
2092
2093 log_verbose("Testing u_charMirror()\n");
2094 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
2095 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
2096 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2097 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2098 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2099 )
2100 ) {
2101 log_err("u_charMirror() does not work correctly\n");
2102 }
2103
2104 /* verify that Bidi_Mirroring_Glyph roundtrips */
2105 errorCode=U_ZERO_ERROR;
2106 set=uset_openPattern(mirroredPattern, 17, &errorCode);
2107
2108 if (U_FAILURE(errorCode)) {
2109 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2110 } else {
2111 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2112 do {
2113 c2=u_charMirror(start);
2114 c3=u_charMirror(c2);
2115 if(c3!=start) {
2116 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2117 }
2118 c3=u_getBidiPairedBracket(start);
2119 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2120 if(c3!=start) {
2121 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2122 (long)start);
2123 }
2124 } else {
2125 if(c3!=c2) {
2126 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2127 (long)start, (long)c2);
2128 }
2129 }
2130 } while(++start<=end);
2131 }
2132 }
2133
2134 uset_close(set);
2135 }
2136
2137
2138 struct RunTestData
2139 {
2140 const char *runText;
2141 UScriptCode runCode;
2142 };
2143
2144 typedef struct RunTestData RunTestData;
2145
2146 static void
CheckScriptRuns(UScriptRun * scriptRun,int32_t * runStarts,const RunTestData * testData,int32_t nRuns,const char * prefix)2147 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2148 const char *prefix)
2149 {
2150 int32_t run, runStart, runLimit;
2151 UScriptCode runCode;
2152
2153 /* iterate over all the runs */
2154 run = 0;
2155 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2156 if (runStart != runStarts[run]) {
2157 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2158 prefix, run, runStarts[run], runStart);
2159 }
2160
2161 if (runLimit != runStarts[run + 1]) {
2162 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2163 prefix, run, runStarts[run + 1], runLimit);
2164 }
2165
2166 if (runCode != testData[run].runCode) {
2167 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2168 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2169 }
2170
2171 run += 1;
2172
2173 /* stop when we've seen all the runs we expect to see */
2174 if (run >= nRuns) {
2175 break;
2176 }
2177 }
2178
2179 /* Complain if we didn't see then number of runs we expected */
2180 if (run != nRuns) {
2181 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2182 }
2183 }
2184
2185 static void
TestUScriptRunAPI()2186 TestUScriptRunAPI()
2187 {
2188 static const RunTestData testData1[] = {
2189 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2190 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2191 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2192 {"English (", USCRIPT_LATIN},
2193 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2194 {") ", USCRIPT_LATIN},
2195 {"\\u6F22\\u5B75", USCRIPT_HAN},
2196 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2197 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2198 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2199 };
2200
2201 static const RunTestData testData2[] = {
2202 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2203 };
2204
2205 static const struct {
2206 const RunTestData *testData;
2207 int32_t nRuns;
2208 } testDataEntries[] = {
2209 {testData1, UPRV_LENGTHOF(testData1)},
2210 {testData2, UPRV_LENGTHOF(testData2)}
2211 };
2212
2213 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2214 int32_t testEntry;
2215
2216 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2217 UChar testString[1024];
2218 int32_t runStarts[256];
2219 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2220 const RunTestData *testData = testDataEntries[testEntry].testData;
2221
2222 int32_t run, stringLimit;
2223 UScriptRun *scriptRun = NULL;
2224 UErrorCode err;
2225
2226 /*
2227 * Fill in the test string and the runStarts array.
2228 */
2229 stringLimit = 0;
2230 for (run = 0; run < nTestRuns; run += 1) {
2231 runStarts[run] = stringLimit;
2232 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2233 /*stringLimit -= 1;*/
2234 }
2235
2236 /* The limit of the last run */
2237 runStarts[nTestRuns] = stringLimit;
2238
2239 /*
2240 * Make sure that calling uscript_OpenRun with a NULL text pointer
2241 * and a non-zero text length returns the correct error.
2242 */
2243 err = U_ZERO_ERROR;
2244 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2245
2246 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2247 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2248 }
2249
2250 if (scriptRun != NULL) {
2251 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2252 uscript_closeRun(scriptRun);
2253 }
2254
2255 /*
2256 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2257 * and a zero text length returns the correct error.
2258 */
2259 err = U_ZERO_ERROR;
2260 scriptRun = uscript_openRun(testString, 0, &err);
2261
2262 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2263 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2264 }
2265
2266 if (scriptRun != NULL) {
2267 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2268 uscript_closeRun(scriptRun);
2269 }
2270
2271 /*
2272 * Make sure that calling uscript_openRun with a NULL text pointer
2273 * and a zero text length doesn't return an error.
2274 */
2275 err = U_ZERO_ERROR;
2276 scriptRun = uscript_openRun(NULL, 0, &err);
2277
2278 if (U_FAILURE(err)) {
2279 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2280 }
2281
2282 /* Make sure that the empty iterator doesn't find any runs */
2283 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2284 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2285 }
2286
2287 /*
2288 * Make sure that calling uscript_setRunText with a NULL text pointer
2289 * and a non-zero text length returns the correct error.
2290 */
2291 err = U_ZERO_ERROR;
2292 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2293
2294 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2295 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2296 }
2297
2298 /*
2299 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2300 * and a zero text length returns the correct error.
2301 */
2302 err = U_ZERO_ERROR;
2303 uscript_setRunText(scriptRun, testString, 0, &err);
2304
2305 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2306 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2307 }
2308
2309 /*
2310 * Now call uscript_setRunText on the empty iterator
2311 * and make sure that it works.
2312 */
2313 err = U_ZERO_ERROR;
2314 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2315
2316 if (U_FAILURE(err)) {
2317 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2318 } else {
2319 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2320 }
2321
2322 uscript_closeRun(scriptRun);
2323
2324 /*
2325 * Now open an interator over the testString
2326 * using uscript_openRun and make sure that it works
2327 */
2328 scriptRun = uscript_openRun(testString, stringLimit, &err);
2329
2330 if (U_FAILURE(err)) {
2331 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2332 } else {
2333 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2334 }
2335
2336 /* Now reset the iterator, and make sure
2337 * that it still works.
2338 */
2339 uscript_resetRun(scriptRun);
2340
2341 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2342
2343 /* Close the iterator */
2344 uscript_closeRun(scriptRun);
2345 }
2346 }
2347
2348 /* test additional, non-core properties */
2349 static void
TestAdditionalProperties()2350 TestAdditionalProperties() {
2351 /* test data for u_charAge() */
2352 static const struct {
2353 UChar32 c;
2354 UVersionInfo version;
2355 } charAges[]={
2356 {0x41, { 1, 1, 0, 0 }},
2357 {0xffff, { 1, 1, 0, 0 }},
2358 {0x20ab, { 2, 0, 0, 0 }},
2359 {0x2fffe, { 2, 0, 0, 0 }},
2360 {0x20ac, { 2, 1, 0, 0 }},
2361 {0xfb1d, { 3, 0, 0, 0 }},
2362 {0x3f4, { 3, 1, 0, 0 }},
2363 {0x10300, { 3, 1, 0, 0 }},
2364 {0x220, { 3, 2, 0, 0 }},
2365 {0xff60, { 3, 2, 0, 0 }}
2366 };
2367
2368 /* test data for u_hasBinaryProperty() */
2369 static const int32_t
2370 props[][3]={ /* code point, property, value */
2371 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2372 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2373 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2374
2375 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2376 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2377
2378 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2379 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2380
2381 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2382 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2383
2384 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2385 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2386 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2387 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2388 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2389
2390 { 0x058a, UCHAR_DASH, TRUE },
2391 { 0x007e, UCHAR_DASH, FALSE },
2392
2393 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2394 { 0x3000, UCHAR_DIACRITIC, FALSE },
2395
2396 { 0x0e46, UCHAR_EXTENDER, TRUE },
2397 { 0x0020, UCHAR_EXTENDER, FALSE },
2398
2399 #if !UCONFIG_NO_NORMALIZATION
2400 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2401 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2402 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2403
2404 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2405 { 0x0308, UCHAR_NFD_INERT, FALSE },
2406
2407 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2408 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2409
2410 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2411 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2412 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2413 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2414 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2415 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2416
2417 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2418 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2419
2420 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2421 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2422 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2423 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2424 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2425 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2426 #endif
2427
2428 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2429 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2430 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2431
2432 { 0x30fb, UCHAR_HYPHEN, TRUE },
2433 { 0xfe58, UCHAR_HYPHEN, FALSE },
2434
2435 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2436 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2437 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2438
2439 { 0x2172, UCHAR_ID_START, TRUE },
2440 { 0x007a, UCHAR_ID_START, TRUE },
2441 { 0x0039, UCHAR_ID_START, FALSE },
2442
2443 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2444 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2445 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2446
2447 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2448 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2449
2450 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2451 { 0x0345, UCHAR_LOWERCASE, TRUE },
2452 { 0x0030, UCHAR_LOWERCASE, FALSE },
2453
2454 { 0x1d7a9, UCHAR_MATH, TRUE },
2455 { 0x2135, UCHAR_MATH, TRUE },
2456 { 0x0062, UCHAR_MATH, FALSE },
2457
2458 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2459 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2460 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2461
2462 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2463 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2464 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2465
2466 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2467 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2468
2469 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2470 { 0x2162, UCHAR_UPPERCASE, TRUE },
2471 { 0x0345, UCHAR_UPPERCASE, FALSE },
2472
2473 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2474 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2475 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2476
2477 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2478 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2479 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2480
2481 { 0x16ee, UCHAR_XID_START, TRUE },
2482 { 0x23456, UCHAR_XID_START, TRUE },
2483 { 0x1d1aa, UCHAR_XID_START, FALSE },
2484
2485 /*
2486 * Version break:
2487 * The following properties are only supported starting with the
2488 * Unicode version indicated in the second field.
2489 */
2490 { -1, 0x320, 0 },
2491
2492 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2493 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2494 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2495
2496 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2497 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2498 { 0xe0001, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2499 { 0xe0100, UCHAR_DEPRECATED, FALSE },
2500
2501 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2502 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2503 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2504 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2505
2506 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2507 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2508 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2509 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2510
2511 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2512 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2513
2514 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2515 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2516
2517 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2518 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2519
2520 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2521 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2522
2523 { 0x2e9b, UCHAR_RADICAL, TRUE },
2524 { 0x4e00, UCHAR_RADICAL, FALSE },
2525
2526 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2527 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2528
2529 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2530 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2531
2532 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2533
2534 { 0x002e, UCHAR_S_TERM, TRUE },
2535 { 0x0061, UCHAR_S_TERM, FALSE },
2536
2537 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2538 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2539 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2540 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2541
2542 /* enum/integer type properties */
2543
2544 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2545 /* test default Bidi classes for unassigned code points */
2546 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2547 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2548 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2549 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2550 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2551 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2552 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2553 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2554 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2555 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2556 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2557
2558 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2559 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2560 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2561 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2562 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2563 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2564 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2565
2566 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2567 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2568 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2569 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2570 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2571 { 0x0870, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2572 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2573 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2574 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2575 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2576 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2577
2578 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2579 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2580
2581 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2582 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2583 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2584 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2585 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2586 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2587 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2588 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2589 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2590
2591 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2592 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2593 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2594 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2595 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2596 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2597 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2598 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2599 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2600 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2601 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2602 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2603 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2604 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2605 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2606 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2607 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2608
2609 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2610 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2611 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2612
2613 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2614 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2615 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2616 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2617 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2618
2619 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2620 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2621 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2622 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2623 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2624 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2625 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2626 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2627
2628 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2629 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2630 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2631 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2632 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2633 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2634 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2635 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2636 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2637 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2638 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2639 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2640 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2641 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2642 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2643 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2644
2645 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2646
2647 /* UCHAR_SCRIPT tested in cucdapi.c TestUScriptCodeAPI() */
2648
2649 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2650 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2651 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2652 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2653 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2654 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2655 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2656
2657 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2658 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2659 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2660 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2661
2662 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2663 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2664 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2665 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2666 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2667 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2668
2669 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2670 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2671 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2672 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2673
2674 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2675 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2676 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2677 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2678 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2679 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2680 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2681
2682 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2683 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2684 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2685 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2686
2687 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2688 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2689 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2690 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2691
2692 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2693 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2694 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2695 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2696 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2697
2698 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2699
2700 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2701
2702 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2703 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2704 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2705
2706 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2707 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2708 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2709 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2710 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2711
2712 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2713 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2714 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2715
2716 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2717 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2718 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2719 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2720
2721 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2722 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2723 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2724 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2725 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2726 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2727
2728 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2729 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2730 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2731 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2732
2733 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2734 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2735 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2736 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2737
2738 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2739 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2740 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2741 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2742
2743 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2744
2745 /* unassigned code points in new default Bidi R blocks */
2746 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2747 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2748
2749 /* test some script codes >127 */
2750 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2751 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2752 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2753
2754 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2755
2756 /* value changed in Unicode 6.0 */
2757 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2758
2759 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2760
2761 /* unassigned code points in new/changed default Bidi AL blocks */
2762 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2763 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2764
2765 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2766
2767 /* unassigned code points in the currency symbols block now default to ET */
2768 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2769 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2770
2771 /* new property in Unicode 6.3 */
2772 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2773 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2774 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2775 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2776 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2777 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2778
2779 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2780
2781 /* new character range with Joining_Group values */
2782 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2783 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2784 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2785 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2786 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2787
2788 { -1, 0xa00, 0 }, // version break for Unicode 10
2789
2790 { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2791 { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2792 { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2793 { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2794
2795 { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2796 { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2797 { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2798
2799 /* undefined UProperty values */
2800 { 0x61, 0x4a7, 0 },
2801 { 0x234bc, 0x15ed, 0 }
2802 };
2803
2804 UVersionInfo version;
2805 UChar32 c;
2806 int32_t i, result, uVersion;
2807 UProperty which;
2808
2809 /* what is our Unicode version? */
2810 u_getUnicodeVersion(version);
2811 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2812
2813 u_charAge(0x20, version);
2814 if(version[0]==0) {
2815 /* no additional properties available */
2816 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2817 return;
2818 }
2819
2820 /* test u_charAge() */
2821 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2822 u_charAge(charAges[i].c, version);
2823 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2824 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2825 charAges[i].c,
2826 version[0], version[1], version[2], version[3],
2827 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2828 }
2829 }
2830
2831 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2832 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2833 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2834 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2835 u_getIntPropertyMinValue(0x2345)!=0
2836 ) {
2837 log_err("error: u_getIntPropertyMinValue() wrong\n");
2838 }
2839 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2840 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2841 }
2842 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2843 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2844 }
2845 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2846 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2847 }
2848 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2849 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2850 }
2851 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2852 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2853 }
2854 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2855 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2856 }
2857 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2858 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2859 }
2860 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2861 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2862 }
2863 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2864 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2865 }
2866 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2867 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2868 }
2869 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2870 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2871 }
2872 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2873 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2874 }
2875 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2876 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2877 }
2878 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2879 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2880 }
2881 /*JB#2410*/
2882 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2883 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2884 }
2885 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2886 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2887 }
2888 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2889 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2890 }
2891 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2892 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2893 }
2894 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2895 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2896 }
2897
2898 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2899 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2900 const char *whichName;
2901
2902 if(props[i][0]<0) {
2903 /* Unicode version break */
2904 if(uVersion<props[i][1]) {
2905 break; /* do not test properties that are not yet supported */
2906 } else {
2907 continue; /* skip this row */
2908 }
2909 }
2910
2911 c=(UChar32)props[i][0];
2912 which=(UProperty)props[i][1];
2913 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2914
2915 if(which<UCHAR_INT_START) {
2916 result=u_hasBinaryProperty(c, which);
2917 if(result!=props[i][2]) {
2918 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2919 c, whichName, result, i);
2920 }
2921 }
2922
2923 result=u_getIntPropertyValue(c, which);
2924 if(result!=props[i][2]) {
2925 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2926 c, whichName, result, props[i][2], i);
2927 }
2928
2929 /* test separate functions, too */
2930 switch((UProperty)props[i][1]) {
2931 case UCHAR_ALPHABETIC:
2932 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2933 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2934 props[i][0], result, i);
2935 }
2936 break;
2937 case UCHAR_LOWERCASE:
2938 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2939 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2940 props[i][0], result, i);
2941 }
2942 break;
2943 case UCHAR_UPPERCASE:
2944 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2945 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2946 props[i][0], result, i);
2947 }
2948 break;
2949 case UCHAR_WHITE_SPACE:
2950 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2951 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2952 props[i][0], result, i);
2953 }
2954 break;
2955 default:
2956 break;
2957 }
2958 }
2959 }
2960
2961 static void
TestNumericProperties(void)2962 TestNumericProperties(void) {
2963 /* see UnicodeData.txt, DerivedNumericValues.txt */
2964 static const struct {
2965 UChar32 c;
2966 int32_t type;
2967 double numValue;
2968 } values[]={
2969 { 0x0F33, U_NT_NUMERIC, -1./2. },
2970 { 0x0C66, U_NT_DECIMAL, 0 },
2971 { 0x96f6, U_NT_NUMERIC, 0 },
2972 { 0xa833, U_NT_NUMERIC, 1./16. },
2973 { 0x2152, U_NT_NUMERIC, 1./10. },
2974 { 0x2151, U_NT_NUMERIC, 1./9. },
2975 { 0x1245f, U_NT_NUMERIC, 1./8. },
2976 { 0x2150, U_NT_NUMERIC, 1./7. },
2977 { 0x2159, U_NT_NUMERIC, 1./6. },
2978 { 0x09f6, U_NT_NUMERIC, 3./16. },
2979 { 0x2155, U_NT_NUMERIC, 1./5. },
2980 { 0x00BD, U_NT_NUMERIC, 1./2. },
2981 { 0x0031, U_NT_DECIMAL, 1. },
2982 { 0x4e00, U_NT_NUMERIC, 1. },
2983 { 0x58f1, U_NT_NUMERIC, 1. },
2984 { 0x10320, U_NT_NUMERIC, 1. },
2985 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2986 { 0x00B2, U_NT_DIGIT, 2. },
2987 { 0x5f10, U_NT_NUMERIC, 2. },
2988 { 0x1813, U_NT_DECIMAL, 3. },
2989 { 0x5f0e, U_NT_NUMERIC, 3. },
2990 { 0x2173, U_NT_NUMERIC, 4. },
2991 { 0x8086, U_NT_NUMERIC, 4. },
2992 { 0x278E, U_NT_DIGIT, 5. },
2993 { 0x1D7F2, U_NT_DECIMAL, 6. },
2994 { 0x247A, U_NT_DIGIT, 7. },
2995 { 0x7396, U_NT_NUMERIC, 9. },
2996 { 0x1372, U_NT_NUMERIC, 10. },
2997 { 0x216B, U_NT_NUMERIC, 12. },
2998 { 0x16EE, U_NT_NUMERIC, 17. },
2999 { 0x249A, U_NT_NUMERIC, 19. },
3000 { 0x303A, U_NT_NUMERIC, 30. },
3001 { 0x5345, U_NT_NUMERIC, 30. },
3002 { 0x32B2, U_NT_NUMERIC, 37. },
3003 { 0x1375, U_NT_NUMERIC, 40. },
3004 { 0x10323, U_NT_NUMERIC, 50. },
3005 { 0x0BF1, U_NT_NUMERIC, 100. },
3006 { 0x964c, U_NT_NUMERIC, 100. },
3007 { 0x217E, U_NT_NUMERIC, 500. },
3008 { 0x2180, U_NT_NUMERIC, 1000. },
3009 { 0x4edf, U_NT_NUMERIC, 1000. },
3010 { 0x2181, U_NT_NUMERIC, 5000. },
3011 { 0x137C, U_NT_NUMERIC, 10000. },
3012 { 0x4e07, U_NT_NUMERIC, 10000. },
3013 { 0x12432, U_NT_NUMERIC, 216000. },
3014 { 0x12433, U_NT_NUMERIC, 432000. },
3015 { 0x4ebf, U_NT_NUMERIC, 100000000. },
3016 { 0x5146, U_NT_NUMERIC, 1000000000000. },
3017 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
3018 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
3019 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
3020 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
3021 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
3022 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
3023 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
3024 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
3025 };
3026
3027 double nv;
3028 UChar32 c;
3029 int32_t i, type;
3030
3031 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
3032 c=values[i].c;
3033 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
3034 nv=u_getNumericValue(c);
3035
3036 if(type!=values[i].type) {
3037 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
3038 }
3039 if(0.000001 <= fabs(nv - values[i].numValue)) {
3040 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
3041 }
3042 }
3043 }
3044
3045 /**
3046 * Test the property names and property value names API.
3047 */
3048 static void
TestPropertyNames(void)3049 TestPropertyNames(void) {
3050 int32_t p, v, choice=0, rev;
3051 UBool atLeastSomething = FALSE;
3052
3053 for (p=0; ; ++p) {
3054 UProperty propEnum = (UProperty)p;
3055 UBool sawProp = FALSE;
3056 if(p > 10 && !atLeastSomething) {
3057 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3058 return;
3059 }
3060
3061 for (choice=0; ; ++choice) {
3062 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
3063 if (name) {
3064 if (!sawProp)
3065 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
3066 log_verbose("%d=\"%s\"", choice, name);
3067 sawProp = TRUE;
3068 atLeastSomething = TRUE;
3069
3070 /* test reverse mapping */
3071 rev = u_getPropertyEnum(name);
3072 if (rev != p) {
3073 log_err("Property round-trip failure: %d -> %s -> %d\n",
3074 p, name, rev);
3075 }
3076 }
3077 if (!name && choice>0) break;
3078 }
3079 if (sawProp) {
3080 /* looks like a valid property; check the values */
3081 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3082 int32_t max = 0;
3083 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3084 max = 255;
3085 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3086 /* it's far too slow to iterate all the way up to
3087 the real max, U_GC_P_MASK */
3088 max = U_GC_NL_MASK;
3089 } else if (p == UCHAR_BLOCK) {
3090 /* UBlockCodes, unlike other values, start at 1 */
3091 max = 1;
3092 }
3093 log_verbose("\n");
3094 for (v=-1; ; ++v) {
3095 UBool sawValue = FALSE;
3096 for (choice=0; ; ++choice) {
3097 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
3098 if (vname) {
3099 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3100 log_verbose("%d=\"%s\"", choice, vname);
3101 sawValue = TRUE;
3102
3103 /* test reverse mapping */
3104 rev = u_getPropertyValueEnum(propEnum, vname);
3105 if (rev != v) {
3106 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3107 pname, v, vname, rev);
3108 }
3109 }
3110 if (!vname && choice>0) break;
3111 }
3112 if (sawValue) {
3113 log_verbose("\n");
3114 }
3115 if (!sawValue && v>=max) break;
3116 }
3117 }
3118 if (!sawProp) {
3119 if (p>=UCHAR_STRING_LIMIT) {
3120 break;
3121 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3122 p = UCHAR_STRING_START - 1;
3123 } else if (p>=UCHAR_MASK_LIMIT) {
3124 p = UCHAR_DOUBLE_START - 1;
3125 } else if (p>=UCHAR_INT_LIMIT) {
3126 p = UCHAR_MASK_START - 1;
3127 } else if (p>=UCHAR_BINARY_LIMIT) {
3128 p = UCHAR_INT_START - 1;
3129 }
3130 }
3131 }
3132 }
3133
3134 /**
3135 * Test the property values API. See JB#2410.
3136 */
3137 static void
TestPropertyValues(void)3138 TestPropertyValues(void) {
3139 int32_t i, p, min, max;
3140 UErrorCode ec;
3141
3142 /* Min should be 0 for everything. */
3143 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3144 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3145 UProperty propEnum = (UProperty)p;
3146 min = u_getIntPropertyMinValue(propEnum);
3147 if (min != 0) {
3148 if (p == UCHAR_BLOCK) {
3149 /* This is okay...for now. See JB#2487.
3150 TODO Update this for JB#2487. */
3151 } else {
3152 const char* name;
3153 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3154 if (name == NULL)
3155 name = "<ERROR>";
3156 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3157 name, min);
3158 }
3159 }
3160 }
3161
3162 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3163 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3164 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3165 }
3166
3167 /* Max should be -1 for invalid properties. */
3168 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3169 if (max != -1) {
3170 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3171 max);
3172 }
3173
3174 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3175 for (i=0; i<2; ++i) {
3176 int32_t script;
3177 const char* desc;
3178 ec = U_ZERO_ERROR;
3179 switch (i) {
3180 case 0:
3181 script = uscript_getScript(-1, &ec);
3182 desc = "uscript_getScript(-1)";
3183 break;
3184 case 1:
3185 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3186 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3187 break;
3188 default:
3189 log_err("Internal test error. Too many scripts\n");
3190 return;
3191 }
3192 /* We don't explicitly test ec. It should be U_FAILURE but it
3193 isn't documented as such. */
3194 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3195 log_err("FAIL: %s = %d, exp. 0\n",
3196 desc, script);
3197 }
3198 }
3199 }
3200
3201 /* various tests for consistency of UCD data and API behavior */
3202 static void
TestConsistency()3203 TestConsistency() {
3204 char buffer[300];
3205 USet *set1, *set2, *set3, *set4;
3206 UErrorCode errorCode;
3207
3208 UChar32 start, end;
3209 int32_t i, length;
3210
3211 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3212 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3213 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3214 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3215 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3216
3217 U_STRING_DECL(mathBlocksPattern,
3218 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3219 214);
3220 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3221 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3222 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3223 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3224
3225 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3226 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3227 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3228 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3229 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3230
3231 U_STRING_INIT(mathBlocksPattern,
3232 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3233 214);
3234 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3235 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3236 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3237 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3238
3239 /*
3240 * It used to be that UCD.html and its precursors said
3241 * "Those dashes used to mark connections between pieces of words,
3242 * plus the Katakana middle dot."
3243 *
3244 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3245 * but not from Hyphen.
3246 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3247 * Therefore, do not show errors when testing the Hyphen property.
3248 */
3249 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3250 "known to the UTC and not considered errors.\n");
3251
3252 errorCode=U_ZERO_ERROR;
3253 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3254 set2=uset_openPattern(dashPattern, 8, &errorCode);
3255 if(U_SUCCESS(errorCode)) {
3256 /* remove the Katakana middle dot(s) from set1 */
3257 uset_remove(set1, 0x30fb);
3258 uset_remove(set1, 0xff65); /* halfwidth variant */
3259 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3260 } else {
3261 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3262 }
3263
3264 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3265 set3=uset_openPattern(formatPattern, 6, &errorCode);
3266 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3267 if(U_SUCCESS(errorCode)) {
3268 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3269 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3270 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3271 } else {
3272 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3273 }
3274
3275 uset_close(set1);
3276 uset_close(set2);
3277 uset_close(set3);
3278 uset_close(set4);
3279
3280 /*
3281 * Check that each lowercase character has "small" in its name
3282 * and not "capital".
3283 * There are some such characters, some of which seem odd.
3284 * Use the verbose flag to see these notices.
3285 */
3286 errorCode=U_ZERO_ERROR;
3287 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3288 if(U_SUCCESS(errorCode)) {
3289 for(i=0;; ++i) {
3290 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3291 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3292 break; /* done */
3293 }
3294 if(U_FAILURE(errorCode)) {
3295 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3296 i, u_errorName(errorCode));
3297 break;
3298 }
3299 if(length!=0) {
3300 break; /* done with code points, got a string or -1 */
3301 }
3302
3303 while(start<=end) {
3304 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3305 if(U_FAILURE(errorCode)) {
3306 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3307 errorCode=U_ZERO_ERROR;
3308 }
3309 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3310 strstr(buffer, "SMALL CAPITAL")==NULL
3311 ) {
3312 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3313 }
3314 ++start;
3315 }
3316 }
3317 } else {
3318 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3319 }
3320 uset_close(set1);
3321
3322 /* verify that all assigned characters in Math blocks are exactly Math characters */
3323 errorCode=U_ZERO_ERROR;
3324 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3325 set2=uset_openPattern(mathPattern, 8, &errorCode);
3326 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3327 if(U_SUCCESS(errorCode)) {
3328 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3329 uset_complement(set3); /* assigned characters */
3330 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3331 compareUSets(set1, set2,
3332 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3333 TRUE);
3334 } else {
3335 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3336 }
3337 uset_close(set1);
3338 uset_close(set2);
3339 uset_close(set3);
3340
3341 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3342 errorCode=U_ZERO_ERROR;
3343 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3344 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3345 if(U_SUCCESS(errorCode)) {
3346 compareUSets(set1, set2,
3347 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3348 TRUE);
3349 } else {
3350 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3351 }
3352 uset_close(set1);
3353 uset_close(set2);
3354 }
3355
3356 /* test case folding, compare return values with CaseFolding.txt ------------ */
3357
/*
 * Bit flags recording which kinds of case folding have already been tested
 * for a character; CF_ALL is the combination of all three flags.
 */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL  =1<<1,
    CF_TURKIC=1<<2,
    CF_ALL   =CF_SIMPLE|CF_FULL|CF_TURKIC
};
3365
3366 static void
testFold(UChar32 c,int which,UChar32 simple,UChar32 turkic,const UChar * full,int32_t fullLength,const UChar * turkicFull,int32_t turkicFullLength)3367 testFold(UChar32 c, int which,
3368 UChar32 simple, UChar32 turkic,
3369 const UChar *full, int32_t fullLength,
3370 const UChar *turkicFull, int32_t turkicFullLength) {
3371 UChar s[2], t[32];
3372 UChar32 c2;
3373 int32_t length, length2;
3374
3375 UErrorCode errorCode=U_ZERO_ERROR;
3376
3377 length=0;
3378 U16_APPEND_UNSAFE(s, length, c);
3379
3380 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3381 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3382 }
3383 if((which&CF_FULL)!=0) {
3384 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3385 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3386 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3387 }
3388 }
3389 if((which&CF_TURKIC)!=0) {
3390 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3391 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3392 }
3393
3394 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3395 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3396 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3397 }
3398 }
3399 }
3400
3401 /* test that c case-folds to itself */
3402 static void
testFoldToSelf(UChar32 c,int which)3403 testFoldToSelf(UChar32 c, int which) {
3404 UChar s[2];
3405 int32_t length;
3406
3407 length=0;
3408 U16_APPEND_UNSAFE(s, length, c);
3409 testFold(c, which, c, c, s, length, s, length);
3410 }
3411
3412 struct CaseFoldingData {
3413 USet *notSeen;
3414 UChar32 prev, prevSimple;
3415 UChar prevFull[32];
3416 int32_t prevFullLength;
3417 int which;
3418 };
3419 typedef struct CaseFoldingData CaseFoldingData;
3420
3421 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)3422 caseFoldingLineFn(void *context,
3423 char *fields[][2], int32_t fieldCount,
3424 UErrorCode *pErrorCode) {
3425 (void)fieldCount; // suppress compiler warnings about unused variable
3426
3427 CaseFoldingData *pData=(CaseFoldingData *)context;
3428 char *end;
3429 UChar full[32];
3430 UChar32 c, prev, simple;
3431 int32_t count;
3432 int which;
3433 char status;
3434
3435 /* get code point */
3436 const char *s=u_skipWhitespace(fields[0][0]);
3437 if(0==strncmp(s, "0000..10FFFF", 12)) {
3438 /*
3439 * Ignore the line
3440 * # @missing: 0000..10FFFF; C; <code point>
3441 * because maps-to-self is already our default, and this line breaks this parser.
3442 */
3443 return;
3444 }
3445 c=(UChar32)strtoul(s, &end, 16);
3446 end=(char *)u_skipWhitespace(end);
3447 if(end<=fields[0][0] || end!=fields[0][1]) {
3448 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3449 *pErrorCode=U_PARSE_ERROR;
3450 return;
3451 }
3452
3453 /* get the status of this mapping */
3454 status=*u_skipWhitespace(fields[1][0]);
3455 if(status!='C' && status!='S' && status!='F' && status!='T') {
3456 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3457 *pErrorCode=U_PARSE_ERROR;
3458 return;
3459 }
3460
3461 /* get the mapping */
3462 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3463 if(U_FAILURE(*pErrorCode)) {
3464 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3465 return;
3466 }
3467
3468 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3469 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3470 simple=c;
3471 }
3472
3473 if(c!=(prev=pData->prev)) {
3474 /*
3475 * Test remaining mappings for the previous code point.
3476 * If a turkic folding was not mentioned, then it should fold the same
3477 * as the regular simple case folding.
3478 */
3479 UChar prevString[2];
3480 int32_t length;
3481
3482 length=0;
3483 U16_APPEND_UNSAFE(prevString, length, prev);
3484 testFold(prev, (~pData->which)&CF_ALL,
3485 prev, pData->prevSimple,
3486 prevString, length,
3487 pData->prevFull, pData->prevFullLength);
3488 pData->prev=pData->prevSimple=c;
3489 length=0;
3490 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3491 pData->prevFullLength=length;
3492 pData->which=0;
3493 }
3494
3495 /*
3496 * Turn the status into a bit set of case foldings to test.
3497 * Remember non-Turkic case foldings as defaults for Turkic mode.
3498 */
3499 switch(status) {
3500 case 'C':
3501 which=CF_SIMPLE|CF_FULL;
3502 pData->prevSimple=simple;
3503 u_memcpy(pData->prevFull, full, count);
3504 pData->prevFullLength=count;
3505 break;
3506 case 'S':
3507 which=CF_SIMPLE;
3508 pData->prevSimple=simple;
3509 break;
3510 case 'F':
3511 which=CF_FULL;
3512 u_memcpy(pData->prevFull, full, count);
3513 pData->prevFullLength=count;
3514 break;
3515 case 'T':
3516 which=CF_TURKIC;
3517 break;
3518 default:
3519 which=0;
3520 break; /* won't happen because of test above */
3521 }
3522
3523 testFold(c, which, simple, simple, full, count, full, count);
3524
3525 /* remember which case foldings of c have been tested */
3526 pData->which|=which;
3527
3528 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3529 uset_remove(pData->notSeen, c);
3530 }
3531
3532 static void
TestCaseFolding()3533 TestCaseFolding() {
3534 CaseFoldingData data={ NULL, 0, 0, {0}, 0, 0 };
3535 char *fields[3][2];
3536 UErrorCode errorCode;
3537
3538 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3539
3540 errorCode=U_ZERO_ERROR;
3541 /* test BMP & plane 1 - nothing interesting above */
3542 data.notSeen=uset_open(0, 0x1ffff);
3543 data.prevFullLength=1; /* length of full case folding of U+0000 */
3544
3545 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3546 if(U_SUCCESS(errorCode)) {
3547 int32_t i, start, end;
3548
3549 /* add a pseudo-last line to finish testing of the actual last one */
3550 fields[0][0]=lastLine;
3551 fields[0][1]=lastLine+6;
3552 fields[1][0]=lastLine+7;
3553 fields[1][1]=lastLine+9;
3554 fields[2][0]=lastLine+10;
3555 fields[2][1]=lastLine+17;
3556 caseFoldingLineFn(&data, fields, 3, &errorCode);
3557
3558 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3559 for(i=0;
3560 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3561 U_SUCCESS(errorCode);
3562 ++i
3563 ) {
3564 do {
3565 testFoldToSelf(start, CF_ALL);
3566 } while(++start<=end);
3567 }
3568 }
3569
3570 uset_close(data.notSeen);
3571 }
3572
TestBinaryCharacterPropertiesAPI()3573 static void TestBinaryCharacterPropertiesAPI() {
3574 // API test only. See intltest/ucdtest.cpp for functional test.
3575 UErrorCode errorCode = U_ZERO_ERROR;
3576 const USet *set = u_getBinaryPropertySet(-1, &errorCode);
3577 if (U_SUCCESS(errorCode)) {
3578 log_err("u_getBinaryPropertySet(-1) did not fail\n");
3579 }
3580 errorCode = U_ZERO_ERROR;
3581 set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);
3582 if (U_SUCCESS(errorCode)) {
3583 log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
3584 }
3585 errorCode = U_ZERO_ERROR;
3586 set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);
3587 if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
3588 log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n");
3589 }
3590 }
3591
TestIntCharacterPropertiesAPI()3592 static void TestIntCharacterPropertiesAPI() {
3593 // API test only. See intltest/ucdtest.cpp for functional test.
3594 UErrorCode errorCode = U_ZERO_ERROR;
3595 const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);
3596 if (U_SUCCESS(errorCode)) {
3597 log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
3598 }
3599 errorCode = U_ZERO_ERROR;
3600 map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);
3601 if (U_SUCCESS(errorCode)) {
3602 log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
3603 }
3604 errorCode = U_ZERO_ERROR;
3605 map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);
3606 if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
3607 log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n");
3608 }
3609 }
3610