1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 * Name Description
14 * Madhu Katragadda Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17
18 #include <string.h>
19 #include <math.h>
20 #include <stdlib.h>
21
22 #include "unicode/utypes.h"
23 #include "unicode/uchar.h"
24 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/uloc.h"
27 #include "unicode/unorm2.h"
28
29 #include "cintltst.h"
30 #include "putilimp.h"
31 #include "uparse.h"
32 #include "ucase.h"
33 #include "ubidi_props.h"
34 #include "uprops.h"
35 #include "uset_imp.h"
36 #include "usc_impl.h"
37 #include "udatamem.h" /* for testing ucase_openBinary() */
38 #include "cucdapi.h"
39 #include "cmemory.h"
40
41 /* prototypes --------------------------------------------------------------- */
42
43 static void TestUpperLower(void);
44 static void TestLetterNumber(void);
45 static void TestMisc(void);
46 static void TestPOSIX(void);
47 static void TestControlPrint(void);
48 static void TestIdentifier(void);
49 static void TestUnicodeData(void);
50 static void TestCodeUnit(void);
51 static void TestCodePoint(void);
52 static void TestCharLength(void);
53 static void TestCharNames(void);
54 static void TestUCharFromNameUnderflow(void);
55 static void TestMirroring(void);
56 static void TestUScriptRunAPI(void);
57 static void TestAdditionalProperties(void);
58 static void TestNumericProperties(void);
59 static void TestPropertyNames(void);
60 static void TestPropertyValues(void);
61 static void TestConsistency(void);
62 static void TestUCase(void);
63 static void TestUBiDiProps(void);
64 static void TestCaseFolding(void);
65
66 /* internal methods used */
67 static int32_t MakeProp(char* str);
68 static int32_t MakeDir(char* str);
69
70 /* helpers ------------------------------------------------------------------ */
71
72 static void
parseUCDFile(const char * filename,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)73 parseUCDFile(const char *filename,
74 char *fields[][2], int32_t fieldCount,
75 UParseLineFn *lineFn, void *context,
76 UErrorCode *pErrorCode) {
77 char path[256];
78 char backupPath[256];
79
80 if(U_FAILURE(*pErrorCode)) {
81 return;
82 }
83
84 /* Look inside ICU_DATA first */
85 strcpy(path, u_getDataDirectory());
86 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
87 strcat(path, filename);
88
89 /* As a fallback, try to guess where the source data was located
90 * at the time ICU was built, and look there.
91 */
92 strcpy(backupPath, ctest_dataSrcDir());
93 strcat(backupPath, U_FILE_SEP_STRING);
94 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
95 strcat(backupPath, filename);
96
97 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
98 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
99 *pErrorCode=U_ZERO_ERROR;
100 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
101 }
102 if(U_FAILURE(*pErrorCode)) {
103 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
104 }
105 }
106
107 /* test data ---------------------------------------------------------------- */
108
109 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
110 static const int32_t tagValues[] =
111 {
112 /* Mn */ U_NON_SPACING_MARK,
113 /* Mc */ U_COMBINING_SPACING_MARK,
114 /* Me */ U_ENCLOSING_MARK,
115 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
116 /* Nl */ U_LETTER_NUMBER,
117 /* No */ U_OTHER_NUMBER,
118 /* Zs */ U_SPACE_SEPARATOR,
119 /* Zl */ U_LINE_SEPARATOR,
120 /* Zp */ U_PARAGRAPH_SEPARATOR,
121 /* Cc */ U_CONTROL_CHAR,
122 /* Cf */ U_FORMAT_CHAR,
123 /* Cs */ U_SURROGATE,
124 /* Co */ U_PRIVATE_USE_CHAR,
125 /* Cn */ U_UNASSIGNED,
126 /* Lu */ U_UPPERCASE_LETTER,
127 /* Ll */ U_LOWERCASE_LETTER,
128 /* Lt */ U_TITLECASE_LETTER,
129 /* Lm */ U_MODIFIER_LETTER,
130 /* Lo */ U_OTHER_LETTER,
131 /* Pc */ U_CONNECTOR_PUNCTUATION,
132 /* Pd */ U_DASH_PUNCTUATION,
133 /* Ps */ U_START_PUNCTUATION,
134 /* Pe */ U_END_PUNCTUATION,
135 /* Po */ U_OTHER_PUNCTUATION,
136 /* Sm */ U_MATH_SYMBOL,
137 /* Sc */ U_CURRENCY_SYMBOL,
138 /* Sk */ U_MODIFIER_SYMBOL,
139 /* So */ U_OTHER_SYMBOL,
140 /* Pi */ U_INITIAL_PUNCTUATION,
141 /* Pf */ U_FINAL_PUNCTUATION
142 };
143
/*
 * Short names of the bidirectional categories, at most four characters each.
 * NOTE(review): presumably indexed by the direction value that MakeDir()
 * computes - the lookup code is not visible here, so do not reorder.
 */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI", "LRI", "RLI", "PDI"
};
170
171 void addUnicodeTest(TestNode** root);
172
/**
 * Registers every test of this file under "tsutil/cucdtst/" in the test tree.
 * The registration order is kept as is.
 *
 * @param root root of the test tree that the tests are added to
 */
void addUnicodeTest(TestNode** root)
{
    /* code units, code points and basic character data */
    addTest(root, TestCodeUnit,               "tsutil/cucdtst/TestCodeUnit");
    addTest(root, TestCodePoint,              "tsutil/cucdtst/TestCodePoint");
    addTest(root, TestCharLength,             "tsutil/cucdtst/TestCharLength");
    addTest(root, TestBinaryValues,           "tsutil/cucdtst/TestBinaryValues");
    addTest(root, TestUnicodeData,            "tsutil/cucdtst/TestUnicodeData");
    addTest(root, TestAdditionalProperties,   "tsutil/cucdtst/TestAdditionalProperties");
    addTest(root, TestNumericProperties,      "tsutil/cucdtst/TestNumericProperties");

    /* character classification functions */
    addTest(root, TestUpperLower,             "tsutil/cucdtst/TestUpperLower");
    addTest(root, TestLetterNumber,           "tsutil/cucdtst/TestLetterNumber");
    addTest(root, TestMisc,                   "tsutil/cucdtst/TestMisc");
    addTest(root, TestPOSIX,                  "tsutil/cucdtst/TestPOSIX");
    addTest(root, TestControlPrint,           "tsutil/cucdtst/TestControlPrint");
    addTest(root, TestIdentifier,             "tsutil/cucdtst/TestIdentifier");

    /* names and mirroring */
    addTest(root, TestCharNames,              "tsutil/cucdtst/TestCharNames");
    addTest(root, TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
    addTest(root, TestMirroring,              "tsutil/cucdtst/TestMirroring");

    /* scripts */
    addTest(root, TestUScriptCodeAPI,         "tsutil/cucdtst/TestUScriptCodeAPI");
    addTest(root, TestHasScript,              "tsutil/cucdtst/TestHasScript");
    addTest(root, TestGetScriptExtensions,    "tsutil/cucdtst/TestGetScriptExtensions");
    addTest(root, TestScriptMetadataAPI,      "tsutil/cucdtst/TestScriptMetadataAPI");
    addTest(root, TestUScriptRunAPI,          "tsutil/cucdtst/TestUScriptRunAPI");

    /* property names/values and low-level property data */
    addTest(root, TestPropertyNames,          "tsutil/cucdtst/TestPropertyNames");
    addTest(root, TestPropertyValues,         "tsutil/cucdtst/TestPropertyValues");
    addTest(root, TestConsistency,            "tsutil/cucdtst/TestConsistency");
    addTest(root, TestUCase,                  "tsutil/cucdtst/TestUCase");
    addTest(root, TestUBiDiProps,             "tsutil/cucdtst/TestUBiDiProps");
    addTest(root, TestCaseFolding,            "tsutil/cucdtst/TestCaseFolding");
}
203
204 /*==================================================== */
205 /* test u_toupper() and u_tolower() */
206 /*==================================================== */
TestUpperLower()207 static void TestUpperLower()
208 {
209 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
210 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
211 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
212 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
213 int32_t i;
214
215 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
216 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
217
218 /*
219 Checks LetterLike Symbols which were previously a source of confusion
220 [Bertrand A. D. 02/04/98]
221 */
222 for (i=0x2100;i<0x2138;i++)
223 {
224 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
225 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
226 {
227 if (i != (int)u_tolower(i)) /* itself */
228 log_err("Failed case conversion with itself: U+%04x\n", i);
229 if (i != (int)u_toupper(i))
230 log_err("Failed case conversion with itself: U+%04x\n", i);
231 }
232 }
233
234 for(i=0; i < u_strlen(upper); i++){
235 if(u_tolower(upper[i]) != lower[i]){
236 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
237 }
238 }
239
240 log_verbose("testing upper lower\n");
241 for (i = 0; i < 21; i++) {
242
243 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
244 {
245 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
246 }
247 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
248 {
249 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
250 }
251 else if (upperTest[i] != u_tolower(lowerTest[i]))
252 {
253 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
254 }
255 else if (lowerTest[i] != u_toupper(upperTest[i]))
256 {
257 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
258 }
259 else if (upperTest[i] != u_tolower(upperTest[i]))
260 {
261 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
262 }
263 else if (lowerTest[i] != u_toupper(lowerTest[i]))
264 {
265 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
266 }
267 }
268 log_verbose("done testing upper lower\n");
269
270 log_verbose("testing u_istitle\n");
271 {
272 static const UChar expected[] = {
273 0x1F88,
274 0x1F89,
275 0x1F8A,
276 0x1F8B,
277 0x1F8C,
278 0x1F8D,
279 0x1F8E,
280 0x1F8F,
281 0x1F88,
282 0x1F89,
283 0x1F8A,
284 0x1F8B,
285 0x1F8C,
286 0x1F8D,
287 0x1F8E,
288 0x1F8F,
289 0x1F98,
290 0x1F99,
291 0x1F9A,
292 0x1F9B,
293 0x1F9C,
294 0x1F9D,
295 0x1F9E,
296 0x1F9F,
297 0x1F98,
298 0x1F99,
299 0x1F9A,
300 0x1F9B,
301 0x1F9C,
302 0x1F9D,
303 0x1F9E,
304 0x1F9F,
305 0x1FA8,
306 0x1FA9,
307 0x1FAA,
308 0x1FAB,
309 0x1FAC,
310 0x1FAD,
311 0x1FAE,
312 0x1FAF,
313 0x1FA8,
314 0x1FA9,
315 0x1FAA,
316 0x1FAB,
317 0x1FAC,
318 0x1FAD,
319 0x1FAE,
320 0x1FAF,
321 0x1FBC,
322 0x1FBC,
323 0x1FCC,
324 0x1FCC,
325 0x1FFC,
326 0x1FFC,
327 };
328 int32_t num = UPRV_LENGTHOF(expected);
329 for(i=0; i<num; i++){
330 if(!u_istitle(expected[i])){
331 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
332 }
333 }
334
335 }
336 }
337
338 /* compare two sets and verify that their difference or intersection is empty */
339 static UBool
showADiffB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool expect,UBool diffIsError)340 showADiffB(const USet *a, const USet *b,
341 const char *a_name, const char *b_name,
342 UBool expect, UBool diffIsError) {
343 USet *aa;
344 int32_t i, start, end, length;
345 UErrorCode errorCode;
346
347 /*
348 * expect:
349 * TRUE -> a-b should be empty, that is, b should contain all of a
350 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
351 */
352 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
353 return TRUE;
354 }
355
356 /* clone a to aa because a is const */
357 aa=uset_open(1, 0);
358 if(aa==NULL) {
359 /* unusual problem - out of memory? */
360 return FALSE;
361 }
362 uset_addAll(aa, a);
363
364 /* compute the set in question */
365 if(expect) {
366 /* a-b */
367 uset_removeAll(aa, b);
368 } else {
369 /* a&b */
370 uset_retainAll(aa, b);
371 }
372
373 /* aa is not empty because of the initial tests above; show its contents */
374 errorCode=U_ZERO_ERROR;
375 i=0;
376 for(;;) {
377 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
378 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
379 break; /* done */
380 }
381 if(U_FAILURE(errorCode)) {
382 log_err("error comparing %s with %s at difference item %d: %s\n",
383 a_name, b_name, i, u_errorName(errorCode));
384 break;
385 }
386 if(length!=0) {
387 break; /* done with code points, got a string or -1 */
388 }
389
390 if(diffIsError) {
391 if(expect) {
392 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
393 } else {
394 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
395 }
396 } else {
397 if(expect) {
398 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
399 } else {
400 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
401 }
402 }
403
404 ++i;
405 }
406
407 uset_close(aa);
408 return FALSE;
409 }
410
411 static UBool
showAMinusB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)412 showAMinusB(const USet *a, const USet *b,
413 const char *a_name, const char *b_name,
414 UBool diffIsError) {
415 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
416 }
417
418 static UBool
showAIntersectB(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)419 showAIntersectB(const USet *a, const USet *b,
420 const char *a_name, const char *b_name,
421 UBool diffIsError) {
422 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
423 }
424
425 static UBool
compareUSets(const USet * a,const USet * b,const char * a_name,const char * b_name,UBool diffIsError)426 compareUSets(const USet *a, const USet *b,
427 const char *a_name, const char *b_name,
428 UBool diffIsError) {
429 /*
430 * Use an arithmetic & not a logical && so that both branches
431 * are always taken and all differences are shown.
432 */
433 return
434 showAMinusB(a, b, a_name, b_name, diffIsError) &
435 showAMinusB(b, a, b_name, a_name, diffIsError);
436 }
437
/* test isLetter (u_isalpha()), isDigit (u_isdigit()) and u_isalnum() */
TestLetterNumber()439 static void TestLetterNumber()
440 {
441 UChar i = 0x0000;
442
443 log_verbose("Testing for isalpha\n");
444 for (i = 0x0041; i < 0x005B; i++) {
445 if (!u_isalpha(i))
446 {
447 log_err("Failed isLetter test at %.4X\n", i);
448 }
449 }
450 for (i = 0x0660; i < 0x066A; i++) {
451 if (u_isalpha(i))
452 {
453 log_err("Failed isLetter test with numbers at %.4X\n", i);
454 }
455 }
456
457 log_verbose("Testing for isdigit\n");
458 for (i = 0x0660; i < 0x066A; i++) {
459 if (!u_isdigit(i))
460 {
461 log_verbose("Failed isNumber test at %.4X\n", i);
462 }
463 }
464
465 log_verbose("Testing for isalnum\n");
466 for (i = 0x0041; i < 0x005B; i++) {
467 if (!u_isalnum(i))
468 {
469 log_err("Failed isAlNum test at %.4X\n", i);
470 }
471 }
472 for (i = 0x0660; i < 0x066A; i++) {
473 if (!u_isalnum(i))
474 {
475 log_err("Failed isAlNum test at %.4X\n", i);
476 }
477 }
478
479 {
480 /*
481 * The following checks work only starting from Unicode 4.0.
482 * Check the version number here.
483 */
484 static UVersionInfo u401={ 4, 0, 1, 0 };
485 UVersionInfo version;
486 u_getUnicodeVersion(version);
487 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
488 return;
489 }
490 }
491
492 {
493 /*
494 * Sanity check:
495 * Verify that exactly the digit characters have decimal digit values.
496 * This assumption is used in the implementation of u_digit()
497 * (which checks nt=de)
498 * compared with the parallel java.lang.Character.digit()
499 * (which checks Nd).
500 *
501 * This was not true in Unicode 3.2 and earlier.
502 * Unicode 4.0 fixed discrepancies.
503 * Unicode 4.0.1 re-introduced problems in this area due to an
504 * unintentionally incomplete last-minute change.
505 */
506 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
507 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
508
509 USet *digits, *decimalValues;
510 UErrorCode errorCode;
511
512 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
513 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
514 errorCode=U_ZERO_ERROR;
515 digits=uset_openPattern(digitsPattern, 6, &errorCode);
516 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
517
518 if(U_SUCCESS(errorCode)) {
519 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
520 }
521
522 uset_close(digits);
523 uset_close(decimalValues);
524 }
525 }
526
/**
 * Calls a character property function on every sample code point and logs an
 * error for each result that differs from the expected value.
 *
 * @param propFn            property function under test
 * @param propName          name of propFn, used in the error message
 * @param sampleChars       code points to test
 * @param sampleCharsLength number of code points in sampleChars
 * @param expected          result that propFn must return for every sample
 */
static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
                                const UChar32 *sampleChars, int32_t sampleCharsLength,
                                UBool expected) {
    int32_t idx = 0;

    while (idx < sampleCharsLength) {
        const UChar32 sample = sampleChars[idx++];
        const UBool actual = propFn(sample);

        if (actual == expected) {
            continue;
        }
        log_err("error: character property function %s(U+%04x)=%d is wrong\n",
                propName, sample, actual);
    }
}
539
/* Tests for isDefined (u_isdefined()), isBaseForm (u_isbase()), isSpaceChar (u_isspace(), u_isJavaSpaceChar()), isWhiteSpace (u_isWhitespace()), u_charDigitValue(), u_forDigit() and u_digit() */
TestMisc()541 static void TestMisc()
542 {
543 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
544 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
545 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
546 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
547 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
548 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
549 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
550 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
551 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
552 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
553 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
554
555 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
556
557 uint32_t mask;
558
559 int32_t i;
560 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
561 UVersionInfo realVersion;
562
563 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
564
565 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
566 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
567
568 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
569 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
570 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
571 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
572
573 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
574 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
575 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
576 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
577
578 testSampleCharProps(u_isdefined, "u_isdefined",
579 sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
580 testSampleCharProps(u_isdefined, "u_isdefined",
581 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
582
583 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
584 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
585
586 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
587 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
588
589 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
590 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
591 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
592 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
593 }
594 }
595
596 /* Tests the ICU version #*/
597 u_getVersion(realVersion);
598 u_versionToString(realVersion, icuVersion);
599 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
600 {
601 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
602 }
603 #if defined(ICU_VERSION)
604 /* test only happens where we have configure.in with VERSION - sanity check. */
605 if(strcmp(U_ICU_VERSION, ICU_VERSION))
606 {
607 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
608 }
609 #endif
610
611 /* test U_GC_... */
612 if(
613 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
614 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
615 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
616 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
617 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
618 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
619 ) {
620 log_err("error: U_GET_GC_MASK does not work properly\n");
621 }
622
623 mask=0;
624 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
625
626 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
627 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
628 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
629 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
630 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
631
632 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
633 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
634 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
635
636 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
637 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
638 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
639
640 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
641 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
642 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
643
644 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
645 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
646 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
647 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
648
649 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
650 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
651 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
652 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
653 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
654
655 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
656 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
657 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
658 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
659
660 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
661 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
662
663 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
664 log_err("error: problems with U_GC_XX_MASK constants\n");
665 }
666
667 mask=0;
668 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
669 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
670 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
671 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
672 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
673 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
674 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
675
676 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
677 log_err("error: problems with U_GC_Y_MASK constants\n");
678 }
679 {
680 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
681 for(i=0; i<10; i++){
682 if(digit[i]!=u_forDigit(i,10)){
683 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
684 }
685 }
686 }
687
688 /* test u_digit() */
689 {
690 static const struct {
691 UChar32 c;
692 int8_t radix, value;
693 } data[]={
694 /* base 16 */
695 { 0x0031, 16, 1 },
696 { 0x0038, 16, 8 },
697 { 0x0043, 16, 12 },
698 { 0x0066, 16, 15 },
699 { 0x00e4, 16, -1 },
700 { 0x0662, 16, 2 },
701 { 0x06f5, 16, 5 },
702 { 0xff13, 16, 3 },
703 { 0xff41, 16, 10 },
704
705 /* base 8 */
706 { 0x0031, 8, 1 },
707 { 0x0038, 8, -1 },
708 { 0x0043, 8, -1 },
709 { 0x0066, 8, -1 },
710 { 0x00e4, 8, -1 },
711 { 0x0662, 8, 2 },
712 { 0x06f5, 8, 5 },
713 { 0xff13, 8, 3 },
714 { 0xff41, 8, -1 },
715
716 /* base 36 */
717 { 0x5a, 36, 35 },
718 { 0x7a, 36, 35 },
719 { 0xff3a, 36, 35 },
720 { 0xff5a, 36, 35 },
721
722 /* wrong radix values */
723 { 0x0031, 1, -1 },
724 { 0xff3a, 37, -1 }
725 };
726
727 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
728 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
729 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
730 data[i].c,
731 data[i].radix,
732 u_digit(data[i].c, data[i].radix),
733 data[i].value);
734 }
735 }
736 }
737 }
738
739 /* test C/POSIX-style functions --------------------------------------------- */
740
741 /* bit flags */
742 #define ISAL 1
743 #define ISLO 2
744 #define ISUP 4
745
746 #define ISDI 8
747 #define ISXD 0x10
748
749 #define ISAN 0x20
750
751 #define ISPU 0x40
752 #define ISGR 0x80
753 #define ISPR 0x100
754
755 #define ISSP 0x200
756 #define ISBL 0x400
757 #define ISCN 0x800
758
759 /* C/POSIX-style functions, in the same order as the bit flags */
760 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
761
762 static const struct {
763 IsPOSIXClass *fn;
764 const char *name;
765 } posixClasses[]={
766 { u_isalpha, "isalpha" },
767 { u_islower, "islower" },
768 { u_isupper, "isupper" },
769 { u_isdigit, "isdigit" },
770 { u_isxdigit, "isxdigit" },
771 { u_isalnum, "isalnum" },
772 { u_ispunct, "ispunct" },
773 { u_isgraph, "isgraph" },
774 { u_isprint, "isprint" },
775 { u_isspace, "isspace" },
776 { u_isblank, "isblank" },
777 { u_iscntrl, "iscntrl" }
778 };
779
780 static const struct {
781 UChar32 c;
782 uint32_t posixResults;
783 } posixData[]={
784 { 0x0008, ISCN }, /* backspace */
785 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
786 { 0x000a, ISSP| ISCN }, /* LF */
787 { 0x000c, ISSP| ISCN }, /* FF */
788 { 0x000d, ISSP| ISCN }, /* CR */
789 { 0x0020, ISPR|ISSP|ISBL }, /* space */
790 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
791 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
792 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
793 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
794 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
795 { 0x007b, ISPU|ISGR|ISPR }, /* { */
796 { 0x0085, ISSP| ISCN }, /* NEL */
797 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
798 { 0x00a4, ISGR|ISPR }, /* currency sign */
799 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
800 { 0x0300, ISGR|ISPR }, /* combining grave */
801 { 0x0600, ISCN }, /* arabic number sign */
802 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
803 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
804 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
805 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
806 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
807 { 0x200b, ISCN }, /* ZWSP */
808 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
809 { 0x200e, ISCN }, /* LRM */
810 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
811 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
812 { 0x20ac, ISGR|ISPR }, /* Euro */
813 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
814 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
815 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
816 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
817 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
818 };
819
820 static void
TestPOSIX()821 TestPOSIX() {
822 uint32_t mask;
823 int32_t cl, i;
824 UBool expect;
825
826 mask=1;
827 for(cl=0; cl<12; ++cl) {
828 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
829 expect=(UBool)((posixData[i].posixResults&mask)!=0);
830 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
831 log_err("u_%s(U+%04x)=%s is wrong\n",
832 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
833 }
834 }
835 mask<<=1;
836 }
837 }
838
839 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
TestControlPrint()840 static void TestControlPrint()
841 {
842 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
843 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
844 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
845 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
846 UChar32 c;
847
848 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
849 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
850
851 testSampleCharProps(u_isprint, "u_isprint",
852 samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
853 testSampleCharProps(u_isprint, "u_isprint",
854 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
855
856 /* test all ISO 8 controls */
857 for(c=0; c<=0x9f; ++c) {
858 if(c==0x20) {
859 /* skip ASCII graphic characters and continue with DEL */
860 c=0x7f;
861 }
862 if(!u_iscntrl(c)) {
863 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
864 }
865 if(!u_isISOControl(c)) {
866 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
867 }
868 if(u_isprint(c)) {
869 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
870 }
871 }
872
873 /* test all Latin-1 graphic characters */
874 for(c=0x20; c<=0xff; ++c) {
875 if(c==0x7f) {
876 c=0xa0;
877 } else if(c==0xad) {
878 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
879 ++c;
880 }
881 if(!u_isprint(c)) {
882 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
883 }
884 }
885 }
886
887 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
TestIdentifier()888 static void TestIdentifier()
889 {
890 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
891 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
892 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
893 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
894 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
895 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
896 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
897 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
898 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
899 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
900
901 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
902 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
903 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
904 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
905
906 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
907 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
908 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
909 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
910
911 /* IDPart should imply IDStart */
912 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
913 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
914
915 testSampleCharProps(u_isIDStart, "u_isIDStart",
916 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
917 testSampleCharProps(u_isIDStart, "u_isIDStart",
918 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
919
920 testSampleCharProps(u_isIDPart, "u_isIDPart",
921 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
922 testSampleCharProps(u_isIDPart, "u_isIDPart",
923 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
924
925 /* IDPart should imply IDStart */
926 testSampleCharProps(u_isIDPart, "u_isIDPart",
927 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
928
929 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
930 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
931 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
932 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
933 }
934
935 /* for each line of UnicodeData.txt, check some of the properties */
936 typedef struct UnicodeDataContext {
937 #if UCONFIG_NO_NORMALIZATION
938 const void *dummy;
939 #else
940 const UNormalizer2 *nfc;
941 const UNormalizer2 *nfkc;
942 #endif
943 } UnicodeDataContext;
944
945 /*
946 * ### TODO
947 * This test fails incorrectly if the First or Last code point of a repetitive area
948 * is overridden, which is allowed and is encouraged for the PUAs.
949 * Currently, this means that both area First/Last and override lines are
950 * tested against the properties from the API,
951 * and the area boundary will not match and cause an error.
952 *
953 * This function should detect area boundaries and skip them for the test of individual
954 * code points' properties.
955 * Then it should check that the areas contain all the same properties except where overridden.
956 * For this, it would have had to set a flag for which code points were listed explicitly.
957 */
958 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)959 unicodeDataLineFn(void *context,
960 char *fields[][2], int32_t fieldCount,
961 UErrorCode *pErrorCode)
962 {
963 char buffer[100];
964 const char *d;
965 char *end;
966 uint32_t value;
967 UChar32 c;
968 int32_t i;
969 int8_t type;
970 int32_t dt;
971 UChar dm[32], s[32];
972 int32_t dmLength, length;
973
974 #if !UCONFIG_NO_NORMALIZATION
975 const UNormalizer2 *nfc, *nfkc;
976 #endif
977
978 /* get the character code, field 0 */
979 c=strtoul(fields[0][0], &end, 16);
980 if(end<=fields[0][0] || end!=fields[0][1]) {
981 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
982 return;
983 }
984 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
985 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
986 return;
987 }
988
989 /* get general category, field 2 */
990 *fields[2][1]=0;
991 type = (int8_t)tagValues[MakeProp(fields[2][0])];
992 if(u_charType(c)!=type) {
993 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
994 }
995 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
996 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
997 }
998
999 /* get canonical combining class, field 3 */
1000 value=strtoul(fields[3][0], &end, 10);
1001 if(end<=fields[3][0] || end!=fields[3][1]) {
1002 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1003 return;
1004 }
1005 if(value>255) {
1006 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1007 return;
1008 }
1009 #if !UCONFIG_NO_NORMALIZATION
1010 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1011 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1012 }
1013 nfkc=((UnicodeDataContext *)context)->nfkc;
1014 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1015 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1016 }
1017 #endif
1018
1019 /* get BiDi category, field 4 */
1020 *fields[4][1]=0;
1021 i=MakeDir(fields[4][0]);
1022 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1023 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1024 }
1025
1026 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1027 d=NULL;
1028 if(fields[5][0]==fields[5][1]) {
1029 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1030 if(c==0xac00 || c==0xd7a3) {
1031 dt=U_DT_CANONICAL;
1032 } else {
1033 dt=U_DT_NONE;
1034 }
1035 } else {
1036 d=fields[5][0];
1037 *fields[5][1]=0;
1038 dt=UCHAR_INVALID_CODE;
1039 if(*d=='<') {
1040 end=strchr(++d, '>');
1041 if(end!=NULL) {
1042 *end=0;
1043 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1044 d=u_skipWhitespace(end+1);
1045 }
1046 } else {
1047 dt=U_DT_CANONICAL;
1048 }
1049 }
1050 if(dt>U_DT_NONE) {
1051 if(c==0xac00) {
1052 dm[0]=0x1100;
1053 dm[1]=0x1161;
1054 dm[2]=0;
1055 dmLength=2;
1056 } else if(c==0xd7a3) {
1057 dm[0]=0xd788;
1058 dm[1]=0x11c2;
1059 dm[2]=0;
1060 dmLength=2;
1061 } else {
1062 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1063 }
1064 } else {
1065 dmLength=-1;
1066 }
1067 if(dt<0 || U_FAILURE(*pErrorCode)) {
1068 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1069 return;
1070 }
1071 #if !UCONFIG_NO_NORMALIZATION
1072 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1073 if(i!=dt) {
1074 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1075 }
1076 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1077 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1078 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1079 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1080 "or the Decomposition_Mapping is different (%s)\n",
1081 c, length, dmLength, u_errorName(*pErrorCode));
1082 return;
1083 }
1084 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1085 if(dt!=U_DT_CANONICAL) {
1086 dmLength=-1;
1087 }
1088 nfc=((UnicodeDataContext *)context)->nfc;
1089 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1090 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1091 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1092 "or the Decomposition_Mapping is different (%s)\n",
1093 c, length, dmLength, u_errorName(*pErrorCode));
1094 return;
1095 }
1096 /* recompose */
1097 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1098 UChar32 a, b, composite;
1099 i=0;
1100 U16_NEXT(dm, i, dmLength, a);
1101 U16_NEXT(dm, i, dmLength, b);
1102 /* i==dmLength */
1103 composite=unorm2_composePair(nfc, a, b);
1104 if(composite!=c) {
1105 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1106 (long)c, (long)a, (long)b, (long)composite);
1107 }
1108 /*
1109 * Note: NFKC has fewer round-trip mappings than NFC,
1110 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1111 */
1112 }
1113 #endif
1114
1115 /* get ISO Comment, field 11 */
1116 *fields[11][1]=0;
1117 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1118 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1119 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1120 c, u_errorName(*pErrorCode),
1121 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1122 fields[11][0]);
1123 }
1124
1125 /* get uppercase mapping, field 12 */
1126 if(fields[12][0]!=fields[12][1]) {
1127 value=strtoul(fields[12][0], &end, 16);
1128 if(end!=fields[12][1]) {
1129 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1130 return;
1131 }
1132 if((UChar32)value!=u_toupper(c)) {
1133 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1134 }
1135 } else {
1136 /* no case mapping: the API must map the code point to itself */
1137 if(c!=u_toupper(c)) {
1138 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1139 }
1140 }
1141
1142 /* get lowercase mapping, field 13 */
1143 if(fields[13][0]!=fields[13][1]) {
1144 value=strtoul(fields[13][0], &end, 16);
1145 if(end!=fields[13][1]) {
1146 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1147 return;
1148 }
1149 if((UChar32)value!=u_tolower(c)) {
1150 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1151 }
1152 } else {
1153 /* no case mapping: the API must map the code point to itself */
1154 if(c!=u_tolower(c)) {
1155 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1156 }
1157 }
1158
1159 /* get titlecase mapping, field 14 */
1160 if(fields[14][0]!=fields[14][1]) {
1161 value=strtoul(fields[14][0], &end, 16);
1162 if(end!=fields[14][1]) {
1163 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1164 return;
1165 }
1166 if((UChar32)value!=u_totitle(c)) {
1167 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1168 }
1169 } else {
1170 /* no case mapping: the API must map the code point to itself */
1171 if(c!=u_totitle(c)) {
1172 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1173 }
1174 }
1175 }
1176
1177 static UBool U_CALLCONV
enumTypeRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1178 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1179 static const UChar32 test[][2]={
1180 {0x41, U_UPPERCASE_LETTER},
1181 {0x308, U_NON_SPACING_MARK},
1182 {0xfffe, U_GENERAL_OTHER_TYPES},
1183 {0xe0041, U_FORMAT_CHAR},
1184 {0xeffff, U_UNASSIGNED}
1185 };
1186
1187 int32_t i, count;
1188
1189 if(0!=strcmp((const char *)context, "a1")) {
1190 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1191 return FALSE;
1192 }
1193
1194 count=UPRV_LENGTHOF(test);
1195 for(i=0; i<count; ++i) {
1196 if(start<=test[i][0] && test[i][0]<limit) {
1197 if(type!=(UCharCategory)test[i][1]) {
1198 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1199 start, limit, (long)type, test[i][0], test[i][1]);
1200 }
1201 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1202 return i==(count-1) ? FALSE : TRUE;
1203 }
1204 }
1205
1206 if(start>test[count-1][0]) {
1207 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1208 start, limit, (long)type);
1209 return FALSE;
1210 }
1211
1212 return TRUE;
1213 }
1214
1215 static UBool U_CALLCONV
enumDefaultsRange(const void * context,UChar32 start,UChar32 limit,UCharCategory type)1216 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1217 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1218 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1219 { 0x0590, U_LEFT_TO_RIGHT },
1220 { 0x0600, U_RIGHT_TO_LEFT },
1221 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1222 { 0x08A0, U_RIGHT_TO_LEFT },
1223 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1224 { 0x20A0, U_LEFT_TO_RIGHT },
1225 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1226 { 0xFB1D, U_LEFT_TO_RIGHT },
1227 { 0xFB50, U_RIGHT_TO_LEFT },
1228 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1229 { 0xFE70, U_LEFT_TO_RIGHT },
1230 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1231 { 0x10800, U_LEFT_TO_RIGHT },
1232 { 0x11000, U_RIGHT_TO_LEFT },
1233 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1234 { 0x1EE00, U_RIGHT_TO_LEFT },
1235 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1236 { 0x1F000, U_RIGHT_TO_LEFT },
1237 { 0x110000, U_LEFT_TO_RIGHT }
1238 };
1239
1240 UChar32 c;
1241 int32_t i;
1242 UCharDirection shouldBeDir;
1243
1244 /*
1245 * LineBreak.txt specifies:
1246 * # - Assigned characters that are not listed explicitly are given the value
1247 * # "AL".
1248 * # - Unassigned characters are given the value "XX".
1249 *
1250 * PUA characters are listed explicitly with "XX".
1251 * Verify that no assigned character has "XX".
1252 */
1253 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1254 c=start;
1255 while(c<limit) {
1256 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1257 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1258 }
1259 ++c;
1260 }
1261 }
1262
1263 /*
1264 * Verify default Bidi classes.
1265 * For recent Unicode versions, see UCD.html.
1266 *
1267 * For older Unicode versions:
1268 * See table 3-7 "Bidirectional Character Types" in UAX #9.
1269 * http://www.unicode.org/reports/tr9/
1270 *
1271 * See also DerivedBidiClass.txt for Cn code points!
1272 *
1273 * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html)
1274 * changed some default values.
1275 * In particular, non-characters and unassigned Default Ignorable Code Points
1276 * change from L to BN.
1277 *
1278 * UCD.html version 4.0.1 does not yet reflect these changes.
1279 */
1280 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1281 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1282 c=start;
1283 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1284 if((int32_t)c<defaultBidi[i][0]) {
1285 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1286 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1287 shouldBeDir=U_BOUNDARY_NEUTRAL;
1288 } else {
1289 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1290 }
1291
1292 if( u_charDirection(c)!=shouldBeDir ||
1293 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1294 ) {
1295 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1296 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1297 }
1298 ++c;
1299 }
1300 }
1301 }
1302 }
1303
1304 return TRUE;
1305 }
1306
1307 /* tests for several properties */
TestUnicodeData()1308 static void TestUnicodeData()
1309 {
1310 UVersionInfo expectVersionArray;
1311 UVersionInfo versionArray;
1312 char *fields[15][2];
1313 UErrorCode errorCode;
1314 UChar32 c;
1315 int8_t type;
1316
1317 UnicodeDataContext context;
1318
1319 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1320 u_getUnicodeVersion(versionArray);
1321 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1322 {
1323 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1324 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1325 }
1326
1327 #if defined(ICU_UNICODE_VERSION)
1328 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1329 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1330 {
1331 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1332 }
1333 #endif
1334
1335 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1336 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1337 }
1338
1339 errorCode=U_ZERO_ERROR;
1340 #if !UCONFIG_NO_NORMALIZATION
1341 context.nfc=unorm2_getNFCInstance(&errorCode);
1342 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1343 if(U_FAILURE(errorCode)) {
1344 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1345 return;
1346 }
1347 #endif
1348 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1349 if(U_FAILURE(errorCode)) {
1350 return; /* if we couldn't parse UnicodeData.txt, we should return */
1351 }
1352
1353 /* sanity check on repeated properties */
1354 for(c=0xfffe; c<=0x10ffff;) {
1355 type=u_charType(c);
1356 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1357 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1358 }
1359 if(type!=U_UNASSIGNED) {
1360 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1361 }
1362 if((c&0xffff)==0xfffe) {
1363 ++c;
1364 } else {
1365 c+=0xffff;
1366 }
1367 }
1368
1369 /* test that PUA is not "unassigned" */
1370 for(c=0xe000; c<=0x10fffd;) {
1371 type=u_charType(c);
1372 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1373 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1374 }
1375 if(type==U_UNASSIGNED) {
1376 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1377 } else if(type!=U_PRIVATE_USE_CHAR) {
1378 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1379 }
1380 if(c==0xf8ff) {
1381 c=0xf0000;
1382 } else if(c==0xffffd) {
1383 c=0x100000;
1384 } else {
1385 ++c;
1386 }
1387 }
1388
1389 /* test u_enumCharTypes() */
1390 u_enumCharTypes(enumTypeRange, "a1");
1391
1392 /* check default properties */
1393 u_enumCharTypes(enumDefaultsRange, NULL);
1394 }
1395
TestCodeUnit()1396 static void TestCodeUnit(){
1397 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1398
1399 int32_t i;
1400
1401 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1402 UChar c=codeunit[i];
1403 if(i<4){
1404 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1405 log_err("ERROR: U+%04x is a single", c);
1406 }
1407
1408 }
1409 if(i >= 4 && i< 8){
1410 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1411 log_err("ERROR: U+%04x is a first surrogate", c);
1412 }
1413 }
1414 if(i >= 8 && i< 12){
1415 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1416 log_err("ERROR: U+%04x is a second surrogate", c);
1417 }
1418 }
1419 }
1420
1421 }
1422
TestCodePoint()1423 static void TestCodePoint(){
1424 const UChar32 codePoint[]={
1425 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1426 0xd800,
1427 0xdbff,
1428 0xdc00,
1429 0xdfff,
1430 0xdc04,
1431 0xd821,
1432 /*not a surrogate, valid, isUnicodeChar , not Error*/
1433 0x20ac,
1434 0xd7ff,
1435 0xe000,
1436 0xe123,
1437 0x0061,
1438 0xe065,
1439 0x20402,
1440 0x24506,
1441 0x23456,
1442 0x20402,
1443 0x10402,
1444 0x23456,
1445 /*not a surrogate, not valid, isUnicodeChar, isError */
1446 0x0015,
1447 0x009f,
1448 /*not a surrogate, not valid, not isUnicodeChar, isError */
1449 0xffff,
1450 0xfffe,
1451 };
1452 int32_t i;
1453 for(i=0; i<UPRV_LENGTHOF(codePoint); i++){
1454 UChar32 c=codePoint[i];
1455 if(i<6){
1456 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
1457 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1458 }
1459 if(UTF_IS_VALID(c)){
1460 log_err("ERROR: isValid() failed for U+%04x\n", c);
1461 }
1462 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1463 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1464 }
1465 if(UTF_IS_ERROR(c)){
1466 log_err("ERROR: isError() failed for U+%04x\n", c);
1467 }
1468 }else if(i >=6 && i<18){
1469 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1470 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1471 }
1472 if(!UTF_IS_VALID(c)){
1473 log_err("ERROR: isValid() failed for U+%04x\n", c);
1474 }
1475 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1476 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1477 }
1478 if(UTF_IS_ERROR(c)){
1479 log_err("ERROR: isError() failed for U+%04x\n", c);
1480 }
1481 }else if(i >=18 && i<20){
1482 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1483 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1484 }
1485 if(UTF_IS_VALID(c)){
1486 log_err("ERROR: isValid() failed for U+%04x\n", c);
1487 }
1488 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
1489 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1490 }
1491 if(!UTF_IS_ERROR(c)){
1492 log_err("ERROR: isError() failed for U+%04x\n", c);
1493 }
1494 }
1495 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1496 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
1497 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1498 }
1499 if(UTF_IS_VALID(c)){
1500 log_err("ERROR: isValid() failed for U+%04x\n", c);
1501 }
1502 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
1503 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1504 }
1505 if(!UTF_IS_ERROR(c)){
1506 log_err("ERROR: isError() failed for U+%04x\n", c);
1507 }
1508 }
1509 }
1510
1511 if(
1512 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1513 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1514 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1515 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1516 ) {
1517 log_err("error with U_IS_BMP()\n");
1518 }
1519
1520 if(
1521 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1522 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1523 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1524 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1525 ) {
1526 log_err("error with U_IS_SUPPLEMENTARY()\n");
1527 }
1528 }
1529
TestCharLength()1530 static void TestCharLength()
1531 {
1532 const int32_t codepoint[]={
1533 1, 0x0061,
1534 1, 0xe065,
1535 1, 0x20ac,
1536 2, 0x20402,
1537 2, 0x23456,
1538 2, 0x24506,
1539 2, 0x20402,
1540 2, 0x10402,
1541 1, 0xd7ff,
1542 1, 0xe000
1543 };
1544
1545 int32_t i;
1546 UBool multiple;
1547 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1548 UChar32 c=codepoint[i+1];
1549 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
1550 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1551 }
1552 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1553 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1554 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1555 }
1556 }
1557 }
1558
1559 /*internal functions ----*/
MakeProp(char * str)1560 static int32_t MakeProp(char* str)
1561 {
1562 int32_t result = 0;
1563 char* matchPosition =0;
1564
1565 matchPosition = strstr(tagStrings, str);
1566 if (matchPosition == 0)
1567 {
1568 log_err("unrecognized type letter ");
1569 log_err(str);
1570 }
1571 else
1572 result = (int32_t)((matchPosition - tagStrings) / 2);
1573 return result;
1574 }
1575
MakeDir(char * str)1576 static int32_t MakeDir(char* str)
1577 {
1578 int32_t pos = 0;
1579 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1580 if (strcmp(str, dirStrings[pos]) == 0) {
1581 return pos;
1582 }
1583 }
1584 return -1;
1585 }
1586
1587 /* test u_charName() -------------------------------------------------------- */
1588
/*
 * Sample code points with their expected character names per name choice.
 * An empty string means that no name of that kind is expected;
 * alias is NULL where no alias is listed.
 */
static const struct {
    uint32_t code;
    const char *name;
    const char *oldName;
    const char *extName;
    const char *alias;
} names[]={
    {0x0061, "LATIN SMALL LETTER A", "",
             "LATIN SMALL LETTER A",
             NULL},
    {0x01a2, "LATIN CAPITAL LETTER OI", "",
             "LATIN CAPITAL LETTER OI",
             "LATIN CAPITAL LETTER GHA"},
    {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
             "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
             NULL},
    {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
             "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
             "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
    {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "",
             "CJK UNIFIED IDEOGRAPH-3401",
             NULL},
    {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "",
             "CJK UNIFIED IDEOGRAPH-7FED",
             NULL},
    {0xac00, "HANGUL SYLLABLE GA", "",
             "HANGUL SYLLABLE GA",
             NULL},
    {0xd7a3, "HANGUL SYLLABLE HIH", "",
             "HANGUL SYLLABLE HIH",
             NULL},
    {0xd800, "", "",
             "<lead surrogate-D800>",
             NULL},
    {0xdc00, "", "",
             "<trail surrogate-DC00>",
             NULL},
    {0xff08, "FULLWIDTH LEFT PARENTHESIS", "",
             "FULLWIDTH LEFT PARENTHESIS",
             NULL},
    {0xffe5, "FULLWIDTH YEN SIGN", "",
             "FULLWIDTH YEN SIGN",
             NULL},
    {0xffff, "", "",
             "<noncharacter-FFFF>",
             NULL},
    {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
              "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
              "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
    {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "",
              "CJK UNIFIED IDEOGRAPH-23456",
              NULL}
};
1616
1617 static UBool
enumCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1618 enumCharNamesFn(void *context,
1619 UChar32 code, UCharNameChoice nameChoice,
1620 const char *name, int32_t length) {
1621 int32_t *pCount=(int32_t *)context;
1622 const char *expected;
1623 int i;
1624
1625 if(length<=0 || length!=(int32_t)strlen(name)) {
1626 /* should not be called with an empty string or invalid length */
1627 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1628 return TRUE;
1629 }
1630
1631 ++*pCount;
1632 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1633 if(code==(UChar32)names[i].code) {
1634 switch (nameChoice) {
1635 case U_EXTENDED_CHAR_NAME:
1636 if(0!=strcmp(name, names[i].extName)) {
1637 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1638 }
1639 break;
1640 case U_UNICODE_CHAR_NAME:
1641 if(0!=strcmp(name, names[i].name)) {
1642 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1643 }
1644 break;
1645 case U_UNICODE_10_CHAR_NAME:
1646 expected=names[i].oldName;
1647 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1648 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1649 }
1650 break;
1651 case U_CHAR_NAME_ALIAS:
1652 expected=names[i].alias;
1653 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1654 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1655 }
1656 break;
1657 case U_CHAR_NAME_CHOICE_COUNT:
1658 break;
1659 }
1660 break;
1661 }
1662 }
1663 return TRUE;
1664 }
1665
1666 struct enumExtCharNamesContext {
1667 uint32_t length;
1668 int32_t last;
1669 };
1670
1671 static UBool
enumExtCharNamesFn(void * context,UChar32 code,UCharNameChoice nameChoice,const char * name,int32_t length)1672 enumExtCharNamesFn(void *context,
1673 UChar32 code, UCharNameChoice nameChoice,
1674 const char *name, int32_t length) {
1675 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1676
1677 if (ecncp->last != (int32_t) code - 1) {
1678 if (ecncp->last < 0) {
1679 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1680 } else {
1681 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1682 }
1683 }
1684 ecncp->last = (int32_t) code;
1685
1686 if (!*name) {
1687 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1688 }
1689
1690 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1691 }
1692
1693 /**
1694 * This can be made more efficient by moving it into putil.c and having
1695 * it directly access the ebcdic translation tables.
1696 * TODO: If we get this method in putil.c, then delete it from here.
1697 */
1698 static UChar
u_charToUChar(char c)1699 u_charToUChar(char c) {
1700 UChar uc;
1701 u_charsToUChars(&c, &uc, 1);
1702 return uc;
1703 }
1704
1705 static void
TestCharNames()1706 TestCharNames() {
1707 static char name[80];
1708 UErrorCode errorCode=U_ZERO_ERROR;
1709 struct enumExtCharNamesContext extContext;
1710 const char *expected;
1711 int32_t length;
1712 UChar32 c;
1713 int32_t i;
1714
1715 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1716 length=uprv_getMaxCharNameLength();
1717 if(length==0) {
1718 /* no names data available */
1719 return;
1720 }
1721 if(length<83) { /* Unicode 3.2 max char name length */
1722 log_err("uprv_getMaxCharNameLength()=%d is too short");
1723 }
1724 /* ### TODO same tests for max ISO comment length as for max name length */
1725
1726 log_verbose("Testing u_charName()\n");
1727 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1728 /* modern Unicode character name */
1729 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1730 if(U_FAILURE(errorCode)) {
1731 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1732 return;
1733 }
1734 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1735 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1736 }
1737
1738 /* find the modern name */
1739 if (*names[i].name) {
1740 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1741 if(U_FAILURE(errorCode)) {
1742 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1743 return;
1744 }
1745 if(c!=(UChar32)names[i].code) {
1746 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1747 }
1748 }
1749
1750 /* Unicode 1.0 character name */
1751 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1752 if(U_FAILURE(errorCode)) {
1753 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1754 return;
1755 }
1756 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1757 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1758 }
1759
1760 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1761 if(names[i].oldName[0]!=0 /* && length>0 */) {
1762 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1763 if(U_FAILURE(errorCode)) {
1764 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1765 return;
1766 }
1767 if(c!=(UChar32)names[i].code) {
1768 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1769 }
1770 }
1771
1772 /* Unicode character name alias */
1773 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1774 if(U_FAILURE(errorCode)) {
1775 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1776 return;
1777 }
1778 expected=names[i].alias;
1779 if(expected==NULL) {
1780 expected="";
1781 }
1782 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1783 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1784 names[i].code, name, length, expected);
1785 }
1786
1787 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1788 if(expected[0]!=0 /* && length>0 */) {
1789 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1790 if(U_FAILURE(errorCode)) {
1791 log_err("u_charFromName(%s - alias) error %s\n",
1792 expected, u_errorName(errorCode));
1793 return;
1794 }
1795 if(c!=(UChar32)names[i].code) {
1796 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1797 expected, c, names[i].code);
1798 }
1799 }
1800 }
1801
1802 /* test u_enumCharNames() */
1803 length=0;
1804 errorCode=U_ZERO_ERROR;
1805 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1806 if(U_FAILURE(errorCode) || length<94140) {
1807 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1808 }
1809
1810 extContext.length = 0;
1811 extContext.last = -1;
1812 errorCode=U_ZERO_ERROR;
1813 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1814 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1815 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1816 }
1817
1818 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1819 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1820 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1821 }
1822
1823 /* Test getCharNameCharacters */
1824 if(!getTestOption(QUICK_OPTION)) {
1825 enum { BUFSIZE = 256 };
1826 UErrorCode ec = U_ZERO_ERROR;
1827 char buf[BUFSIZE];
1828 int32_t maxLength;
1829 UChar32 cp;
1830 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1831 int32_t l1, l2;
1832 UBool map[256];
1833 UBool ok;
1834
1835 USet* set = uset_open(1, 0); /* empty set */
1836 USet* dumb = uset_open(1, 0); /* empty set */
1837
1838 /*
1839 * uprv_getCharNameCharacters() will likely return more lowercase
1840 * letters than actual character names contain because
1841 * it includes all the characters in lowercased names of
1842 * general categories, for the full possible set of extended names.
1843 */
1844 {
1845 USetAdder sa={
1846 NULL,
1847 uset_add,
1848 uset_addRange,
1849 uset_addString,
1850 NULL /* don't need remove() */
1851 };
1852 sa.set=set;
1853 uprv_getCharNameCharacters(&sa);
1854 }
1855
1856 /* build set the dumb (but sure-fire) way */
1857 for (i=0; i<256; ++i) {
1858 map[i] = FALSE;
1859 }
1860
1861 maxLength=0;
1862 for (cp=0; cp<0x110000; ++cp) {
1863 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1864 buf, BUFSIZE, &ec);
1865 if (U_FAILURE(ec)) {
1866 log_err("FAIL: u_charName failed when it shouldn't\n");
1867 uset_close(set);
1868 uset_close(dumb);
1869 return;
1870 }
1871 if(len>maxLength) {
1872 maxLength=len;
1873 }
1874
1875 for (i=0; i<len; ++i) {
1876 if (!map[(uint8_t) buf[i]]) {
1877 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1878 map[(uint8_t) buf[i]] = TRUE;
1879 }
1880 }
1881
1882 /* test for leading/trailing whitespace */
1883 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1884 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1885 }
1886 }
1887
1888 if(map[(uint8_t)'\t']) {
1889 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1890 }
1891
1892 length=uprv_getMaxCharNameLength();
1893 if(length!=maxLength) {
1894 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1895 length, maxLength);
1896 }
1897
1898 /* compare the sets. Where is my uset_equals?!! */
1899 ok=TRUE;
1900 for(i=0; i<256; ++i) {
1901 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1902 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1903 /* ignore lowercase a-z that are in set but not in dumb */
1904 ok=TRUE;
1905 } else {
1906 ok=FALSE;
1907 break;
1908 }
1909 }
1910 }
1911
1912 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1913 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1914 if (U_FAILURE(ec)) {
1915 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1916 uset_close(set);
1917 uset_close(dumb);
1918 return;
1919 }
1920
1921 if (l1 >= BUFSIZE) {
1922 l1 = BUFSIZE-1;
1923 pat[l1] = 0;
1924 }
1925 if (l2 >= BUFSIZE) {
1926 l2 = BUFSIZE-1;
1927 dumbPat[l2] = 0;
1928 }
1929
1930 if (!ok) {
1931 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1932 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1933 } else if(getTestOption(VERBOSITY_OPTION)) {
1934 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
1935 }
1936
1937 uset_close(set);
1938 uset_close(dumb);
1939 }
1940
1941 /* ### TODO: test error cases and other interesting things */
1942 }
1943
1944 static void
TestUCharFromNameUnderflow()1945 TestUCharFromNameUnderflow() {
1946 // Ticket #10889: Underflow crash when there is no dash.
1947 UErrorCode errorCode=U_ZERO_ERROR;
1948 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<NO BREAK SPACE>", &errorCode);
1949 if(U_SUCCESS(errorCode)) {
1950 log_err("u_charFromName(<NO BREAK SPACE>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1951 }
1952
1953 // Test related edge cases.
1954 errorCode=U_ZERO_ERROR;
1955 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<-00a0>", &errorCode);
1956 if(U_SUCCESS(errorCode)) {
1957 log_err("u_charFromName(<-00a0>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1958 }
1959
1960 errorCode=U_ZERO_ERROR;
1961 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control->", &errorCode);
1962 if(U_SUCCESS(errorCode)) {
1963 log_err("u_charFromName(<control->) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1964 }
1965
1966 errorCode=U_ZERO_ERROR;
1967 c=u_charFromName(U_EXTENDED_CHAR_NAME, "<control-111111>", &errorCode);
1968 if(U_SUCCESS(errorCode)) {
1969 log_err("u_charFromName(<control-111111>) = U+%04x but should fail - %s\n", c, u_errorName(errorCode));
1970 }
1971 }
1972
1973 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
1974
1975 static void
TestMirroring()1976 TestMirroring() {
1977 USet *set;
1978 UErrorCode errorCode;
1979
1980 UChar32 start, end, c2, c3;
1981 int32_t i;
1982
1983 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1984
1985 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
1986
1987 log_verbose("Testing u_isMirrored()\n");
1988 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
1989 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
1990 )
1991 ) {
1992 log_err("u_isMirrored() does not work correctly\n");
1993 }
1994
1995 log_verbose("Testing u_charMirror()\n");
1996 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
1997 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
1998 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
1999 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2000 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2001 )
2002 ) {
2003 log_err("u_charMirror() does not work correctly\n");
2004 }
2005
2006 /* verify that Bidi_Mirroring_Glyph roundtrips */
2007 errorCode=U_ZERO_ERROR;
2008 set=uset_openPattern(mirroredPattern, 17, &errorCode);
2009
2010 if (U_FAILURE(errorCode)) {
2011 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2012 } else {
2013 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2014 do {
2015 c2=u_charMirror(start);
2016 c3=u_charMirror(c2);
2017 if(c3!=start) {
2018 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2019 }
2020 c3=u_getBidiPairedBracket(start);
2021 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2022 if(c3!=start) {
2023 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2024 (long)start);
2025 }
2026 } else {
2027 if(c3!=c2) {
2028 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2029 (long)start, (long)c2);
2030 }
2031 }
2032 } while(++start<=end);
2033 }
2034 }
2035
2036 uset_close(set);
2037 }
2038
2039
2040 struct RunTestData
2041 {
2042 const char *runText;
2043 UScriptCode runCode;
2044 };
2045
2046 typedef struct RunTestData RunTestData;
2047
2048 static void
CheckScriptRuns(UScriptRun * scriptRun,int32_t * runStarts,const RunTestData * testData,int32_t nRuns,const char * prefix)2049 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2050 const char *prefix)
2051 {
2052 int32_t run, runStart, runLimit;
2053 UScriptCode runCode;
2054
2055 /* iterate over all the runs */
2056 run = 0;
2057 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2058 if (runStart != runStarts[run]) {
2059 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2060 prefix, run, runStarts[run], runStart);
2061 }
2062
2063 if (runLimit != runStarts[run + 1]) {
2064 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2065 prefix, run, runStarts[run + 1], runLimit);
2066 }
2067
2068 if (runCode != testData[run].runCode) {
2069 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2070 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2071 }
2072
2073 run += 1;
2074
2075 /* stop when we've seen all the runs we expect to see */
2076 if (run >= nRuns) {
2077 break;
2078 }
2079 }
2080
2081 /* Complain if we didn't see then number of runs we expected */
2082 if (run != nRuns) {
2083 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2084 }
2085 }
2086
2087 static void
TestUScriptRunAPI()2088 TestUScriptRunAPI()
2089 {
2090 static const RunTestData testData1[] = {
2091 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2092 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2093 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2094 {"English (", USCRIPT_LATIN},
2095 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2096 {") ", USCRIPT_LATIN},
2097 {"\\u6F22\\u5B75", USCRIPT_HAN},
2098 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2099 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2100 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2101 };
2102
2103 static const RunTestData testData2[] = {
2104 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2105 };
2106
2107 static const struct {
2108 const RunTestData *testData;
2109 int32_t nRuns;
2110 } testDataEntries[] = {
2111 {testData1, UPRV_LENGTHOF(testData1)},
2112 {testData2, UPRV_LENGTHOF(testData2)}
2113 };
2114
2115 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2116 int32_t testEntry;
2117
2118 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2119 UChar testString[1024];
2120 int32_t runStarts[256];
2121 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2122 const RunTestData *testData = testDataEntries[testEntry].testData;
2123
2124 int32_t run, stringLimit;
2125 UScriptRun *scriptRun = NULL;
2126 UErrorCode err;
2127
2128 /*
2129 * Fill in the test string and the runStarts array.
2130 */
2131 stringLimit = 0;
2132 for (run = 0; run < nTestRuns; run += 1) {
2133 runStarts[run] = stringLimit;
2134 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2135 /*stringLimit -= 1;*/
2136 }
2137
2138 /* The limit of the last run */
2139 runStarts[nTestRuns] = stringLimit;
2140
2141 /*
2142 * Make sure that calling uscript_OpenRun with a NULL text pointer
2143 * and a non-zero text length returns the correct error.
2144 */
2145 err = U_ZERO_ERROR;
2146 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2147
2148 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2149 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2150 }
2151
2152 if (scriptRun != NULL) {
2153 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2154 uscript_closeRun(scriptRun);
2155 }
2156
2157 /*
2158 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2159 * and a zero text length returns the correct error.
2160 */
2161 err = U_ZERO_ERROR;
2162 scriptRun = uscript_openRun(testString, 0, &err);
2163
2164 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2165 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2166 }
2167
2168 if (scriptRun != NULL) {
2169 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2170 uscript_closeRun(scriptRun);
2171 }
2172
2173 /*
2174 * Make sure that calling uscript_openRun with a NULL text pointer
2175 * and a zero text length doesn't return an error.
2176 */
2177 err = U_ZERO_ERROR;
2178 scriptRun = uscript_openRun(NULL, 0, &err);
2179
2180 if (U_FAILURE(err)) {
2181 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2182 }
2183
2184 /* Make sure that the empty iterator doesn't find any runs */
2185 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2186 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2187 }
2188
2189 /*
2190 * Make sure that calling uscript_setRunText with a NULL text pointer
2191 * and a non-zero text length returns the correct error.
2192 */
2193 err = U_ZERO_ERROR;
2194 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2195
2196 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2197 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2198 }
2199
2200 /*
2201 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2202 * and a zero text length returns the correct error.
2203 */
2204 err = U_ZERO_ERROR;
2205 uscript_setRunText(scriptRun, testString, 0, &err);
2206
2207 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2208 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2209 }
2210
2211 /*
2212 * Now call uscript_setRunText on the empty iterator
2213 * and make sure that it works.
2214 */
2215 err = U_ZERO_ERROR;
2216 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2217
2218 if (U_FAILURE(err)) {
2219 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2220 } else {
2221 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2222 }
2223
2224 uscript_closeRun(scriptRun);
2225
2226 /*
2227 * Now open an interator over the testString
2228 * using uscript_openRun and make sure that it works
2229 */
2230 scriptRun = uscript_openRun(testString, stringLimit, &err);
2231
2232 if (U_FAILURE(err)) {
2233 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2234 } else {
2235 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2236 }
2237
2238 /* Now reset the iterator, and make sure
2239 * that it still works.
2240 */
2241 uscript_resetRun(scriptRun);
2242
2243 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2244
2245 /* Close the iterator */
2246 uscript_closeRun(scriptRun);
2247 }
2248 }
2249
2250 /* test additional, non-core properties */
2251 static void
TestAdditionalProperties()2252 TestAdditionalProperties() {
2253 /* test data for u_charAge() */
2254 static const struct {
2255 UChar32 c;
2256 UVersionInfo version;
2257 } charAges[]={
2258 {0x41, { 1, 1, 0, 0 }},
2259 {0xffff, { 1, 1, 0, 0 }},
2260 {0x20ab, { 2, 0, 0, 0 }},
2261 {0x2fffe, { 2, 0, 0, 0 }},
2262 {0x20ac, { 2, 1, 0, 0 }},
2263 {0xfb1d, { 3, 0, 0, 0 }},
2264 {0x3f4, { 3, 1, 0, 0 }},
2265 {0x10300, { 3, 1, 0, 0 }},
2266 {0x220, { 3, 2, 0, 0 }},
2267 {0xff60, { 3, 2, 0, 0 }}
2268 };
2269
2270 /* test data for u_hasBinaryProperty() */
2271 static const int32_t
2272 props[][3]={ /* code point, property, value */
2273 { 0x0627, UCHAR_ALPHABETIC, TRUE },
2274 { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2275 { 0x2028, UCHAR_ALPHABETIC, FALSE },
2276
2277 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2278 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2279
2280 { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2281 { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2282
2283 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2284 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2285
2286 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2287 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2288 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2289 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2290 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2291
2292 { 0x058a, UCHAR_DASH, TRUE },
2293 { 0x007e, UCHAR_DASH, FALSE },
2294
2295 { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2296 { 0x3000, UCHAR_DIACRITIC, FALSE },
2297
2298 { 0x0e46, UCHAR_EXTENDER, TRUE },
2299 { 0x0020, UCHAR_EXTENDER, FALSE },
2300
2301 #if !UCONFIG_NO_NORMALIZATION
2302 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2303 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2304 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2305
2306 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */
2307 { 0x0308, UCHAR_NFD_INERT, FALSE },
2308
2309 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */
2310 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */
2311
2312 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */
2313 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */
2314 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */
2315 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */
2316 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */
2317 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */
2318
2319 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */
2320 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */
2321
2322 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2323 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2324 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2325 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2326 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2327 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2328 #endif
2329
2330 { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2331 { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2332 { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2333
2334 { 0x30fb, UCHAR_HYPHEN, TRUE },
2335 { 0xfe58, UCHAR_HYPHEN, FALSE },
2336
2337 { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2338 { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2339 { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2340
2341 { 0x2172, UCHAR_ID_START, TRUE },
2342 { 0x007a, UCHAR_ID_START, TRUE },
2343 { 0x0039, UCHAR_ID_START, FALSE },
2344
2345 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2346 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2347 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2348
2349 { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2350 { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2351
2352 { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2353 { 0x0345, UCHAR_LOWERCASE, TRUE },
2354 { 0x0030, UCHAR_LOWERCASE, FALSE },
2355
2356 { 0x1d7a9, UCHAR_MATH, TRUE },
2357 { 0x2135, UCHAR_MATH, TRUE },
2358 { 0x0062, UCHAR_MATH, FALSE },
2359
2360 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2361 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2362 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2363
2364 { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2365 { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2366 { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2367
2368 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2369 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2370
2371 { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2372 { 0x2162, UCHAR_UPPERCASE, TRUE },
2373 { 0x0345, UCHAR_UPPERCASE, FALSE },
2374
2375 { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2376 { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2377 { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2378
2379 { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2380 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2381 { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2382
2383 { 0x16ee, UCHAR_XID_START, TRUE },
2384 { 0x23456, UCHAR_XID_START, TRUE },
2385 { 0x1d1aa, UCHAR_XID_START, FALSE },
2386
2387 /*
2388 * Version break:
2389 * The following properties are only supported starting with the
2390 * Unicode version indicated in the second field.
2391 */
2392 { -1, 0x320, 0 },
2393
2394 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2395 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2396 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2397
2398 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */
2399 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */
2400 { 0xe0001, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */
2401 { 0xe0100, UCHAR_DEPRECATED, FALSE },
2402
2403 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2404 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2405 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2406 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2407
2408 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2409 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2410 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2411 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2412
2413 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2414 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2415
2416 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2417 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2418
2419 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2420 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2421
2422 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2423 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2424
2425 { 0x2e9b, UCHAR_RADICAL, TRUE },
2426 { 0x4e00, UCHAR_RADICAL, FALSE },
2427
2428 { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2429 { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2430
2431 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2432 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2433
2434 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2435
2436 { 0x002e, UCHAR_S_TERM, TRUE },
2437 { 0x0061, UCHAR_S_TERM, FALSE },
2438
2439 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2440 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2441 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2442 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2443
2444 /* enum/integer type properties */
2445
2446 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2447 /* test default Bidi classes for unassigned code points */
2448 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2449 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2450 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2451 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2452 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2453 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2454 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2455 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2456 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2457 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2458 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2459
2460 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2461 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2462 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2463 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2464 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2465 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2466 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2467
2468 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2469 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2470 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2471 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2472 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2473 { 0x1CBF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2474 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2475 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2476 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2477 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2478 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2479
2480 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2481 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2482
2483 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2484 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2485 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2486 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2487 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2488 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2489 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2490 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2491 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2492
2493 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2494 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2495 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2496 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2497 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2498 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2499 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2500 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2501 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2502 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2503 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2504 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2505 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2506 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2507 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2508 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2509 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2510
2511 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2512 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2513 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2514
2515 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2516 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2517 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2518 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2519 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2520
2521 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2522 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2523 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2524 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2525 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2526 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2527 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2528 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2529
2530 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2531 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2532 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2533 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2534 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2535 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2536 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2537 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2538 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2539 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2540 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2541 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2542 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2543 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2544 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2545 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2546
2547 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2548
2549 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */
2550
2551 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2552 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2553 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2554 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2555 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2556 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2557 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2558
2559 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2560 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2561 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2562 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2563
2564 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2565 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2566 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2567 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2568 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2569 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2570
2571 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2572 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2573 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2574 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2575
2576 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2577 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2578 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2579 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2580 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2581 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2582 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2583
2584 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2585 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2586 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2587 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2588
2589 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2590 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2591 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2592 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2593
2594 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2595 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2596 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2597 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2598 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2599
2600 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2601
2602 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2603
2604 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2605 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2606 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2607
2608 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2609 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2610 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2611 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2612 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2613
2614 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2615 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2616 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2617
2618 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2619 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2620 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2621 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2622
2623 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2624 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2625 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2626 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2627 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2628 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2629
2630 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2631 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2632 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2633 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2634
2635 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2636 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2637 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2638 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2639
2640 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2641 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2642 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2643 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2644
2645 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2646
2647 /* unassigned code points in new default Bidi R blocks */
2648 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2649 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2650
2651 /* test some script codes >127 */
2652 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2653 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2654 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2655
2656 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2657
2658 /* value changed in Unicode 6.0 */
2659 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2660
2661 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2662
2663 /* unassigned code points in new/changed default Bidi AL blocks */
2664 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2665 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2666
2667 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2668
2669 /* unassigned code points in the currency symbols block now default to ET */
2670 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2671 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2672
2673 /* new property in Unicode 6.3 */
2674 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2675 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2676 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2677 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2678 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2679 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2680
2681 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2682
2683 /* new character range with Joining_Group values */
2684 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2685 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2686 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2687 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2688 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2689
2690 /* undefined UProperty values */
2691 { 0x61, 0x4a7, 0 },
2692 { 0x234bc, 0x15ed, 0 }
2693 };
2694
2695 UVersionInfo version;
2696 UChar32 c;
2697 int32_t i, result, uVersion;
2698 UProperty which;
2699
2700 /* what is our Unicode version? */
2701 u_getUnicodeVersion(version);
2702 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2703
2704 u_charAge(0x20, version);
2705 if(version[0]==0) {
2706 /* no additional properties available */
2707 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2708 return;
2709 }
2710
2711 /* test u_charAge() */
2712 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2713 u_charAge(charAges[i].c, version);
2714 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2715 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2716 charAges[i].c,
2717 version[0], version[1], version[2], version[3],
2718 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2719 }
2720 }
2721
2722 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2723 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2724 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2725 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2726 u_getIntPropertyMinValue(0x2345)!=0
2727 ) {
2728 log_err("error: u_getIntPropertyMinValue() wrong\n");
2729 }
2730 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2731 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2732 }
2733 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2734 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2735 }
2736 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2737 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2738 }
2739 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2740 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2741 }
2742 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2743 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2744 }
2745 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2746 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2747 }
2748 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2749 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2750 }
2751 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2752 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2753 }
2754 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2755 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2756 }
2757 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2758 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2759 }
2760 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2761 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2762 }
2763 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2764 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2765 }
2766 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2767 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2768 }
2769 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2770 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2771 }
2772 /*JB#2410*/
2773 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2774 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2775 }
2776 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2777 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2778 }
2779 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2780 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2781 }
2782 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2783 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2784 }
2785 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2786 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2787 }
2788
2789 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2790 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2791 const char *whichName;
2792
2793 if(props[i][0]<0) {
2794 /* Unicode version break */
2795 if(uVersion<props[i][1]) {
2796 break; /* do not test properties that are not yet supported */
2797 } else {
2798 continue; /* skip this row */
2799 }
2800 }
2801
2802 c=(UChar32)props[i][0];
2803 which=(UProperty)props[i][1];
2804 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2805
2806 if(which<UCHAR_INT_START) {
2807 result=u_hasBinaryProperty(c, which);
2808 if(result!=props[i][2]) {
2809 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2810 c, whichName, result, i);
2811 }
2812 }
2813
2814 result=u_getIntPropertyValue(c, which);
2815 if(result!=props[i][2]) {
2816 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2817 c, whichName, result, props[i][2], i);
2818 }
2819
2820 /* test separate functions, too */
2821 switch((UProperty)props[i][1]) {
2822 case UCHAR_ALPHABETIC:
2823 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2824 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2825 props[i][0], result, i);
2826 }
2827 break;
2828 case UCHAR_LOWERCASE:
2829 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2830 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2831 props[i][0], result, i);
2832 }
2833 break;
2834 case UCHAR_UPPERCASE:
2835 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2836 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2837 props[i][0], result, i);
2838 }
2839 break;
2840 case UCHAR_WHITE_SPACE:
2841 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2842 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2843 props[i][0], result, i);
2844 }
2845 break;
2846 default:
2847 break;
2848 }
2849 }
2850 }
2851
2852 static void
TestNumericProperties(void)2853 TestNumericProperties(void) {
2854 /* see UnicodeData.txt, DerivedNumericValues.txt */
2855 static const struct {
2856 UChar32 c;
2857 int32_t type;
2858 double numValue;
2859 } values[]={
2860 { 0x0F33, U_NT_NUMERIC, -1./2. },
2861 { 0x0C66, U_NT_DECIMAL, 0 },
2862 { 0x96f6, U_NT_NUMERIC, 0 },
2863 { 0xa833, U_NT_NUMERIC, 1./16. },
2864 { 0x2152, U_NT_NUMERIC, 1./10. },
2865 { 0x2151, U_NT_NUMERIC, 1./9. },
2866 { 0x1245f, U_NT_NUMERIC, 1./8. },
2867 { 0x2150, U_NT_NUMERIC, 1./7. },
2868 { 0x2159, U_NT_NUMERIC, 1./6. },
2869 { 0x09f6, U_NT_NUMERIC, 3./16. },
2870 { 0x2155, U_NT_NUMERIC, 1./5. },
2871 { 0x00BD, U_NT_NUMERIC, 1./2. },
2872 { 0x0031, U_NT_DECIMAL, 1. },
2873 { 0x4e00, U_NT_NUMERIC, 1. },
2874 { 0x58f1, U_NT_NUMERIC, 1. },
2875 { 0x10320, U_NT_NUMERIC, 1. },
2876 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2877 { 0x00B2, U_NT_DIGIT, 2. },
2878 { 0x5f10, U_NT_NUMERIC, 2. },
2879 { 0x1813, U_NT_DECIMAL, 3. },
2880 { 0x5f0e, U_NT_NUMERIC, 3. },
2881 { 0x2173, U_NT_NUMERIC, 4. },
2882 { 0x8086, U_NT_NUMERIC, 4. },
2883 { 0x278E, U_NT_DIGIT, 5. },
2884 { 0x1D7F2, U_NT_DECIMAL, 6. },
2885 { 0x247A, U_NT_DIGIT, 7. },
2886 { 0x7396, U_NT_NUMERIC, 9. },
2887 { 0x1372, U_NT_NUMERIC, 10. },
2888 { 0x216B, U_NT_NUMERIC, 12. },
2889 { 0x16EE, U_NT_NUMERIC, 17. },
2890 { 0x249A, U_NT_NUMERIC, 19. },
2891 { 0x303A, U_NT_NUMERIC, 30. },
2892 { 0x5345, U_NT_NUMERIC, 30. },
2893 { 0x32B2, U_NT_NUMERIC, 37. },
2894 { 0x1375, U_NT_NUMERIC, 40. },
2895 { 0x10323, U_NT_NUMERIC, 50. },
2896 { 0x0BF1, U_NT_NUMERIC, 100. },
2897 { 0x964c, U_NT_NUMERIC, 100. },
2898 { 0x217E, U_NT_NUMERIC, 500. },
2899 { 0x2180, U_NT_NUMERIC, 1000. },
2900 { 0x4edf, U_NT_NUMERIC, 1000. },
2901 { 0x2181, U_NT_NUMERIC, 5000. },
2902 { 0x137C, U_NT_NUMERIC, 10000. },
2903 { 0x4e07, U_NT_NUMERIC, 10000. },
2904 { 0x12432, U_NT_NUMERIC, 216000. },
2905 { 0x12433, U_NT_NUMERIC, 432000. },
2906 { 0x4ebf, U_NT_NUMERIC, 100000000. },
2907 { 0x5146, U_NT_NUMERIC, 1000000000000. },
2908 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
2909 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
2910 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
2911 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
2912 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
2913 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
2914 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
2915 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
2916 };
2917
2918 double nv;
2919 UChar32 c;
2920 int32_t i, type;
2921
2922 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
2923 c=values[i].c;
2924 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
2925 nv=u_getNumericValue(c);
2926
2927 if(type!=values[i].type) {
2928 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
2929 }
2930 if(0.000001 <= fabs(nv - values[i].numValue)) {
2931 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
2932 }
2933 }
2934 }
2935
2936 /**
2937 * Test the property names and property value names API.
2938 */
2939 static void
TestPropertyNames(void)2940 TestPropertyNames(void) {
2941 int32_t p, v, choice=0, rev;
2942 UBool atLeastSomething = FALSE;
2943
2944 for (p=0; ; ++p) {
2945 UProperty propEnum = (UProperty)p;
2946 UBool sawProp = FALSE;
2947 if(p > 10 && !atLeastSomething) {
2948 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
2949 return;
2950 }
2951
2952 for (choice=0; ; ++choice) {
2953 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
2954 if (name) {
2955 if (!sawProp)
2956 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
2957 log_verbose("%d=\"%s\"", choice, name);
2958 sawProp = TRUE;
2959 atLeastSomething = TRUE;
2960
2961 /* test reverse mapping */
2962 rev = u_getPropertyEnum(name);
2963 if (rev != p) {
2964 log_err("Property round-trip failure: %d -> %s -> %d\n",
2965 p, name, rev);
2966 }
2967 }
2968 if (!name && choice>0) break;
2969 }
2970 if (sawProp) {
2971 /* looks like a valid property; check the values */
2972 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
2973 int32_t max = 0;
2974 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
2975 max = 255;
2976 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
2977 /* it's far too slow to iterate all the way up to
2978 the real max, U_GC_P_MASK */
2979 max = U_GC_NL_MASK;
2980 } else if (p == UCHAR_BLOCK) {
2981 /* UBlockCodes, unlike other values, start at 1 */
2982 max = 1;
2983 }
2984 log_verbose("\n");
2985 for (v=-1; ; ++v) {
2986 UBool sawValue = FALSE;
2987 for (choice=0; ; ++choice) {
2988 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
2989 if (vname) {
2990 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
2991 log_verbose("%d=\"%s\"", choice, vname);
2992 sawValue = TRUE;
2993
2994 /* test reverse mapping */
2995 rev = u_getPropertyValueEnum(propEnum, vname);
2996 if (rev != v) {
2997 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
2998 pname, v, vname, rev);
2999 }
3000 }
3001 if (!vname && choice>0) break;
3002 }
3003 if (sawValue) {
3004 log_verbose("\n");
3005 }
3006 if (!sawValue && v>=max) break;
3007 }
3008 }
3009 if (!sawProp) {
3010 if (p>=UCHAR_STRING_LIMIT) {
3011 break;
3012 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3013 p = UCHAR_STRING_START - 1;
3014 } else if (p>=UCHAR_MASK_LIMIT) {
3015 p = UCHAR_DOUBLE_START - 1;
3016 } else if (p>=UCHAR_INT_LIMIT) {
3017 p = UCHAR_MASK_START - 1;
3018 } else if (p>=UCHAR_BINARY_LIMIT) {
3019 p = UCHAR_INT_START - 1;
3020 }
3021 }
3022 }
3023 }
3024
3025 /**
3026 * Test the property values API. See JB#2410.
3027 */
3028 static void
TestPropertyValues(void)3029 TestPropertyValues(void) {
3030 int32_t i, p, min, max;
3031 UErrorCode ec;
3032
3033 /* Min should be 0 for everything. */
3034 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3035 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3036 UProperty propEnum = (UProperty)p;
3037 min = u_getIntPropertyMinValue(propEnum);
3038 if (min != 0) {
3039 if (p == UCHAR_BLOCK) {
3040 /* This is okay...for now. See JB#2487.
3041 TODO Update this for JB#2487. */
3042 } else {
3043 const char* name;
3044 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3045 if (name == NULL)
3046 name = "<ERROR>";
3047 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3048 name, min);
3049 }
3050 }
3051 }
3052
3053 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3054 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3055 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3056 }
3057
3058 /* Max should be -1 for invalid properties. */
3059 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3060 if (max != -1) {
3061 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3062 max);
3063 }
3064
3065 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3066 for (i=0; i<2; ++i) {
3067 int32_t script;
3068 const char* desc;
3069 ec = U_ZERO_ERROR;
3070 switch (i) {
3071 case 0:
3072 script = uscript_getScript(-1, &ec);
3073 desc = "uscript_getScript(-1)";
3074 break;
3075 case 1:
3076 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3077 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3078 break;
3079 default:
3080 log_err("Internal test error. Too many scripts\n");
3081 return;
3082 }
3083 /* We don't explicitly test ec. It should be U_FAILURE but it
3084 isn't documented as such. */
3085 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3086 log_err("FAIL: %s = %d, exp. 0\n",
3087 desc, script);
3088 }
3089 }
3090 }
3091
3092 /* various tests for consistency of UCD data and API behavior */
3093 static void
TestConsistency()3094 TestConsistency() {
3095 char buffer[300];
3096 USet *set1, *set2, *set3, *set4;
3097 UErrorCode errorCode;
3098
3099 UChar32 start, end;
3100 int32_t i, length;
3101
3102 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3103 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3104 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3105 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3106 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3107
3108 U_STRING_DECL(mathBlocksPattern,
3109 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3110 214);
3111 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3112 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3113 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3114 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3115
3116 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3117 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3118 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3119 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3120 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3121
3122 U_STRING_INIT(mathBlocksPattern,
3123 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3124 214);
3125 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3126 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3127 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3128 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3129
3130 /*
3131 * It used to be that UCD.html and its precursors said
3132 * "Those dashes used to mark connections between pieces of words,
3133 * plus the Katakana middle dot."
3134 *
3135 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3136 * but not from Hyphen.
3137 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3138 * Therefore, do not show errors when testing the Hyphen property.
3139 */
3140 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3141 "known to the UTC and not considered errors.\n");
3142
3143 errorCode=U_ZERO_ERROR;
3144 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3145 set2=uset_openPattern(dashPattern, 8, &errorCode);
3146 if(U_SUCCESS(errorCode)) {
3147 /* remove the Katakana middle dot(s) from set1 */
3148 uset_remove(set1, 0x30fb);
3149 uset_remove(set1, 0xff65); /* halfwidth variant */
3150 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3151 } else {
3152 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3153 }
3154
3155 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3156 set3=uset_openPattern(formatPattern, 6, &errorCode);
3157 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3158 if(U_SUCCESS(errorCode)) {
3159 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3160 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3161 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3162 } else {
3163 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3164 }
3165
3166 uset_close(set1);
3167 uset_close(set2);
3168 uset_close(set3);
3169 uset_close(set4);
3170
3171 /*
3172 * Check that each lowercase character has "small" in its name
3173 * and not "capital".
3174 * There are some such characters, some of which seem odd.
3175 * Use the verbose flag to see these notices.
3176 */
3177 errorCode=U_ZERO_ERROR;
3178 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3179 if(U_SUCCESS(errorCode)) {
3180 for(i=0;; ++i) {
3181 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3182 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3183 break; /* done */
3184 }
3185 if(U_FAILURE(errorCode)) {
3186 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3187 i, u_errorName(errorCode));
3188 break;
3189 }
3190 if(length!=0) {
3191 break; /* done with code points, got a string or -1 */
3192 }
3193
3194 while(start<=end) {
3195 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3196 if(U_FAILURE(errorCode)) {
3197 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3198 errorCode=U_ZERO_ERROR;
3199 }
3200 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3201 strstr(buffer, "SMALL CAPITAL")==NULL
3202 ) {
3203 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3204 }
3205 ++start;
3206 }
3207 }
3208 } else {
3209 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3210 }
3211 uset_close(set1);
3212
3213 /* verify that all assigned characters in Math blocks are exactly Math characters */
3214 errorCode=U_ZERO_ERROR;
3215 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3216 set2=uset_openPattern(mathPattern, 8, &errorCode);
3217 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3218 if(U_SUCCESS(errorCode)) {
3219 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3220 uset_complement(set3); /* assigned characters */
3221 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3222 compareUSets(set1, set2,
3223 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3224 TRUE);
3225 } else {
3226 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3227 }
3228 uset_close(set1);
3229 uset_close(set2);
3230 uset_close(set3);
3231
3232 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3233 errorCode=U_ZERO_ERROR;
3234 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3235 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3236 if(U_SUCCESS(errorCode)) {
3237 compareUSets(set1, set2,
3238 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3239 TRUE);
3240 } else {
3241 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3242 }
3243 uset_close(set1);
3244 uset_close(set2);
3245 }
3246
/*
 * Starting with ICU4C 3.4, the core Unicode properties files
 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu)
 * are hardcoded in the common DLL and therefore not included
 * in the data package any more.
 * Tests requiring these files are disabled so that
 * we need not jump through hoops (like adding snapshots of these files
 * to testdata).
 * See Jitterbug 4497.
 *
 * While this macro is non-zero, the bodies of TestUCase() and
 * TestUBiDiProps() (guarded by #if !HARDCODED_DATA_4497) are compiled out.
 */
#define HARDCODED_DATA_4497 1
3258
3259 /* API coverage for ucase.c */
TestUCase()3260 static void TestUCase() {
3261 #if !HARDCODED_DATA_4497
3262 UDataMemory *pData;
3263 UCaseProps *csp;
3264 const UCaseProps *ccsp;
3265 UErrorCode errorCode;
3266
3267 /* coverage for ucase_openBinary() */
3268 errorCode=U_ZERO_ERROR;
3269 pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode);
3270 if(U_FAILURE(errorCode)) {
3271 log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3272 u_errorName(errorCode));
3273 return;
3274 }
3275
3276 csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3277 if(U_FAILURE(errorCode)) {
3278 log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
3279 u_errorName(errorCode));
3280 udata_close(pData);
3281 return;
3282 }
3283
3284 if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */
3285 log_err("ucase_openBinary() does not seem to return working UCaseProps\n");
3286 }
3287
3288 ucase_close(csp);
3289 udata_close(pData);
3290
3291 /* coverage for ucase_getDummy() */
3292 errorCode=U_ZERO_ERROR;
3293 ccsp=ucase_getDummy(&errorCode);
3294 if(ucase_tolower(ccsp, 0x41)!=0x41) {
3295 log_err("ucase_tolower(dummy, A)!=A\n");
3296 }
3297 #endif
3298 }
3299
3300 /* API coverage for ubidi_props.c */
TestUBiDiProps()3301 static void TestUBiDiProps() {
3302 #if !HARDCODED_DATA_4497
3303 UDataMemory *pData;
3304 UBiDiProps *bdp;
3305 const UBiDiProps *cbdp;
3306 UErrorCode errorCode;
3307
3308 /* coverage for ubidi_openBinary() */
3309 errorCode=U_ZERO_ERROR;
3310 pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode);
3311 if(U_FAILURE(errorCode)) {
3312 log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3313 u_errorName(errorCode));
3314 return;
3315 }
3316
3317 bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
3318 if(U_FAILURE(errorCode)) {
3319 log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n",
3320 u_errorName(errorCode));
3321 udata_close(pData);
3322 return;
3323 }
3324
3325 if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */
3326 log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n");
3327 }
3328
3329 ubidi_closeProps(bdp);
3330 udata_close(pData);
3331
3332 /* coverage for ubidi_getDummy() */
3333 errorCode=U_ZERO_ERROR;
3334 cbdp=ubidi_getDummy(&errorCode);
3335 if(ubidi_getClass(cbdp, 0x20)!=0) {
3336 log_err("ubidi_getClass(dummy, space)!=0\n");
3337 }
3338 #endif
3339 }
3340
3341 /* test case folding, compare return values with CaseFolding.txt ------------ */
3342
/*
 * Bit flags recording which kinds of case folding have already been
 * tested for a character; CF_ALL is the union of the three flags.
 */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL=1<<1,
    CF_TURKIC=1<<2,
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3350
3351 static void
testFold(UChar32 c,int which,UChar32 simple,UChar32 turkic,const UChar * full,int32_t fullLength,const UChar * turkicFull,int32_t turkicFullLength)3352 testFold(UChar32 c, int which,
3353 UChar32 simple, UChar32 turkic,
3354 const UChar *full, int32_t fullLength,
3355 const UChar *turkicFull, int32_t turkicFullLength) {
3356 UChar s[2], t[32];
3357 UChar32 c2;
3358 int32_t length, length2;
3359
3360 UErrorCode errorCode=U_ZERO_ERROR;
3361
3362 length=0;
3363 U16_APPEND_UNSAFE(s, length, c);
3364
3365 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3366 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3367 }
3368 if((which&CF_FULL)!=0) {
3369 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3370 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3371 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3372 }
3373 }
3374 if((which&CF_TURKIC)!=0) {
3375 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3376 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3377 }
3378
3379 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3380 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3381 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3382 }
3383 }
3384 }
3385
3386 /* test that c case-folds to itself */
3387 static void
testFoldToSelf(UChar32 c,int which)3388 testFoldToSelf(UChar32 c, int which) {
3389 UChar s[2];
3390 int32_t length;
3391
3392 length=0;
3393 U16_APPEND_UNSAFE(s, length, c);
3394 testFold(c, which, c, c, s, length, s, length);
3395 }
3396
3397 struct CaseFoldingData {
3398 USet *notSeen;
3399 UChar32 prev, prevSimple;
3400 UChar prevFull[32];
3401 int32_t prevFullLength;
3402 int which;
3403 };
3404 typedef struct CaseFoldingData CaseFoldingData;
3405
3406 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)3407 caseFoldingLineFn(void *context,
3408 char *fields[][2], int32_t fieldCount,
3409 UErrorCode *pErrorCode) {
3410 CaseFoldingData *pData=(CaseFoldingData *)context;
3411 char *end;
3412 UChar full[32];
3413 UChar32 c, prev, simple;
3414 int32_t count;
3415 int which;
3416 char status;
3417
3418 /* get code point */
3419 const char *s=u_skipWhitespace(fields[0][0]);
3420 if(0==strncmp(s, "0000..10FFFF", 12)) {
3421 /*
3422 * Ignore the line
3423 * # @missing: 0000..10FFFF; C; <code point>
3424 * because maps-to-self is already our default, and this line breaks this parser.
3425 */
3426 return;
3427 }
3428 c=(UChar32)strtoul(s, &end, 16);
3429 end=(char *)u_skipWhitespace(end);
3430 if(end<=fields[0][0] || end!=fields[0][1]) {
3431 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3432 *pErrorCode=U_PARSE_ERROR;
3433 return;
3434 }
3435
3436 /* get the status of this mapping */
3437 status=*u_skipWhitespace(fields[1][0]);
3438 if(status!='C' && status!='S' && status!='F' && status!='T') {
3439 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3440 *pErrorCode=U_PARSE_ERROR;
3441 return;
3442 }
3443
3444 /* get the mapping */
3445 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3446 if(U_FAILURE(*pErrorCode)) {
3447 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3448 return;
3449 }
3450
3451 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3452 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3453 simple=c;
3454 }
3455
3456 if(c!=(prev=pData->prev)) {
3457 /*
3458 * Test remaining mappings for the previous code point.
3459 * If a turkic folding was not mentioned, then it should fold the same
3460 * as the regular simple case folding.
3461 */
3462 UChar prevString[2];
3463 int32_t length;
3464
3465 length=0;
3466 U16_APPEND_UNSAFE(prevString, length, prev);
3467 testFold(prev, (~pData->which)&CF_ALL,
3468 prev, pData->prevSimple,
3469 prevString, length,
3470 pData->prevFull, pData->prevFullLength);
3471 pData->prev=pData->prevSimple=c;
3472 length=0;
3473 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3474 pData->prevFullLength=length;
3475 pData->which=0;
3476 }
3477
3478 /*
3479 * Turn the status into a bit set of case foldings to test.
3480 * Remember non-Turkic case foldings as defaults for Turkic mode.
3481 */
3482 switch(status) {
3483 case 'C':
3484 which=CF_SIMPLE|CF_FULL;
3485 pData->prevSimple=simple;
3486 u_memcpy(pData->prevFull, full, count);
3487 pData->prevFullLength=count;
3488 break;
3489 case 'S':
3490 which=CF_SIMPLE;
3491 pData->prevSimple=simple;
3492 break;
3493 case 'F':
3494 which=CF_FULL;
3495 u_memcpy(pData->prevFull, full, count);
3496 pData->prevFullLength=count;
3497 break;
3498 case 'T':
3499 which=CF_TURKIC;
3500 break;
3501 default:
3502 which=0;
3503 break; /* won't happen because of test above */
3504 }
3505
3506 testFold(c, which, simple, simple, full, count, full, count);
3507
3508 /* remember which case foldings of c have been tested */
3509 pData->which|=which;
3510
3511 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3512 uset_remove(pData->notSeen, c);
3513 }
3514
3515 static void
TestCaseFolding()3516 TestCaseFolding() {
3517 CaseFoldingData data={ NULL };
3518 char *fields[3][2];
3519 UErrorCode errorCode;
3520
3521 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3522
3523 errorCode=U_ZERO_ERROR;
3524 /* test BMP & plane 1 - nothing interesting above */
3525 data.notSeen=uset_open(0, 0x1ffff);
3526 data.prevFullLength=1; /* length of full case folding of U+0000 */
3527
3528 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3529 if(U_SUCCESS(errorCode)) {
3530 int32_t i, start, end;
3531
3532 /* add a pseudo-last line to finish testing of the actual last one */
3533 fields[0][0]=lastLine;
3534 fields[0][1]=lastLine+6;
3535 fields[1][0]=lastLine+7;
3536 fields[1][1]=lastLine+9;
3537 fields[2][0]=lastLine+10;
3538 fields[2][1]=lastLine+17;
3539 caseFoldingLineFn(&data, fields, 3, &errorCode);
3540
3541 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3542 for(i=0;
3543 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3544 U_SUCCESS(errorCode);
3545 ++i
3546 ) {
3547 do {
3548 testFoldToSelf(start, CF_ALL);
3549 } while(++start<=end);
3550 }
3551 }
3552
3553 uset_close(data.notSeen);
3554 }
3555