1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: gencase.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
15 *
16 * This program reads several of the Unicode character database text files,
* parses them, and extracts the case mapping properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
21 */
22
23 #include <stdio.h>
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/uset.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "uarrsort.h"
32 #include "unewdata.h"
33 #include "uoptions.h"
34 #include "uparse.h"
35 #include "uprops.h"
36 #include "propsvec.h"
37 #include "gencase.h"
38
/* Number of elements of an array object; must not be applied to a pointer. */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
40
41 /* data --------------------------------------------------------------------- */
42
/*
 * Properties vectors: opened in main() with upvec_open() and
 * modified by binariesLineFn() via upvec_setValue().
 * Has external linkage (not static).
 */
UPropsVectors *pv;

/* Tool flags; main() overwrites both from the parsed --verbose and --copyright options. */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/*
 * Unicode set collecting the case-sensitive characters;
 * see uchar.h UCHAR_CASE_SENSITIVE.
 * Add code points from case mappings/foldings in
 * the root locale and with default options.
 */
static USet *caseSensitive;
54
55 /* prototypes --------------------------------------------------------------- */
56
/* Parses the SpecialCasing file named by filename into specialCasings[]. */
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);

/* Parses the CaseFolding file named by filename into caseFoldings[]. */
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);

/* Parses the UnicodeData file named by filename and calls setProps() for each line. */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
65
66 /* parse files with multiple binary properties ------------------------------ */
67
68 /* TODO: more common code, move functions to uparse.h|c */
69
70 /* TODO: similar to genprops/props2.c but not the same */
71
/*
 * Describes one binary property (or property value) name to look for in field 1
 * of a UCD file, and the arguments that binariesLineFn() hands to
 * upvec_setValue() for every code point range listed with that name.
 */
struct Binary {
    const char *propName;       /* name matched with isToken() against field 1 */
    int32_t vecWord;            /* passed through to upvec_setValue() (presumably the vector word index) */
    uint32_t vecValue, vecMask; /* value and mask passed to upvec_setValue(); a mask of 0 is rejected */
};
typedef struct Binary Binary;
78
/*
 * Describes one UCD file with binary properties:
 * the context object that parseBinariesFile() passes to binariesLineFn().
 */
struct Binaries {
    const char *ucdFile;        /* file basename without suffix and ".txt", see writeUCDFilename() */
    const Binary *binaries;     /* properties to extract from this file */
    int32_t binariesCount;      /* number of entries in binaries[] */
};
typedef struct Binaries Binaries;
85
/* Properties extracted from the "PropList" file. */
static const Binary
propListNames[]={
    { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK }
};

/* File descriptor passed by main() to parseBinariesFile(). */
static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};
95
/*
 * Properties extracted from the "DerivedCoreProperties" file.
 * Lowercase and Uppercase share UCASE_TYPE_MASK in vector word 0;
 * Case_Ignorable sets the same bit in vector word 1 as the Word_Break values below.
 */
static const Binary
derCorePropsNames[]={
    { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK },
    { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK },
    /* Unicode 5.2 adds Case_Ignorable as a public property. See comments in store.c. */
    { "Case_Ignorable", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

/* File descriptor passed by main() to parseBinariesFile(). */
static const Binaries
derCorePropsBinaries={
    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};
108
109 /*
110 * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
111 * We need not distinguish between them because both add to case-ignorable.
112 * We ignore all other Word_Break values.
113 */
/* Both Word_Break values set the same single bit in vector word 1. */
static const Binary
wordBreakNames[]={
    { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) },
    { "MidNumLet", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

/* File descriptor; main() parses this file only if ucdVersion>=UNI_4_1. */
static const Binaries
wordBreakBinaries={
    "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
};
124
125 static void U_CALLCONV
binariesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)126 binariesLineFn(void *context,
127 char *fields[][2], int32_t fieldCount,
128 UErrorCode *pErrorCode) {
129 const Binaries *bin;
130 char *s;
131 uint32_t start, end;
132 int32_t i;
133
134 bin=(const Binaries *)context;
135
136 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
137 if(U_FAILURE(*pErrorCode)) {
138 fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
139 exit(*pErrorCode);
140 }
141
142 /* parse binary property name */
143 s=(char *)u_skipWhitespace(fields[1][0]);
144 for(i=0;; ++i) {
145 if(i==bin->binariesCount) {
146 /* ignore unrecognized properties */
147 return;
148 }
149 if(isToken(bin->binaries[i].propName, s)) {
150 break;
151 }
152 }
153
154 if(bin->binaries[i].vecMask==0) {
155 fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
156 (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
157 exit(U_INTERNAL_PROGRAM_ERROR);
158 }
159
160 upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode);
161 if(U_FAILURE(*pErrorCode)) {
162 fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
163 bin->binaries[i].propName, u_errorName(*pErrorCode));
164 exit(*pErrorCode);
165 }
166 }
167
168 static void
parseBinariesFile(char * filename,char * basename,const char * suffix,const Binaries * bin,UErrorCode * pErrorCode)169 parseBinariesFile(char *filename, char *basename, const char *suffix,
170 const Binaries *bin,
171 UErrorCode *pErrorCode) {
172 char *fields[2][2];
173
174 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
175 return;
176 }
177
178 writeUCDFilename(basename, bin->ucdFile, suffix);
179
180 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
181 if(U_FAILURE(*pErrorCode)) {
182 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
183 }
184 }
185
186 /* -------------------------------------------------------------------------- */
187
/* Indexes into options[] below; main() uses them as options[NAME]. */
enum
{
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR,
    CSOURCE
};

/* Keep these values in sync with the above enums */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),     /* UNICODE_VERSION */
    UOPTION_ICUDATADIR,
    UOPTION_DEF("csource", 'C', UOPT_NO_ARG)            /* CSOURCE */
};
213
214 extern int
main(int argc,char * argv[])215 main(int argc, char* argv[]) {
216 char filename[300];
217 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
218 char *basename=NULL;
219 UErrorCode errorCode=U_ZERO_ERROR;
220
221 U_MAIN_INIT_ARGS(argc, argv);
222
223 /* preset then read command line options */
224 options[DESTDIR].value=u_getDataDirectory();
225 options[SOURCEDIR].value="";
226 options[UNICODE_VERSION].value="";
227 options[ICUDATADIR].value=u_getDataDirectory();
228 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
229
230 /* error handling, printing usage message */
231 if(argc<0) {
232 fprintf(stderr,
233 "error in command line argument \"%s\"\n",
234 argv[-argc]);
235 }
236 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
237 /*
238 * Broken into chunks because the C89 standard says the minimum
239 * required supported string length is 509 bytes.
240 */
241 fprintf(stderr,
242 "Usage: %s [-options] [suffix]\n"
243 "\n"
244 "read the UnicodeData.txt file and other Unicode properties files and\n"
245 "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
246 "\n",
247 argv[0]);
248 fprintf(stderr,
249 "Options:\n"
250 "\t-h or -? or --help this usage text\n"
251 "\t-v or --verbose verbose output\n"
252 "\t-c or --copyright include a copyright notice\n"
253 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
254 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
255 fprintf(stderr,
256 "\t-d or --destdir destination directory, followed by the path\n"
257 "\t-s or --sourcedir source directory, followed by the path\n"
258 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
259 "\t followed by path, defaults to %s\n"
260 "\tsuffix suffix that is to be appended with a '-'\n"
261 "\t to the source file basenames before opening;\n"
262 "\t 'gencase new' will read UnicodeData-new.txt etc.\n",
263 u_getDataDirectory());
264 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
265 }
266
267 /* get the options values */
268 beVerbose=options[VERBOSE].doesOccur;
269 haveCopyright=options[COPYRIGHT].doesOccur;
270 srcDir=options[SOURCEDIR].value;
271 destDir=options[DESTDIR].value;
272
273 if(argc>=2) {
274 suffix=argv[1];
275 } else {
276 suffix=NULL;
277 }
278
279 if(options[UNICODE_VERSION].doesOccur) {
280 setUnicodeVersion(options[UNICODE_VERSION].value);
281 }
282 /* else use the default dataVersion in store.c */
283
284 if (options[ICUDATADIR].doesOccur) {
285 u_setDataDirectory(options[ICUDATADIR].value);
286 }
287
288 /* prepare the filename beginning with the source dir */
289 uprv_strcpy(filename, srcDir);
290 basename=filename+uprv_strlen(filename);
291 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
292 *basename++=U_FILE_SEP_CHAR;
293 }
294
295 /* initialize */
296 pv=upvec_open(2, &errorCode);
297 caseSensitive=uset_open(1, 0); /* empty set (start>end) */
298
299 /* process SpecialCasing.txt */
300 writeUCDFilename(basename, "SpecialCasing", suffix);
301 parseSpecialCasing(filename, &errorCode);
302
303 /* process CaseFolding.txt */
304 writeUCDFilename(basename, "CaseFolding", suffix);
305 parseCaseFolding(filename, &errorCode);
306
307 /* process additional properties files */
308 *basename=0;
309
310 parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
311
312 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
313
314 if(ucdVersion>=UNI_4_1) {
315 parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
316 }
317
318 /* process UnicodeData.txt */
319 writeUCDFilename(basename, "UnicodeData", suffix);
320 parseDB(filename, &errorCode);
321
322 /* process parsed data */
323 makeCaseClosure();
324
325 makeExceptions();
326
327 if(U_SUCCESS(errorCode)) {
328 /* write the properties data file */
329 generateData(destDir, options[CSOURCE].doesOccur);
330 }
331
332 u_cleanup();
333 return errorCode;
334 }
335
336 U_CFUNC void
writeUCDFilename(char * basename,const char * filename,const char * suffix)337 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
338 int32_t length=(int32_t)uprv_strlen(filename);
339 uprv_strcpy(basename, filename);
340 if(suffix!=NULL) {
341 basename[length++]='-';
342 uprv_strcpy(basename+length, suffix);
343 length+=(int32_t)uprv_strlen(suffix);
344 }
345 uprv_strcpy(basename+length, ".txt");
346 }
347
348 /* TODO: move to toolutil */
349 U_CFUNC UBool
isToken(const char * token,const char * s)350 isToken(const char *token, const char *s) {
351 const char *z;
352 int32_t j;
353
354 s=u_skipWhitespace(s);
355 for(j=0;; ++j) {
356 if(token[j]!=0) {
357 if(s[j]!=token[j]) {
358 break;
359 }
360 } else {
361 z=u_skipWhitespace(s+j);
362 if(*z==';' || *z==0) {
363 return TRUE;
364 } else {
365 break;
366 }
367 }
368 }
369
370 return FALSE;
371 }
372
/*
 * Finds the first entry of tokens[0..countTokens[ that s starts with
 * (after optional leading white space) and that is followed, after optional
 * white space, by ';', '#', CR, LF or the end of the string.
 * NULL entries in tokens[] are skipped.
 * Returns the index of that entry, or -1 if there is none.
 */
static int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    int32_t index;

    s=u_skipWhitespace(s);
    for(index=0; index<countTokens; ++index) {
        const char *candidate=tokens[index];
        const char *text=s;

        if(candidate==NULL) {
            continue;
        }

        /* walk along both strings while they agree */
        while(*candidate!=0 && *text==*candidate) {
            ++candidate;
            ++text;
        }

        if(*candidate==0) {
            /* whole token matched; check what follows it */
            text=u_skipWhitespace(text);
            if(*text==';' || *text==0 || *text=='#' || *text=='\r' || *text=='\n') {
                return index;
            }
        }
    }
    return -1;
}
400
401 static void
_set_addAll(USet * set,const UChar * s,int32_t length)402 _set_addAll(USet *set, const UChar *s, int32_t length) {
403 UChar32 c;
404 int32_t i;
405
406 /* needs length>=0 */
407 for(i=0; i<length; /* U16_NEXT advances i */) {
408 U16_NEXT(s, i, length, c);
409 uset_add(set, c);
410 }
411 }
412
413 /* parser for SpecialCasing.txt --------------------------------------------- */
414
/*
 * Capacity of specialCasings[]; specialCasingLineFn() terminates the program
 * as soon as the number of stored entries reaches this value.
 */
#define MAX_SPECIAL_CASING_COUNT 500

/* Entries parsed from SpecialCasing.txt; sorted by code point in parseSpecialCasing(). */
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
/* Number of entries in specialCasings[] that are in use. */
static int32_t specialCasingCount=0;
419
420 static void U_CALLCONV
specialCasingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)421 specialCasingLineFn(void *context,
422 char *fields[][2], int32_t fieldCount,
423 UErrorCode *pErrorCode) {
424 char *end;
425
426 /* get code point */
427 specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
428 end=(char *)u_skipWhitespace(end);
429 if(end<=fields[0][0] || end!=fields[0][1]) {
430 fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
431 *pErrorCode=U_PARSE_ERROR;
432 exit(U_PARSE_ERROR);
433 }
434
435 /* is this a complex mapping? */
436 if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
437 /* there is some condition text in the fifth field */
438 specialCasings[specialCasingCount].isComplex=TRUE;
439
440 /* do not store any actual mappings for this */
441 specialCasings[specialCasingCount].lowerCase[0]=0;
442 specialCasings[specialCasingCount].upperCase[0]=0;
443 specialCasings[specialCasingCount].titleCase[0]=0;
444 } else {
445 /* just set the "complex" flag and get the case mappings */
446 specialCasings[specialCasingCount].isComplex=FALSE;
447 specialCasings[specialCasingCount].lowerCase[0]=
448 (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
449 specialCasings[specialCasingCount].upperCase[0]=
450 (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
451 specialCasings[specialCasingCount].titleCase[0]=
452 (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
453 if(U_FAILURE(*pErrorCode)) {
454 fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
455 exit(*pErrorCode);
456 }
457
458 uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
459 _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
460 _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
461 _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
462 }
463
464 if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
465 fprintf(stderr, "gencase: too many special casing mappings\n");
466 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
467 exit(U_INDEX_OUTOFBOUNDS_ERROR);
468 }
469 }
470
471 static int32_t U_CALLCONV
compareSpecialCasings(const void * context,const void * left,const void * right)472 compareSpecialCasings(const void *context, const void *left, const void *right) {
473 return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
474 }
475
476 static void
parseSpecialCasing(const char * filename,UErrorCode * pErrorCode)477 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
478 char *fields[5][2];
479 int32_t i, j;
480
481 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
482 return;
483 }
484
485 u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
486
487 /* sort the special casing entries by code point */
488 if(specialCasingCount>0) {
489 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
490 compareSpecialCasings, NULL, FALSE, pErrorCode);
491 }
492 if(U_FAILURE(*pErrorCode)) {
493 return;
494 }
495
496 /* replace multiple entries for any code point by one "complex" one */
497 j=0;
498 for(i=1; i<specialCasingCount; ++i) {
499 if(specialCasings[i-1].code==specialCasings[i].code) {
500 /* there is a duplicate code point */
501 specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */
502 specialCasings[i].isComplex=TRUE; /* make the following one complex */
503 specialCasings[i].lowerCase[0]=0;
504 specialCasings[i].upperCase[0]=0;
505 specialCasings[i].titleCase[0]=0;
506 ++j;
507 }
508 }
509
510 /* if some entries just were removed, then re-sort */
511 if(j>0) {
512 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
513 compareSpecialCasings, NULL, FALSE, pErrorCode);
514 specialCasingCount-=j;
515 }
516 if(U_FAILURE(*pErrorCode)) {
517 return;
518 }
519
520 /*
521 * Add one complex mapping to caseSensitive that was filtered out above:
522 * Greek final Sigma has a conditional mapping but not locale-sensitive,
523 * and it is taken when lowercasing just U+03A3 alone.
524 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
525 */
526 uset_add(caseSensitive, 0x3c2);
527 }
528
529 /* parser for CaseFolding.txt ----------------------------------------------- */
530
/*
 * Capacity of caseFoldings[]; caseFoldingLineFn() terminates the program
 * as soon as the number of stored entries reaches this value.
 */
#define MAX_CASE_FOLDING_COUNT 2000

/* Entries parsed from CaseFolding.txt, in ascending code point order. */
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
/* Number of entries in caseFoldings[] that are in use. */
static int32_t caseFoldingCount=0;
535
536 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)537 caseFoldingLineFn(void *context,
538 char *fields[][2], int32_t fieldCount,
539 UErrorCode *pErrorCode) {
540 char *end;
541 static UChar32 prevCode=0;
542 int32_t count;
543 char status;
544
545 /* get code point */
546 caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
547 end=(char *)u_skipWhitespace(end);
548 if(end<=fields[0][0] || end!=fields[0][1]) {
549 fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
550 *pErrorCode=U_PARSE_ERROR;
551 exit(U_PARSE_ERROR);
552 }
553
554 /* get the status of this mapping */
555 caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
556 if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
557 fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
558 *pErrorCode=U_PARSE_ERROR;
559 exit(U_PARSE_ERROR);
560 }
561
562 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
563 if(status=='L') {
564 return;
565 }
566
567 /* get the mapping */
568 count=caseFoldings[caseFoldingCount].full[0]=
569 (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
570 if(U_FAILURE(*pErrorCode)) {
571 fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
572 exit(*pErrorCode);
573 }
574
575 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
576 if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
577 caseFoldings[caseFoldingCount].simple=0;
578 }
579
580 /* update the case-sensitive set */
581 if(status!='T') {
582 uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
583 _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
584 }
585
586 /* check the status */
587 if(status=='S') {
588 /* check if there was a full mapping for this code point before */
589 if( caseFoldingCount>0 &&
590 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
591 caseFoldings[caseFoldingCount-1].status=='F'
592 ) {
593 /* merge the two entries */
594 caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
595 return;
596 }
597 } else if(status=='F') {
598 /* check if there was a simple mapping for this code point before */
599 if( caseFoldingCount>0 &&
600 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
601 caseFoldings[caseFoldingCount-1].status=='S'
602 ) {
603 /* merge the two entries */
604 uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
605 return;
606 }
607 } else if(status=='I' || status=='T') {
608 /* check if there was a default mapping for this code point before (remove it) */
609 while(caseFoldingCount>0 &&
610 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
611 ) {
612 prevCode=0;
613 --caseFoldingCount;
614 }
615 /* store only a marker for special handling for cases like dotless i */
616 caseFoldings[caseFoldingCount].simple=0;
617 caseFoldings[caseFoldingCount].full[0]=0;
618 }
619
620 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
621 if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
622 fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
623 (unsigned long)caseFoldings[caseFoldingCount].code,
624 (unsigned long)prevCode);
625 *pErrorCode=U_PARSE_ERROR;
626 exit(U_PARSE_ERROR);
627 }
628 prevCode=caseFoldings[caseFoldingCount].code;
629
630 if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
631 fprintf(stderr, "gencase: too many case folding mappings\n");
632 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
633 exit(U_INDEX_OUTOFBOUNDS_ERROR);
634 }
635 }
636
637 static void
parseCaseFolding(const char * filename,UErrorCode * pErrorCode)638 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
639 char *fields[3][2];
640
641 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
642 return;
643 }
644
645 u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
646 }
647
648 /* parser for UnicodeData.txt ----------------------------------------------- */
649
650 /* general categories */
/*
 * General category abbreviations as they appear in field 2 of UnicodeData.txt.
 * unicodeDataLineFn() stores the array index of the matching name as the
 * general category value (presumably the order follows the uchar.h
 * general category constants -- confirm when changing this table).
 */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    "Mc", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Co", "Cs",
    "Pd", "Ps", "Pe", "Pc", "Po",
    "Sm", "Sc", "Sk", "So",
    "Pi", "Pf"
};

/*
 * Read positions in specialCasings[] and caseFoldings[]:
 * unicodeDataLineFn() advances them as it meets the matching code points,
 * parseDB() checks afterwards that both arrays were consumed completely.
 */
static int32_t specialCasingIndex=0, caseFoldingIndex=0;
664
665 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)666 unicodeDataLineFn(void *context,
667 char *fields[][2], int32_t fieldCount,
668 UErrorCode *pErrorCode) {
669 Props p;
670 char *end;
671 static UChar32 prevCode=0;
672 UChar32 value;
673 int32_t i;
674
675 /* reset the properties */
676 uprv_memset(&p, 0, sizeof(Props));
677
678 /* get the character code, field 0 */
679 p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
680 if(end<=fields[0][0] || end!=fields[0][1]) {
681 fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
682 *pErrorCode=U_PARSE_ERROR;
683 exit(U_PARSE_ERROR);
684 }
685
686 /* get general category, field 2 */
687 i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
688 if(i>=0) {
689 p.gc=(uint8_t)i;
690 } else {
691 fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
692 fields[2][0], (unsigned long)p.code);
693 *pErrorCode=U_PARSE_ERROR;
694 exit(U_PARSE_ERROR);
695 }
696
697 /* get canonical combining class, field 3 */
698 value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
699 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
700 fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
701 *pErrorCode=U_PARSE_ERROR;
702 exit(U_PARSE_ERROR);
703 }
704 p.cc=(uint8_t)value;
705
706 /* get uppercase mapping, field 12 */
707 value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
708 if(end!=fields[12][1]) {
709 fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
710 (unsigned long)p.code);
711 *pErrorCode=U_PARSE_ERROR;
712 exit(U_PARSE_ERROR);
713 }
714 if(value!=0 && value!=p.code) {
715 p.upperCase=value;
716 uset_add(caseSensitive, p.code);
717 uset_add(caseSensitive, value);
718 }
719
720 /* get lowercase value, field 13 */
721 value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
722 if(end!=fields[13][1]) {
723 fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
724 (unsigned long)p.code);
725 *pErrorCode=U_PARSE_ERROR;
726 exit(U_PARSE_ERROR);
727 }
728 if(value!=0 && value!=p.code) {
729 p.lowerCase=value;
730 uset_add(caseSensitive, p.code);
731 uset_add(caseSensitive, value);
732 }
733
734 /* get titlecase value, field 14 */
735 value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
736 if(end!=fields[14][1]) {
737 fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
738 (unsigned long)p.code);
739 *pErrorCode=U_PARSE_ERROR;
740 exit(U_PARSE_ERROR);
741 }
742 if(value!=0 && value!=p.code) {
743 p.titleCase=value;
744 uset_add(caseSensitive, p.code);
745 uset_add(caseSensitive, value);
746 }
747
748 /* set additional properties from previously parsed files */
749 if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
750 p.specialCasing=specialCasings+specialCasingIndex++;
751 } else {
752 p.specialCasing=NULL;
753 }
754 if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
755 p.caseFolding=caseFoldings+caseFoldingIndex++;
756
757 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
758 if( p.caseFolding->status=='C' &&
759 p.caseFolding->simple==p.lowerCase
760 ) {
761 p.caseFolding=NULL;
762 }
763 } else {
764 p.caseFolding=NULL;
765 }
766
767 /* check for non-character code points */
768 if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
769 fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
770 (unsigned long)p.code);
771 *pErrorCode=U_PARSE_ERROR;
772 exit(U_PARSE_ERROR);
773 }
774
775 /* check that the code points (p.code) are in ascending order */
776 if(p.code<=prevCode && p.code>0) {
777 fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
778 (unsigned long)p.code, (unsigned long)prevCode);
779 *pErrorCode=U_PARSE_ERROR;
780 exit(U_PARSE_ERROR);
781 }
782
783 /* properties for a single code point */
784 setProps(&p);
785
786 prevCode=p.code;
787 }
788
789 static void
parseDB(const char * filename,UErrorCode * pErrorCode)790 parseDB(const char *filename, UErrorCode *pErrorCode) {
791 char *fields[15][2];
792 UChar32 start, end;
793 int32_t i;
794
795 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
796 return;
797 }
798
799 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
800
801 /* are all sub-properties consumed? */
802 if(specialCasingIndex<specialCasingCount) {
803 fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
804 *pErrorCode=U_PARSE_ERROR;
805 exit(U_PARSE_ERROR);
806 }
807 if(caseFoldingIndex<caseFoldingCount) {
808 fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
809 *pErrorCode=U_PARSE_ERROR;
810 exit(U_PARSE_ERROR);
811 }
812
813 if(U_FAILURE(*pErrorCode)) {
814 return;
815 }
816
817 for(i=0;
818 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
819 ++i
820 ) {
821 addCaseSensitive(start, end);
822 }
823 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
824 *pErrorCode=U_ZERO_ERROR;
825 }
826 }
827
828 /*
829 * Hey, Emacs, please set the following:
830 *
831 * Local Variables:
832 * indent-tabs-mode: nil
833 * End:
834 *
835 */
836