1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: gencase.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
15 *
16 * This program reads several of the Unicode character database text files,
* parses them, and extracts the case mapping properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
21 */
22
23 #include <stdio.h>
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/uset.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "uarrsort.h"
32 #include "unewdata.h"
33 #include "uoptions.h"
34 #include "uparse.h"
35 #include "uprops.h"
36 #include "propsvec.h"
37 #include "gencase.h"
38
/* Number of elements of a real array; must not be used on a pointer or on an array function parameter. */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
40
41 /* data --------------------------------------------------------------------- */
42
/*
 * Properties vectors structure: opened in main() with upvec_open()
 * and filled by binariesLineFn() via upvec_setValue().
 */
uint32_t *pv;

/* Command line flags; main() overwrites both from the --verbose and --copyright options. */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/*
 * Unicode set collecting the case-sensitive characters;
 * see uchar.h UCHAR_CASE_SENSITIVE.
 * Add code points from case mappings/foldings in
 * the root locale and with default options.
 */
static USet *caseSensitive;
54
55 /* prototypes --------------------------------------------------------------- */
56
/* Reads SpecialCasing.txt into specialCasings[]; does nothing if *pErrorCode already indicates a failure. */
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);

/* Reads CaseFolding.txt into caseFoldings[]; does nothing if *pErrorCode already indicates a failure. */
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);

/* Reads UnicodeData.txt and calls setProps() for each line; does nothing if *pErrorCode already indicates a failure. */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
65
66 /* parse files with multiple binary properties ------------------------------ */
67
68 /* TODO: more common code, move functions to uparse.h|c */
69
70 /* TODO: similar to genprops/props2.c but not the same */
71
/* Describes how one property name found in a UCD file is stored in the properties vectors (pv). */
struct Binary {
    /* name that binariesLineFn() matches against field 1 of a line using isToken() */
    const char *propName;
    /* passed to upvec_setValue() right after the code point range */
    int32_t vecWord;
    /* value and mask passed to upvec_setValue(); binariesLineFn() treats a mask of 0 as an internal error */
    uint32_t vecValue, vecMask;
};
typedef struct Binary Binary;
78
/* All the properties that are read from one UCD file; passed as the context to binariesLineFn(). */
struct Binaries {
    /* UCD file base name without suffix and ".txt"; see writeUCDFilename() */
    const char *ucdFile;
    /* array of property descriptions, and its length */
    const Binary *binaries;
    int32_t binariesCount;
};
typedef struct Binaries Binaries;
85
/* properties read from PropList.txt: Soft_Dotted is stored in vector word 0 under UCASE_DOT_MASK */
static const Binary
propListNames[]={
    { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK }
};

static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};
95
/*
 * properties read from DerivedCoreProperties.txt:
 * Lowercase and Uppercase share the UCASE_TYPE_MASK bits of vector word 0
 */
static const Binary
derCorePropsNames[]={
    { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK },
    { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK }
};

static const Binaries
derCorePropsBinaries={
    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};
106
/*
 * treat Word_Break=MidLetter as a binary property (we ignore all other Word_Break values);
 * it is stored as a single bit (UGENCASE_IS_MID_LETTER_SHIFT) in vector word 1;
 * main() parses this file only if ucdVersion>=UNI_4_1
 */
static const Binary
wordBreakNames[]={
    { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

static const Binaries
wordBreakBinaries={
    "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
};
117
118 static void U_CALLCONV
binariesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)119 binariesLineFn(void *context,
120 char *fields[][2], int32_t fieldCount,
121 UErrorCode *pErrorCode) {
122 const Binaries *bin;
123 char *s;
124 uint32_t start, limit;
125 int32_t i;
126
127 bin=(const Binaries *)context;
128
129 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
130 if(U_FAILURE(*pErrorCode)) {
131 fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
132 exit(*pErrorCode);
133 }
134 ++limit;
135
136 /* parse binary property name */
137 s=(char *)u_skipWhitespace(fields[1][0]);
138 for(i=0;; ++i) {
139 if(i==bin->binariesCount) {
140 /* ignore unrecognized properties */
141 return;
142 }
143 if(isToken(bin->binaries[i].propName, s)) {
144 break;
145 }
146 }
147
148 if(bin->binaries[i].vecMask==0) {
149 fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
150 (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
151 exit(U_INTERNAL_PROGRAM_ERROR);
152 }
153
154 if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) {
155 fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
156 bin->binaries[i].propName, u_errorName(*pErrorCode));
157 exit(*pErrorCode);
158 }
159 }
160
161 static void
parseBinariesFile(char * filename,char * basename,const char * suffix,const Binaries * bin,UErrorCode * pErrorCode)162 parseBinariesFile(char *filename, char *basename, const char *suffix,
163 const Binaries *bin,
164 UErrorCode *pErrorCode) {
165 char *fields[2][2];
166
167 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
168 return;
169 }
170
171 writeUCDFilename(basename, bin->ucdFile, suffix);
172
173 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
174 if(U_FAILURE(*pErrorCode)) {
175 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
176 }
177 }
178
179 /* -------------------------------------------------------------------------- */
180
/* Indexes into options[] below; main() uses them as options[DESTDIR] etc. */
enum
{
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR,
    CSOURCE
};

/* Keep these values in sync with the above enums */
/* (the n-th entry here is addressed by the n-th enum constant) */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
    UOPTION_ICUDATADIR,
    UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
};
206
207 extern int
main(int argc,char * argv[])208 main(int argc, char* argv[]) {
209 char filename[300];
210 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
211 char *basename=NULL;
212 UErrorCode errorCode=U_ZERO_ERROR;
213
214 U_MAIN_INIT_ARGS(argc, argv);
215
216 /* preset then read command line options */
217 options[DESTDIR].value=u_getDataDirectory();
218 options[SOURCEDIR].value="";
219 options[UNICODE_VERSION].value="";
220 options[ICUDATADIR].value=u_getDataDirectory();
221 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
222
223 /* error handling, printing usage message */
224 if(argc<0) {
225 fprintf(stderr,
226 "error in command line argument \"%s\"\n",
227 argv[-argc]);
228 }
229 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
230 /*
231 * Broken into chucks because the C89 standard says the minimum
232 * required supported string length is 509 bytes.
233 */
234 fprintf(stderr,
235 "Usage: %s [-options] [suffix]\n"
236 "\n"
237 "read the UnicodeData.txt file and other Unicode properties files and\n"
238 "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
239 "\n",
240 argv[0]);
241 fprintf(stderr,
242 "Options:\n"
243 "\t-h or -? or --help this usage text\n"
244 "\t-v or --verbose verbose output\n"
245 "\t-c or --copyright include a copyright notice\n"
246 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
247 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
248 fprintf(stderr,
249 "\t-d or --destdir destination directory, followed by the path\n"
250 "\t-s or --sourcedir source directory, followed by the path\n"
251 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
252 "\t followed by path, defaults to %s\n"
253 "\tsuffix suffix that is to be appended with a '-'\n"
254 "\t to the source file basenames before opening;\n"
255 "\t 'gencase new' will read UnicodeData-new.txt etc.\n",
256 u_getDataDirectory());
257 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
258 }
259
260 /* get the options values */
261 beVerbose=options[VERBOSE].doesOccur;
262 haveCopyright=options[COPYRIGHT].doesOccur;
263 srcDir=options[SOURCEDIR].value;
264 destDir=options[DESTDIR].value;
265
266 if(argc>=2) {
267 suffix=argv[1];
268 } else {
269 suffix=NULL;
270 }
271
272 if(options[UNICODE_VERSION].doesOccur) {
273 setUnicodeVersion(options[UNICODE_VERSION].value);
274 }
275 /* else use the default dataVersion in store.c */
276
277 if (options[ICUDATADIR].doesOccur) {
278 u_setDataDirectory(options[ICUDATADIR].value);
279 }
280
281 /* prepare the filename beginning with the source dir */
282 uprv_strcpy(filename, srcDir);
283 basename=filename+uprv_strlen(filename);
284 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
285 *basename++=U_FILE_SEP_CHAR;
286 }
287
288 /* initialize */
289 pv=upvec_open(2, 10000);
290 caseSensitive=uset_open(1, 0); /* empty set (start>end) */
291
292 /* process SpecialCasing.txt */
293 writeUCDFilename(basename, "SpecialCasing", suffix);
294 parseSpecialCasing(filename, &errorCode);
295
296 /* process CaseFolding.txt */
297 writeUCDFilename(basename, "CaseFolding", suffix);
298 parseCaseFolding(filename, &errorCode);
299
300 /* process additional properties files */
301 *basename=0;
302
303 parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
304
305 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
306
307 if(ucdVersion>=UNI_4_1) {
308 parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
309 }
310
311 /* process UnicodeData.txt */
312 writeUCDFilename(basename, "UnicodeData", suffix);
313 parseDB(filename, &errorCode);
314
315 /* process parsed data */
316 makeCaseClosure();
317
318 makeExceptions();
319
320 if(U_SUCCESS(errorCode)) {
321 /* write the properties data file */
322 generateData(destDir, options[CSOURCE].doesOccur);
323 }
324
325 u_cleanup();
326 return errorCode;
327 }
328
329 U_CFUNC void
writeUCDFilename(char * basename,const char * filename,const char * suffix)330 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
331 int32_t length=(int32_t)uprv_strlen(filename);
332 uprv_strcpy(basename, filename);
333 if(suffix!=NULL) {
334 basename[length++]='-';
335 uprv_strcpy(basename+length, suffix);
336 length+=(int32_t)uprv_strlen(suffix);
337 }
338 uprv_strcpy(basename+length, ".txt");
339 }
340
341 /* TODO: move to toolutil */
342 U_CFUNC UBool
isToken(const char * token,const char * s)343 isToken(const char *token, const char *s) {
344 const char *z;
345 int32_t j;
346
347 s=u_skipWhitespace(s);
348 for(j=0;; ++j) {
349 if(token[j]!=0) {
350 if(s[j]!=token[j]) {
351 break;
352 }
353 } else {
354 z=u_skipWhitespace(s+j);
355 if(*z==';' || *z==0) {
356 return TRUE;
357 } else {
358 break;
359 }
360 }
361 }
362
363 return FALSE;
364 }
365
/*
 * Returns the index of the first non-NULL entry of tokens[] that s starts
 * with (after leading white space) and that is followed, after optional
 * white space, by ';', '#', CR, LF or the end of the string.
 * Returns -1 if there is no such entry.
 */
static int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    int32_t idx;

    s=u_skipWhitespace(s);
    for(idx=0; idx<countTokens; ++idx) {
        const char *candidate=tokens[idx];
        const char *input=s;
        const char *after;

        if(candidate==NULL) {
            continue;
        }

        /* walk over the common prefix */
        while(*candidate!=0 && *input==*candidate) {
            ++candidate;
            ++input;
        }
        if(*candidate!=0) {
            /* mismatch before the end of this token */
            continue;
        }

        after=u_skipWhitespace(input);
        switch(*after) {
        case ';':
        case 0:
        case '#':
        case '\r':
        case '\n':
            return idx;
        default:
            break;
        }
    }
    return -1;
}
393
394 static void
_set_addAll(USet * set,const UChar * s,int32_t length)395 _set_addAll(USet *set, const UChar *s, int32_t length) {
396 UChar32 c;
397 int32_t i;
398
399 /* needs length>=0 */
400 for(i=0; i<length; /* U16_NEXT advances i */) {
401 U16_NEXT(s, i, length, c);
402 uset_add(set, c);
403 }
404 }
405
406 /* parser for SpecialCasing.txt --------------------------------------------- */
407
/* capacity of specialCasings[]; specialCasingLineFn() exits when the array becomes full */
#define MAX_SPECIAL_CASING_COUNT 500

/* entries parsed from SpecialCasing.txt, and the number of entries in use */
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
static int32_t specialCasingCount=0;
412
413 static void U_CALLCONV
specialCasingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)414 specialCasingLineFn(void *context,
415 char *fields[][2], int32_t fieldCount,
416 UErrorCode *pErrorCode) {
417 char *end;
418
419 /* get code point */
420 specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
421 end=(char *)u_skipWhitespace(end);
422 if(end<=fields[0][0] || end!=fields[0][1]) {
423 fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
424 *pErrorCode=U_PARSE_ERROR;
425 exit(U_PARSE_ERROR);
426 }
427
428 /* is this a complex mapping? */
429 if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
430 /* there is some condition text in the fifth field */
431 specialCasings[specialCasingCount].isComplex=TRUE;
432
433 /* do not store any actual mappings for this */
434 specialCasings[specialCasingCount].lowerCase[0]=0;
435 specialCasings[specialCasingCount].upperCase[0]=0;
436 specialCasings[specialCasingCount].titleCase[0]=0;
437 } else {
438 /* just set the "complex" flag and get the case mappings */
439 specialCasings[specialCasingCount].isComplex=FALSE;
440 specialCasings[specialCasingCount].lowerCase[0]=
441 (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
442 specialCasings[specialCasingCount].upperCase[0]=
443 (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
444 specialCasings[specialCasingCount].titleCase[0]=
445 (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
446 if(U_FAILURE(*pErrorCode)) {
447 fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
448 exit(*pErrorCode);
449 }
450
451 uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
452 _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
453 _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
454 _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
455 }
456
457 if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
458 fprintf(stderr, "gencase: too many special casing mappings\n");
459 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
460 exit(U_INDEX_OUTOFBOUNDS_ERROR);
461 }
462 }
463
464 static int32_t U_CALLCONV
compareSpecialCasings(const void * context,const void * left,const void * right)465 compareSpecialCasings(const void *context, const void *left, const void *right) {
466 return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
467 }
468
469 static void
parseSpecialCasing(const char * filename,UErrorCode * pErrorCode)470 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
471 char *fields[5][2];
472 int32_t i, j;
473
474 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
475 return;
476 }
477
478 u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
479
480 /* sort the special casing entries by code point */
481 if(specialCasingCount>0) {
482 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
483 compareSpecialCasings, NULL, FALSE, pErrorCode);
484 }
485 if(U_FAILURE(*pErrorCode)) {
486 return;
487 }
488
489 /* replace multiple entries for any code point by one "complex" one */
490 j=0;
491 for(i=1; i<specialCasingCount; ++i) {
492 if(specialCasings[i-1].code==specialCasings[i].code) {
493 /* there is a duplicate code point */
494 specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */
495 specialCasings[i].isComplex=TRUE; /* make the following one complex */
496 specialCasings[i].lowerCase[0]=0;
497 specialCasings[i].upperCase[0]=0;
498 specialCasings[i].titleCase[0]=0;
499 ++j;
500 }
501 }
502
503 /* if some entries just were removed, then re-sort */
504 if(j>0) {
505 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
506 compareSpecialCasings, NULL, FALSE, pErrorCode);
507 specialCasingCount-=j;
508 }
509 if(U_FAILURE(*pErrorCode)) {
510 return;
511 }
512
513 /*
514 * Add one complex mapping to caseSensitive that was filtered out above:
515 * Greek final Sigma has a conditional mapping but not locale-sensitive,
516 * and it is taken when lowercasing just U+03A3 alone.
517 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
518 */
519 uset_add(caseSensitive, 0x3c2);
520 }
521
522 /* parser for CaseFolding.txt ----------------------------------------------- */
523
/* capacity of caseFoldings[]; caseFoldingLineFn() exits when the array becomes full */
#define MAX_CASE_FOLDING_COUNT 2000

/* entries parsed from CaseFolding.txt, and the number of entries in use */
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
static int32_t caseFoldingCount=0;
528
/*
 * Line callback for u_parseDelimitedFile() on CaseFolding.txt.
 * Field 0 is the code point, field 1 the status letter, field 2 the mapping.
 * The line is first parsed into the next free slot caseFoldings[caseFoldingCount];
 * the slot is only kept (count incremented) if the entry is neither
 * ignored (status 'L') nor merged into the preceding entry ('S' after 'F', 'F' after 'S').
 * An 'I' or 'T' entry replaces preceding entries for the same code point
 * with a marker that has no mappings.
 * Kept entries must be in ascending code point order.
 * Any error terminates the program via exit().
 */
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    /* code point of the last kept entry, for the ordering check; static: persists across lines */
    static UChar32 prevCode=0;
    int32_t count;
    char status;

    /* get code point */
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        /* the slot is not kept: the count is not incremented, so the next line overwrites it */
        return;
    }

    /* get the mapping */
    /* full[0] receives the string length, full+1 the string; .simple receives the last u_parseString() output argument */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries */
            /* (the preceding entry keeps its status 'F'; the current slot is not kept) */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries */
            /* (copies 32 UChars starting at full[0]; the preceding entry keeps its status 'S') */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            /* reset so that the ordering check below accepts the same code point again */
            prevCode=0;
            --caseFoldingCount;
        }
        /* store only a marker for special handling for cases like dotless i */
        /*
         * NOTE(review): if entries were removed above, caseFoldingCount now indexes
         * the earliest of them; only .simple and .full[0] are reset here, so that
         * slot keeps its earlier .status rather than 'I'/'T' - confirm this is intended.
         */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    /* keep the entry; stop as soon as there is no free slot left for the next line */
    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
629
630 static void
parseCaseFolding(const char * filename,UErrorCode * pErrorCode)631 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
632 char *fields[3][2];
633
634 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
635 return;
636 }
637
638 u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
639 }
640
641 /* parser for UnicodeData.txt ----------------------------------------------- */
642
/* general categories */
/*
 * Two-letter names as looked up in field 2 of UnicodeData.txt;
 * unicodeDataLineFn() stores the index of the matching name in Props.gc.
 * NOTE(review): the order presumably has to match the UCharCategory
 * constants in unicode/uchar.h - confirm before reordering.
 */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    "Mc", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Co", "Cs",
    "Pd", "Ps", "Pe", "Pc", "Po",
    "Sm", "Sc", "Sk", "So",
    "Pi", "Pf"
};
655
/*
 * Indexes of the next not yet consumed entries in specialCasings[] and caseFoldings[];
 * unicodeDataLineFn() advances them when a UnicodeData.txt line has the same code point.
 */
static int32_t specialCasingIndex=0, caseFoldingIndex=0;
657
658 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)659 unicodeDataLineFn(void *context,
660 char *fields[][2], int32_t fieldCount,
661 UErrorCode *pErrorCode) {
662 Props p;
663 char *end;
664 static UChar32 prevCode=0;
665 UChar32 value;
666 int32_t i;
667
668 /* reset the properties */
669 uprv_memset(&p, 0, sizeof(Props));
670
671 /* get the character code, field 0 */
672 p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
673 if(end<=fields[0][0] || end!=fields[0][1]) {
674 fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
675 *pErrorCode=U_PARSE_ERROR;
676 exit(U_PARSE_ERROR);
677 }
678
679 /* get general category, field 2 */
680 i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
681 if(i>=0) {
682 p.gc=(uint8_t)i;
683 } else {
684 fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
685 fields[2][0], (unsigned long)p.code);
686 *pErrorCode=U_PARSE_ERROR;
687 exit(U_PARSE_ERROR);
688 }
689
690 /* get canonical combining class, field 3 */
691 value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
692 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
693 fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
694 *pErrorCode=U_PARSE_ERROR;
695 exit(U_PARSE_ERROR);
696 }
697 p.cc=(uint8_t)value;
698
699 /* get uppercase mapping, field 12 */
700 value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
701 if(end!=fields[12][1]) {
702 fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
703 (unsigned long)p.code);
704 *pErrorCode=U_PARSE_ERROR;
705 exit(U_PARSE_ERROR);
706 }
707 if(value!=0 && value!=p.code) {
708 p.upperCase=value;
709 uset_add(caseSensitive, p.code);
710 uset_add(caseSensitive, value);
711 }
712
713 /* get lowercase value, field 13 */
714 value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
715 if(end!=fields[13][1]) {
716 fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
717 (unsigned long)p.code);
718 *pErrorCode=U_PARSE_ERROR;
719 exit(U_PARSE_ERROR);
720 }
721 if(value!=0 && value!=p.code) {
722 p.lowerCase=value;
723 uset_add(caseSensitive, p.code);
724 uset_add(caseSensitive, value);
725 }
726
727 /* get titlecase value, field 14 */
728 value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
729 if(end!=fields[14][1]) {
730 fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
731 (unsigned long)p.code);
732 *pErrorCode=U_PARSE_ERROR;
733 exit(U_PARSE_ERROR);
734 }
735 if(value!=0 && value!=p.code) {
736 p.titleCase=value;
737 uset_add(caseSensitive, p.code);
738 uset_add(caseSensitive, value);
739 }
740
741 /* set additional properties from previously parsed files */
742 if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
743 p.specialCasing=specialCasings+specialCasingIndex++;
744 } else {
745 p.specialCasing=NULL;
746 }
747 if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
748 p.caseFolding=caseFoldings+caseFoldingIndex++;
749
750 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
751 if( p.caseFolding->status=='C' &&
752 p.caseFolding->simple==p.lowerCase
753 ) {
754 p.caseFolding=NULL;
755 }
756 } else {
757 p.caseFolding=NULL;
758 }
759
760 /* check for non-character code points */
761 if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
762 fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
763 (unsigned long)p.code);
764 *pErrorCode=U_PARSE_ERROR;
765 exit(U_PARSE_ERROR);
766 }
767
768 /* check that the code points (p.code) are in ascending order */
769 if(p.code<=prevCode && p.code>0) {
770 fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
771 (unsigned long)p.code, (unsigned long)prevCode);
772 *pErrorCode=U_PARSE_ERROR;
773 exit(U_PARSE_ERROR);
774 }
775
776 /* properties for a single code point */
777 setProps(&p);
778
779 prevCode=p.code;
780 }
781
782 static void
parseDB(const char * filename,UErrorCode * pErrorCode)783 parseDB(const char *filename, UErrorCode *pErrorCode) {
784 char *fields[15][2];
785 UChar32 start, end;
786 int32_t i;
787
788 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
789 return;
790 }
791
792 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
793
794 /* are all sub-properties consumed? */
795 if(specialCasingIndex<specialCasingCount) {
796 fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
797 *pErrorCode=U_PARSE_ERROR;
798 exit(U_PARSE_ERROR);
799 }
800 if(caseFoldingIndex<caseFoldingCount) {
801 fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
802 *pErrorCode=U_PARSE_ERROR;
803 exit(U_PARSE_ERROR);
804 }
805
806 if(U_FAILURE(*pErrorCode)) {
807 return;
808 }
809
810 for(i=0;
811 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
812 ++i
813 ) {
814 addCaseSensitive(start, end);
815 }
816 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
817 *pErrorCode=U_ZERO_ERROR;
818 }
819 }
820
821 /*
822 * Hey, Emacs, please set the following:
823 *
824 * Local Variables:
825 * indent-tabs-mode: nil
826 * End:
827 *
828 */
829