1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: gencase.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004aug28
14 * created by: Markus W. Scherer
15 *
16 * This program reads several of the Unicode character database text files,
17 * parses them, and extracts the case mapping properties for each character.
18 * It then writes a binary file containing the properties
19 * that is designed to be used directly for random-access to
20 * the properties of each Unicode character.
21 */
22
23 #include <stdio.h>
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/uset.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "uarrsort.h"
32 #include "unewdata.h"
33 #include "uoptions.h"
34 #include "uparse.h"
35 #include "uprops.h"
36 #include "propsvec.h"
37 #include "gencase.h"
38
39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
40
41 /* data --------------------------------------------------------------------- */
42
43 UPropsVectors *pv;
44
45 UBool beVerbose=FALSE, haveCopyright=TRUE;
46
47 /*
48 * Unicode set collecting the case-sensitive characters;
49 * see uchar.h UCHAR_CASE_SENSITIVE.
50 * Add code points from case mappings/foldings in
51 * the root locale and with default options.
52 */
53 static USet *caseSensitive;
54
55 /* prototypes --------------------------------------------------------------- */
56
57 static void
58 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
59
60 static void
61 parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
62
63 static void
64 parseDB(const char *filename, UErrorCode *pErrorCode);
65
66 /* parse files with multiple binary properties ------------------------------ */
67
68 /* TODO: more common code, move functions to uparse.h|c */
69
70 /* TODO: similar to genprops/props2.c but not the same */
71
/*
 * One binary property that is looked for in a UCD file:
 * the property name as it is spelled in the file, plus the
 * column, value and mask arguments for upvec_setValue().
 */
typedef struct Binary {
    const char *propName;
    int32_t vecWord;
    uint32_t vecValue;
    uint32_t vecMask;
} Binary;

/*
 * All binary properties that are read from one UCD file:
 * the file basename (without suffix and ".txt") and the property list.
 */
typedef struct Binaries {
    const char *ucdFile;
    const Binary *binaries;
    int32_t binariesCount;
} Binaries;
85
86 static const Binary
87 propListNames[]={
88 { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK }
89 };
90
91 static const Binaries
92 propListBinaries={
93 "PropList", propListNames, LENGTHOF(propListNames)
94 };
95
96 static const Binary
97 derCorePropsNames[]={
98 { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK },
99 { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK }
100 };
101
102 static const Binaries
103 derCorePropsBinaries={
104 "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
105 };
106
107 /*
108 * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
109 * We need not distinguish between them because both add to case-ignorable.
110 * We ignore all other Word_Break values.
111 */
112 static const Binary
113 wordBreakNames[]={
114 { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) },
115 { "MidNumLet", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
116 };
117
118 static const Binaries
119 wordBreakBinaries={
120 "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
121 };
122
123 static void U_CALLCONV
binariesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)124 binariesLineFn(void *context,
125 char *fields[][2], int32_t fieldCount,
126 UErrorCode *pErrorCode) {
127 const Binaries *bin;
128 char *s;
129 uint32_t start, end;
130 int32_t i;
131
132 bin=(const Binaries *)context;
133
134 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
135 if(U_FAILURE(*pErrorCode)) {
136 fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
137 exit(*pErrorCode);
138 }
139
140 /* parse binary property name */
141 s=(char *)u_skipWhitespace(fields[1][0]);
142 for(i=0;; ++i) {
143 if(i==bin->binariesCount) {
144 /* ignore unrecognized properties */
145 return;
146 }
147 if(isToken(bin->binaries[i].propName, s)) {
148 break;
149 }
150 }
151
152 if(bin->binaries[i].vecMask==0) {
153 fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
154 (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
155 exit(U_INTERNAL_PROGRAM_ERROR);
156 }
157
158 upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode);
159 if(U_FAILURE(*pErrorCode)) {
160 fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
161 bin->binaries[i].propName, u_errorName(*pErrorCode));
162 exit(*pErrorCode);
163 }
164 }
165
166 static void
parseBinariesFile(char * filename,char * basename,const char * suffix,const Binaries * bin,UErrorCode * pErrorCode)167 parseBinariesFile(char *filename, char *basename, const char *suffix,
168 const Binaries *bin,
169 UErrorCode *pErrorCode) {
170 char *fields[2][2];
171
172 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
173 return;
174 }
175
176 writeUCDFilename(basename, bin->ucdFile, suffix);
177
178 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
179 if(U_FAILURE(*pErrorCode)) {
180 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
181 }
182 }
183
184 /* -------------------------------------------------------------------------- */
185
186 enum
187 {
188 HELP_H,
189 HELP_QUESTION_MARK,
190 VERBOSE,
191 COPYRIGHT,
192 DESTDIR,
193 SOURCEDIR,
194 UNICODE_VERSION,
195 ICUDATADIR,
196 CSOURCE
197 };
198
199 /* Keep these values in sync with the above enums */
200 static UOption options[]={
201 UOPTION_HELP_H,
202 UOPTION_HELP_QUESTION_MARK,
203 UOPTION_VERBOSE,
204 UOPTION_COPYRIGHT,
205 UOPTION_DESTDIR,
206 UOPTION_SOURCEDIR,
207 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
208 UOPTION_ICUDATADIR,
209 UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
210 };
211
212 extern int
main(int argc,char * argv[])213 main(int argc, char* argv[]) {
214 char filename[300];
215 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
216 char *basename=NULL;
217 UErrorCode errorCode=U_ZERO_ERROR;
218
219 U_MAIN_INIT_ARGS(argc, argv);
220
221 /* preset then read command line options */
222 options[DESTDIR].value=u_getDataDirectory();
223 options[SOURCEDIR].value="";
224 options[UNICODE_VERSION].value="";
225 options[ICUDATADIR].value=u_getDataDirectory();
226 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
227
228 /* error handling, printing usage message */
229 if(argc<0) {
230 fprintf(stderr,
231 "error in command line argument \"%s\"\n",
232 argv[-argc]);
233 }
234 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
235 /*
236 * Broken into chucks because the C89 standard says the minimum
237 * required supported string length is 509 bytes.
238 */
239 fprintf(stderr,
240 "Usage: %s [-options] [suffix]\n"
241 "\n"
242 "read the UnicodeData.txt file and other Unicode properties files and\n"
243 "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
244 "\n",
245 argv[0]);
246 fprintf(stderr,
247 "Options:\n"
248 "\t-h or -? or --help this usage text\n"
249 "\t-v or --verbose verbose output\n"
250 "\t-c or --copyright include a copyright notice\n"
251 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
252 "\t-C or --csource generate a .c source file rather than the .icu binary\n");
253 fprintf(stderr,
254 "\t-d or --destdir destination directory, followed by the path\n"
255 "\t-s or --sourcedir source directory, followed by the path\n"
256 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
257 "\t followed by path, defaults to %s\n"
258 "\tsuffix suffix that is to be appended with a '-'\n"
259 "\t to the source file basenames before opening;\n"
260 "\t 'gencase new' will read UnicodeData-new.txt etc.\n",
261 u_getDataDirectory());
262 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
263 }
264
265 /* get the options values */
266 beVerbose=options[VERBOSE].doesOccur;
267 haveCopyright=options[COPYRIGHT].doesOccur;
268 srcDir=options[SOURCEDIR].value;
269 destDir=options[DESTDIR].value;
270
271 if(argc>=2) {
272 suffix=argv[1];
273 } else {
274 suffix=NULL;
275 }
276
277 if(options[UNICODE_VERSION].doesOccur) {
278 setUnicodeVersion(options[UNICODE_VERSION].value);
279 }
280 /* else use the default dataVersion in store.c */
281
282 if (options[ICUDATADIR].doesOccur) {
283 u_setDataDirectory(options[ICUDATADIR].value);
284 }
285
286 /* prepare the filename beginning with the source dir */
287 uprv_strcpy(filename, srcDir);
288 basename=filename+uprv_strlen(filename);
289 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
290 *basename++=U_FILE_SEP_CHAR;
291 }
292
293 /* initialize */
294 pv=upvec_open(2, &errorCode);
295 caseSensitive=uset_open(1, 0); /* empty set (start>end) */
296
297 /* process SpecialCasing.txt */
298 writeUCDFilename(basename, "SpecialCasing", suffix);
299 parseSpecialCasing(filename, &errorCode);
300
301 /* process CaseFolding.txt */
302 writeUCDFilename(basename, "CaseFolding", suffix);
303 parseCaseFolding(filename, &errorCode);
304
305 /* process additional properties files */
306 *basename=0;
307
308 parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
309
310 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
311
312 if(ucdVersion>=UNI_4_1) {
313 parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
314 }
315
316 /* process UnicodeData.txt */
317 writeUCDFilename(basename, "UnicodeData", suffix);
318 parseDB(filename, &errorCode);
319
320 /* process parsed data */
321 makeCaseClosure();
322
323 makeExceptions();
324
325 if(U_SUCCESS(errorCode)) {
326 /* write the properties data file */
327 generateData(destDir, options[CSOURCE].doesOccur);
328 }
329
330 u_cleanup();
331 return errorCode;
332 }
333
334 U_CFUNC void
writeUCDFilename(char * basename,const char * filename,const char * suffix)335 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
336 int32_t length=(int32_t)uprv_strlen(filename);
337 uprv_strcpy(basename, filename);
338 if(suffix!=NULL) {
339 basename[length++]='-';
340 uprv_strcpy(basename+length, suffix);
341 length+=(int32_t)uprv_strlen(suffix);
342 }
343 uprv_strcpy(basename+length, ".txt");
344 }
345
346 /* TODO: move to toolutil */
347 U_CFUNC UBool
isToken(const char * token,const char * s)348 isToken(const char *token, const char *s) {
349 const char *z;
350 int32_t j;
351
352 s=u_skipWhitespace(s);
353 for(j=0;; ++j) {
354 if(token[j]!=0) {
355 if(s[j]!=token[j]) {
356 break;
357 }
358 } else {
359 z=u_skipWhitespace(s+j);
360 if(*z==';' || *z==0) {
361 return TRUE;
362 } else {
363 break;
364 }
365 }
366 }
367
368 return FALSE;
369 }
370
/*
 * Finds the first of the countTokens strings in tokens[] that s starts with,
 * after leading white space in s. NULL entries in tokens[] are skipped.
 * A token only matches if it is followed in s, after optional white space,
 * by one of ';', '#', CR, LF or the end of the string.
 * Returns the index of the matching token, or -1 if there is none.
 */
static int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    const char *token, *p, *rest;
    int32_t index;

    s=u_skipWhitespace(s);
    for(index=0; index<countTokens; ++index) {
        token=tokens[index];
        if(token==NULL) {
            continue;
        }

        /* walk along the token and the input while they agree */
        p=s;
        while(*token!=0 && *p==*token) {
            ++token;
            ++p;
        }
        if(*token!=0) {
            /* mismatch before the end of the token */
            continue;
        }

        rest=u_skipWhitespace(p);
        if(*rest==';' || *rest==0 || *rest=='#' || *rest=='\r' || *rest=='\n') {
            return index;
        }
    }
    return -1;
}
398
399 static void
_set_addAll(USet * set,const UChar * s,int32_t length)400 _set_addAll(USet *set, const UChar *s, int32_t length) {
401 UChar32 c;
402 int32_t i;
403
404 /* needs length>=0 */
405 for(i=0; i<length; /* U16_NEXT advances i */) {
406 U16_NEXT(s, i, length, c);
407 uset_add(set, c);
408 }
409 }
410
411 /* parser for SpecialCasing.txt --------------------------------------------- */
412
413 #define MAX_SPECIAL_CASING_COUNT 500
414
415 static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
416 static int32_t specialCasingCount=0;
417
418 static void U_CALLCONV
specialCasingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)419 specialCasingLineFn(void *context,
420 char *fields[][2], int32_t fieldCount,
421 UErrorCode *pErrorCode) {
422 char *end;
423
424 /* get code point */
425 specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
426 end=(char *)u_skipWhitespace(end);
427 if(end<=fields[0][0] || end!=fields[0][1]) {
428 fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
429 *pErrorCode=U_PARSE_ERROR;
430 exit(U_PARSE_ERROR);
431 }
432
433 /* is this a complex mapping? */
434 if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
435 /* there is some condition text in the fifth field */
436 specialCasings[specialCasingCount].isComplex=TRUE;
437
438 /* do not store any actual mappings for this */
439 specialCasings[specialCasingCount].lowerCase[0]=0;
440 specialCasings[specialCasingCount].upperCase[0]=0;
441 specialCasings[specialCasingCount].titleCase[0]=0;
442 } else {
443 /* just set the "complex" flag and get the case mappings */
444 specialCasings[specialCasingCount].isComplex=FALSE;
445 specialCasings[specialCasingCount].lowerCase[0]=
446 (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
447 specialCasings[specialCasingCount].upperCase[0]=
448 (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
449 specialCasings[specialCasingCount].titleCase[0]=
450 (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
451 if(U_FAILURE(*pErrorCode)) {
452 fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
453 exit(*pErrorCode);
454 }
455
456 uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
457 _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
458 _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
459 _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
460 }
461
462 if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
463 fprintf(stderr, "gencase: too many special casing mappings\n");
464 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
465 exit(U_INDEX_OUTOFBOUNDS_ERROR);
466 }
467 }
468
469 static int32_t U_CALLCONV
compareSpecialCasings(const void * context,const void * left,const void * right)470 compareSpecialCasings(const void *context, const void *left, const void *right) {
471 return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
472 }
473
474 static void
parseSpecialCasing(const char * filename,UErrorCode * pErrorCode)475 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
476 char *fields[5][2];
477 int32_t i, j;
478
479 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
480 return;
481 }
482
483 u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
484
485 /* sort the special casing entries by code point */
486 if(specialCasingCount>0) {
487 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
488 compareSpecialCasings, NULL, FALSE, pErrorCode);
489 }
490 if(U_FAILURE(*pErrorCode)) {
491 return;
492 }
493
494 /* replace multiple entries for any code point by one "complex" one */
495 j=0;
496 for(i=1; i<specialCasingCount; ++i) {
497 if(specialCasings[i-1].code==specialCasings[i].code) {
498 /* there is a duplicate code point */
499 specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */
500 specialCasings[i].isComplex=TRUE; /* make the following one complex */
501 specialCasings[i].lowerCase[0]=0;
502 specialCasings[i].upperCase[0]=0;
503 specialCasings[i].titleCase[0]=0;
504 ++j;
505 }
506 }
507
508 /* if some entries just were removed, then re-sort */
509 if(j>0) {
510 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
511 compareSpecialCasings, NULL, FALSE, pErrorCode);
512 specialCasingCount-=j;
513 }
514 if(U_FAILURE(*pErrorCode)) {
515 return;
516 }
517
518 /*
519 * Add one complex mapping to caseSensitive that was filtered out above:
520 * Greek final Sigma has a conditional mapping but not locale-sensitive,
521 * and it is taken when lowercasing just U+03A3 alone.
522 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
523 */
524 uset_add(caseSensitive, 0x3c2);
525 }
526
527 /* parser for CaseFolding.txt ----------------------------------------------- */
528
529 #define MAX_CASE_FOLDING_COUNT 2000
530
531 static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
532 static int32_t caseFoldingCount=0;
533
534 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)535 caseFoldingLineFn(void *context,
536 char *fields[][2], int32_t fieldCount,
537 UErrorCode *pErrorCode) {
538 char *end;
539 static UChar32 prevCode=0;
540 int32_t count;
541 char status;
542
543 /* get code point */
544 caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
545 end=(char *)u_skipWhitespace(end);
546 if(end<=fields[0][0] || end!=fields[0][1]) {
547 fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
548 *pErrorCode=U_PARSE_ERROR;
549 exit(U_PARSE_ERROR);
550 }
551
552 /* get the status of this mapping */
553 caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
554 if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
555 fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
556 *pErrorCode=U_PARSE_ERROR;
557 exit(U_PARSE_ERROR);
558 }
559
560 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
561 if(status=='L') {
562 return;
563 }
564
565 /* get the mapping */
566 count=caseFoldings[caseFoldingCount].full[0]=
567 (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
568 if(U_FAILURE(*pErrorCode)) {
569 fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
570 exit(*pErrorCode);
571 }
572
573 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
574 if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
575 caseFoldings[caseFoldingCount].simple=0;
576 }
577
578 /* update the case-sensitive set */
579 if(status!='T') {
580 uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
581 _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
582 }
583
584 /* check the status */
585 if(status=='S') {
586 /* check if there was a full mapping for this code point before */
587 if( caseFoldingCount>0 &&
588 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
589 caseFoldings[caseFoldingCount-1].status=='F'
590 ) {
591 /* merge the two entries */
592 caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
593 return;
594 }
595 } else if(status=='F') {
596 /* check if there was a simple mapping for this code point before */
597 if( caseFoldingCount>0 &&
598 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
599 caseFoldings[caseFoldingCount-1].status=='S'
600 ) {
601 /* merge the two entries */
602 uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
603 return;
604 }
605 } else if(status=='I' || status=='T') {
606 /* check if there was a default mapping for this code point before (remove it) */
607 while(caseFoldingCount>0 &&
608 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
609 ) {
610 prevCode=0;
611 --caseFoldingCount;
612 }
613 /* store only a marker for special handling for cases like dotless i */
614 caseFoldings[caseFoldingCount].simple=0;
615 caseFoldings[caseFoldingCount].full[0]=0;
616 }
617
618 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
619 if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
620 fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
621 (unsigned long)caseFoldings[caseFoldingCount].code,
622 (unsigned long)prevCode);
623 *pErrorCode=U_PARSE_ERROR;
624 exit(U_PARSE_ERROR);
625 }
626 prevCode=caseFoldings[caseFoldingCount].code;
627
628 if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
629 fprintf(stderr, "gencase: too many case folding mappings\n");
630 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
631 exit(U_INDEX_OUTOFBOUNDS_ERROR);
632 }
633 }
634
635 static void
parseCaseFolding(const char * filename,UErrorCode * pErrorCode)636 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
637 char *fields[3][2];
638
639 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
640 return;
641 }
642
643 u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
644 }
645
646 /* parser for UnicodeData.txt ----------------------------------------------- */
647
648 /* general categories */
649 const char *const
650 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
651 "Cn",
652 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
653 "Mc", "Nd", "Nl", "No",
654 "Zs", "Zl", "Zp",
655 "Cc", "Cf", "Co", "Cs",
656 "Pd", "Ps", "Pe", "Pc", "Po",
657 "Sm", "Sc", "Sk", "So",
658 "Pi", "Pf"
659 };
660
/* index of the next specialCasings[] entry to be matched by unicodeDataLineFn() */
static int32_t specialCasingIndex=0;

/* index of the next caseFoldings[] entry to be matched by unicodeDataLineFn() */
static int32_t caseFoldingIndex=0;
662
663 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)664 unicodeDataLineFn(void *context,
665 char *fields[][2], int32_t fieldCount,
666 UErrorCode *pErrorCode) {
667 Props p;
668 char *end;
669 static UChar32 prevCode=0;
670 UChar32 value;
671 int32_t i;
672
673 /* reset the properties */
674 uprv_memset(&p, 0, sizeof(Props));
675
676 /* get the character code, field 0 */
677 p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
678 if(end<=fields[0][0] || end!=fields[0][1]) {
679 fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
680 *pErrorCode=U_PARSE_ERROR;
681 exit(U_PARSE_ERROR);
682 }
683
684 /* get general category, field 2 */
685 i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
686 if(i>=0) {
687 p.gc=(uint8_t)i;
688 } else {
689 fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
690 fields[2][0], (unsigned long)p.code);
691 *pErrorCode=U_PARSE_ERROR;
692 exit(U_PARSE_ERROR);
693 }
694
695 /* get canonical combining class, field 3 */
696 value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
697 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
698 fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
699 *pErrorCode=U_PARSE_ERROR;
700 exit(U_PARSE_ERROR);
701 }
702 p.cc=(uint8_t)value;
703
704 /* get uppercase mapping, field 12 */
705 value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
706 if(end!=fields[12][1]) {
707 fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
708 (unsigned long)p.code);
709 *pErrorCode=U_PARSE_ERROR;
710 exit(U_PARSE_ERROR);
711 }
712 if(value!=0 && value!=p.code) {
713 p.upperCase=value;
714 uset_add(caseSensitive, p.code);
715 uset_add(caseSensitive, value);
716 }
717
718 /* get lowercase value, field 13 */
719 value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
720 if(end!=fields[13][1]) {
721 fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
722 (unsigned long)p.code);
723 *pErrorCode=U_PARSE_ERROR;
724 exit(U_PARSE_ERROR);
725 }
726 if(value!=0 && value!=p.code) {
727 p.lowerCase=value;
728 uset_add(caseSensitive, p.code);
729 uset_add(caseSensitive, value);
730 }
731
732 /* get titlecase value, field 14 */
733 value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
734 if(end!=fields[14][1]) {
735 fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
736 (unsigned long)p.code);
737 *pErrorCode=U_PARSE_ERROR;
738 exit(U_PARSE_ERROR);
739 }
740 if(value!=0 && value!=p.code) {
741 p.titleCase=value;
742 uset_add(caseSensitive, p.code);
743 uset_add(caseSensitive, value);
744 }
745
746 /* set additional properties from previously parsed files */
747 if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
748 p.specialCasing=specialCasings+specialCasingIndex++;
749 } else {
750 p.specialCasing=NULL;
751 }
752 if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
753 p.caseFolding=caseFoldings+caseFoldingIndex++;
754
755 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
756 if( p.caseFolding->status=='C' &&
757 p.caseFolding->simple==p.lowerCase
758 ) {
759 p.caseFolding=NULL;
760 }
761 } else {
762 p.caseFolding=NULL;
763 }
764
765 /* check for non-character code points */
766 if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
767 fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
768 (unsigned long)p.code);
769 *pErrorCode=U_PARSE_ERROR;
770 exit(U_PARSE_ERROR);
771 }
772
773 /* check that the code points (p.code) are in ascending order */
774 if(p.code<=prevCode && p.code>0) {
775 fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
776 (unsigned long)p.code, (unsigned long)prevCode);
777 *pErrorCode=U_PARSE_ERROR;
778 exit(U_PARSE_ERROR);
779 }
780
781 /* properties for a single code point */
782 setProps(&p);
783
784 prevCode=p.code;
785 }
786
787 static void
parseDB(const char * filename,UErrorCode * pErrorCode)788 parseDB(const char *filename, UErrorCode *pErrorCode) {
789 char *fields[15][2];
790 UChar32 start, end;
791 int32_t i;
792
793 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
794 return;
795 }
796
797 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
798
799 /* are all sub-properties consumed? */
800 if(specialCasingIndex<specialCasingCount) {
801 fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
802 *pErrorCode=U_PARSE_ERROR;
803 exit(U_PARSE_ERROR);
804 }
805 if(caseFoldingIndex<caseFoldingCount) {
806 fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
807 *pErrorCode=U_PARSE_ERROR;
808 exit(U_PARSE_ERROR);
809 }
810
811 if(U_FAILURE(*pErrorCode)) {
812 return;
813 }
814
815 for(i=0;
816 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
817 ++i
818 ) {
819 addCaseSensitive(start, end);
820 }
821 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
822 *pErrorCode=U_ZERO_ERROR;
823 }
824 }
825
826 /*
827 * Hey, Emacs, please set the following:
828 *
829 * Local Variables:
830 * indent-tabs-mode: nil
831 * End:
832 *
833 */
834