• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2004-2005, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  gencase.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2004aug28
14 *   created by: Markus W. Scherer
15 *
16 *   This program reads several of the Unicode character database text files,
*   parses them, and extracts the case mapping properties for each character.
18 *   It then writes a binary file containing the properties
19 *   that is designed to be used directly for random-access to
20 *   the properties of each Unicode character.
21 */
22 
23 #include <stdio.h>
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/uset.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "uarrsort.h"
32 #include "unewdata.h"
33 #include "uoptions.h"
34 #include "uparse.h"
35 #include "uprops.h"
36 #include "propsvec.h"
37 #include "gencase.h"
38 
39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
40 
41 /* data --------------------------------------------------------------------- */
42 
/*
 * Properties vector. main() creates it with upvec_open(2, 10000) and
 * binariesLineFn() writes into it with upvec_setValue().
 * Not static; presumably shared with other gencase source files - confirm.
 */
uint32_t *pv;

/*
 * Command-line flags. main() overwrites both from the --verbose and
 * --copyright options after parsing the arguments.
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;
46 
47 /*
48  * Unicode set collecting the case-sensitive characters;
49  * see uchar.h UCHAR_CASE_SENSITIVE.
50  * Add code points from case mappings/foldings in
51  * the root locale and with default options.
52  */
53 static USet *caseSensitive;
54 
55 /* prototypes --------------------------------------------------------------- */
56 
/* Parses SpecialCasing.txt into specialCasings[]; defined further below. */
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);

/* Parses CaseFolding.txt into caseFoldings[]; defined further below. */
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);

/* Parses UnicodeData.txt and calls setProps() per code point; defined further below. */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
65 
66 /* parse files with multiple binary properties ------------------------------ */
67 
68 /* TODO: more common code, move functions to uparse.h|c */
69 
70 /* TODO: similar to genprops/props2.c but not the same */
71 
/*
 * One property name that binariesLineFn() recognizes in field 1 of a
 * data file line, together with the values it hands to upvec_setValue().
 */
struct Binary {
    const char *propName;           /* token compared against field 1 via isToken() */
    int32_t vecWord;                /* passed to upvec_setValue() after the range limit */
    uint32_t vecValue, vecMask;     /* value and mask passed to upvec_setValue(); the mask must not be 0 */
};
typedef struct Binary Binary;
78 
/*
 * Describes one data file handled by parseBinariesFile(): the file's
 * base name plus the table of property names to pick out of it.
 */
struct Binaries {
    const char *ucdFile;            /* base name; writeUCDFilename() appends the suffix and ".txt" */
    const Binary *binaries;         /* table searched by binariesLineFn() */
    int32_t binariesCount;          /* number of entries in binaries[] */
};
typedef struct Binaries Binaries;
85 
/* Property names picked out of the "PropList" file; all other names in that file are ignored. */
static const Binary
propListNames[]={
    { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
};

/* File descriptor passed to parseBinariesFile() from main(). */
static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};
95 
/*
 * Property names picked out of the "DerivedCoreProperties" file.
 * Both entries write under the same mask (UCASE_TYPE_MASK) in word 0.
 */
static const Binary
derCorePropsNames[]={
    { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
    { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK }
};

/* File descriptor passed to parseBinariesFile() from main(). */
static const Binaries
derCorePropsBinaries={
    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};
106 
/*
 * treat Word_Break=MidLetter as a binary property (we ignore all other Word_Break values)
 * Unlike the tables above, this entry is stored in word 1, as a single-bit
 * value and mask built from UGENCASE_IS_MID_LETTER_SHIFT.
 */
static const Binary
wordBreakNames[]={
    { "MidLetter",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

/* File descriptor passed to parseBinariesFile(); main() only uses it when ucdVersion>=UNI_4_1. */
static const Binaries
wordBreakBinaries={
    "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
};
117 
118 static void U_CALLCONV
binariesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)119 binariesLineFn(void *context,
120                char *fields[][2], int32_t fieldCount,
121                UErrorCode *pErrorCode) {
122     const Binaries *bin;
123     char *s;
124     uint32_t start, limit;
125     int32_t i;
126 
127     bin=(const Binaries *)context;
128 
129     u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
130     if(U_FAILURE(*pErrorCode)) {
131         fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
132         exit(*pErrorCode);
133     }
134     ++limit;
135 
136     /* parse binary property name */
137     s=(char *)u_skipWhitespace(fields[1][0]);
138     for(i=0;; ++i) {
139         if(i==bin->binariesCount) {
140             /* ignore unrecognized properties */
141             return;
142         }
143         if(isToken(bin->binaries[i].propName, s)) {
144             break;
145         }
146     }
147 
148     if(bin->binaries[i].vecMask==0) {
149         fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
150                         (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
151         exit(U_INTERNAL_PROGRAM_ERROR);
152     }
153 
154     if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) {
155         fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
156                         bin->binaries[i].propName, u_errorName(*pErrorCode));
157         exit(*pErrorCode);
158     }
159 }
160 
161 static void
parseBinariesFile(char * filename,char * basename,const char * suffix,const Binaries * bin,UErrorCode * pErrorCode)162 parseBinariesFile(char *filename, char *basename, const char *suffix,
163                   const Binaries *bin,
164                   UErrorCode *pErrorCode) {
165     char *fields[2][2];
166 
167     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
168         return;
169     }
170 
171     writeUCDFilename(basename, bin->ucdFile, suffix);
172 
173     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
174     if(U_FAILURE(*pErrorCode)) {
175         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
176     }
177 }
178 
179 /* -------------------------------------------------------------------------- */
180 
/* Indexes into options[] below; the order must match the initializer of that array. */
enum
{
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    UNICODE_VERSION,
    ICUDATADIR,
    CSOURCE
};
193 
/* Keep these values in sync with the above enums */
/* Command-line option table; main() presets some .value fields and passes it to u_parseArgs(). */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),     /* UNICODE_VERSION */
    UOPTION_ICUDATADIR,
    UOPTION_DEF("csource", 'C', UOPT_NO_ARG)            /* CSOURCE */
};
206 
207 extern int
main(int argc,char * argv[])208 main(int argc, char* argv[]) {
209     char filename[300];
210     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
211     char *basename=NULL;
212     UErrorCode errorCode=U_ZERO_ERROR;
213 
214     U_MAIN_INIT_ARGS(argc, argv);
215 
216     /* preset then read command line options */
217     options[DESTDIR].value=u_getDataDirectory();
218     options[SOURCEDIR].value="";
219     options[UNICODE_VERSION].value="";
220     options[ICUDATADIR].value=u_getDataDirectory();
221     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
222 
223     /* error handling, printing usage message */
224     if(argc<0) {
225         fprintf(stderr,
226             "error in command line argument \"%s\"\n",
227             argv[-argc]);
228     }
229     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
230         /*
231          * Broken into chucks because the C89 standard says the minimum
232          * required supported string length is 509 bytes.
233          */
234         fprintf(stderr,
235             "Usage: %s [-options] [suffix]\n"
236             "\n"
237             "read the UnicodeData.txt file and other Unicode properties files and\n"
238             "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
239             "\n",
240             argv[0]);
241         fprintf(stderr,
242             "Options:\n"
243             "\t-h or -? or --help  this usage text\n"
244             "\t-v or --verbose     verbose output\n"
245             "\t-c or --copyright   include a copyright notice\n"
246             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
247             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
248         fprintf(stderr,
249             "\t-d or --destdir     destination directory, followed by the path\n"
250             "\t-s or --sourcedir   source directory, followed by the path\n"
251             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
252             "\t                    followed by path, defaults to %s\n"
253             "\tsuffix              suffix that is to be appended with a '-'\n"
254             "\t                    to the source file basenames before opening;\n"
255             "\t                    'gencase new' will read UnicodeData-new.txt etc.\n",
256             u_getDataDirectory());
257         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
258     }
259 
260     /* get the options values */
261     beVerbose=options[VERBOSE].doesOccur;
262     haveCopyright=options[COPYRIGHT].doesOccur;
263     srcDir=options[SOURCEDIR].value;
264     destDir=options[DESTDIR].value;
265 
266     if(argc>=2) {
267         suffix=argv[1];
268     } else {
269         suffix=NULL;
270     }
271 
272     if(options[UNICODE_VERSION].doesOccur) {
273         setUnicodeVersion(options[UNICODE_VERSION].value);
274     }
275     /* else use the default dataVersion in store.c */
276 
277     if (options[ICUDATADIR].doesOccur) {
278         u_setDataDirectory(options[ICUDATADIR].value);
279     }
280 
281     /* prepare the filename beginning with the source dir */
282     uprv_strcpy(filename, srcDir);
283     basename=filename+uprv_strlen(filename);
284     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
285         *basename++=U_FILE_SEP_CHAR;
286     }
287 
288     /* initialize */
289     pv=upvec_open(2, 10000);
290     caseSensitive=uset_open(1, 0); /* empty set (start>end) */
291 
292     /* process SpecialCasing.txt */
293     writeUCDFilename(basename, "SpecialCasing", suffix);
294     parseSpecialCasing(filename, &errorCode);
295 
296     /* process CaseFolding.txt */
297     writeUCDFilename(basename, "CaseFolding", suffix);
298     parseCaseFolding(filename, &errorCode);
299 
300     /* process additional properties files */
301     *basename=0;
302 
303     parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
304 
305     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
306 
307     if(ucdVersion>=UNI_4_1) {
308         parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
309     }
310 
311     /* process UnicodeData.txt */
312     writeUCDFilename(basename, "UnicodeData", suffix);
313     parseDB(filename, &errorCode);
314 
315     /* process parsed data */
316     makeCaseClosure();
317 
318     makeExceptions();
319 
320     if(U_SUCCESS(errorCode)) {
321         /* write the properties data file */
322         generateData(destDir, options[CSOURCE].doesOccur);
323     }
324 
325     u_cleanup();
326     return errorCode;
327 }
328 
329 U_CFUNC void
writeUCDFilename(char * basename,const char * filename,const char * suffix)330 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
331     int32_t length=(int32_t)uprv_strlen(filename);
332     uprv_strcpy(basename, filename);
333     if(suffix!=NULL) {
334         basename[length++]='-';
335         uprv_strcpy(basename+length, suffix);
336         length+=(int32_t)uprv_strlen(suffix);
337     }
338     uprv_strcpy(basename+length, ".txt");
339 }
340 
341 /* TODO: move to toolutil */
342 U_CFUNC UBool
isToken(const char * token,const char * s)343 isToken(const char *token, const char *s) {
344     const char *z;
345     int32_t j;
346 
347     s=u_skipWhitespace(s);
348     for(j=0;; ++j) {
349         if(token[j]!=0) {
350             if(s[j]!=token[j]) {
351                 break;
352             }
353         } else {
354             z=u_skipWhitespace(s+j);
355             if(*z==';' || *z==0) {
356                 return TRUE;
357             } else {
358                 break;
359             }
360         }
361     }
362 
363     return FALSE;
364 }
365 
366 static int32_t
getTokenIndex(const char * const tokens[],int32_t countTokens,const char * s)367 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
368     const char *t, *z;
369     int32_t i, j;
370 
371     s=u_skipWhitespace(s);
372     for(i=0; i<countTokens; ++i) {
373         t=tokens[i];
374         if(t!=NULL) {
375             for(j=0;; ++j) {
376                 if(t[j]!=0) {
377                     if(s[j]!=t[j]) {
378                         break;
379                     }
380                 } else {
381                     z=u_skipWhitespace(s+j);
382                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
383                         return i;
384                     } else {
385                         break;
386                     }
387                 }
388             }
389         }
390     }
391     return -1;
392 }
393 
394 static void
_set_addAll(USet * set,const UChar * s,int32_t length)395 _set_addAll(USet *set, const UChar *s, int32_t length) {
396     UChar32 c;
397     int32_t i;
398 
399     /* needs length>=0 */
400     for(i=0; i<length; /* U16_NEXT advances i */) {
401         U16_NEXT(s, i, length, c);
402         uset_add(set, c);
403     }
404 }
405 
406 /* parser for SpecialCasing.txt --------------------------------------------- */
407 
/*
 * Capacity of specialCasings[]. specialCasingLineFn() exits as soon as the
 * count reaches this value, so at most one entry fewer is ever kept.
 */
#define MAX_SPECIAL_CASING_COUNT 500

/* Entries collected from SpecialCasing.txt; sorted by code point in parseSpecialCasing(). */
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
/* Number of entries of specialCasings[] in use. */
static int32_t specialCasingCount=0;
412 
413 static void U_CALLCONV
specialCasingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)414 specialCasingLineFn(void *context,
415                     char *fields[][2], int32_t fieldCount,
416                     UErrorCode *pErrorCode) {
417     char *end;
418 
419     /* get code point */
420     specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
421     end=(char *)u_skipWhitespace(end);
422     if(end<=fields[0][0] || end!=fields[0][1]) {
423         fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
424         *pErrorCode=U_PARSE_ERROR;
425         exit(U_PARSE_ERROR);
426     }
427 
428     /* is this a complex mapping? */
429     if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
430         /* there is some condition text in the fifth field */
431         specialCasings[specialCasingCount].isComplex=TRUE;
432 
433         /* do not store any actual mappings for this */
434         specialCasings[specialCasingCount].lowerCase[0]=0;
435         specialCasings[specialCasingCount].upperCase[0]=0;
436         specialCasings[specialCasingCount].titleCase[0]=0;
437     } else {
438         /* just set the "complex" flag and get the case mappings */
439         specialCasings[specialCasingCount].isComplex=FALSE;
440         specialCasings[specialCasingCount].lowerCase[0]=
441             (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
442         specialCasings[specialCasingCount].upperCase[0]=
443             (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
444         specialCasings[specialCasingCount].titleCase[0]=
445             (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
446         if(U_FAILURE(*pErrorCode)) {
447             fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
448             exit(*pErrorCode);
449         }
450 
451         uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
452         _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
453         _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
454         _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
455     }
456 
457     if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
458         fprintf(stderr, "gencase: too many special casing mappings\n");
459         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
460         exit(U_INDEX_OUTOFBOUNDS_ERROR);
461     }
462 }
463 
464 static int32_t U_CALLCONV
compareSpecialCasings(const void * context,const void * left,const void * right)465 compareSpecialCasings(const void *context, const void *left, const void *right) {
466     return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
467 }
468 
469 static void
parseSpecialCasing(const char * filename,UErrorCode * pErrorCode)470 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
471     char *fields[5][2];
472     int32_t i, j;
473 
474     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
475         return;
476     }
477 
478     u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
479 
480     /* sort the special casing entries by code point */
481     if(specialCasingCount>0) {
482         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
483                        compareSpecialCasings, NULL, FALSE, pErrorCode);
484     }
485     if(U_FAILURE(*pErrorCode)) {
486         return;
487     }
488 
489     /* replace multiple entries for any code point by one "complex" one */
490     j=0;
491     for(i=1; i<specialCasingCount; ++i) {
492         if(specialCasings[i-1].code==specialCasings[i].code) {
493             /* there is a duplicate code point */
494             specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following sorting */
495             specialCasings[i].isComplex=TRUE;       /* make the following one complex */
496             specialCasings[i].lowerCase[0]=0;
497             specialCasings[i].upperCase[0]=0;
498             specialCasings[i].titleCase[0]=0;
499             ++j;
500         }
501     }
502 
503     /* if some entries just were removed, then re-sort */
504     if(j>0) {
505         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
506                        compareSpecialCasings, NULL, FALSE, pErrorCode);
507         specialCasingCount-=j;
508     }
509     if(U_FAILURE(*pErrorCode)) {
510         return;
511     }
512 
513     /*
514      * Add one complex mapping to caseSensitive that was filtered out above:
515      * Greek final Sigma has a conditional mapping but not locale-sensitive,
516      * and it is taken when lowercasing just U+03A3 alone.
517      * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
518      */
519     uset_add(caseSensitive, 0x3c2);
520 }
521 
522 /* parser for CaseFolding.txt ----------------------------------------------- */
523 
/*
 * Capacity of caseFoldings[]. caseFoldingLineFn() exits as soon as the
 * count reaches this value, so at most one entry fewer is ever kept.
 */
#define MAX_CASE_FOLDING_COUNT 2000

/* Entries collected from CaseFolding.txt, in file order (checked to be ascending by code point). */
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
/* Number of entries of caseFoldings[] in use; also the index of the slot being filled. */
static int32_t caseFoldingCount=0;
528 
/*
 * Line callback for CaseFolding.txt, called through u_parseDelimitedFile().
 *
 * Parses one line into the slot caseFoldings[caseFoldingCount] and commits
 * the slot by incrementing caseFoldingCount at the very end. Every early
 * "return" leaves the count unchanged, so the slot is simply overwritten
 * by the next line. Simple (S) and full (F) mappings for the same code
 * point are merged into one entry; I and T lines replace earlier entries
 * for the same code point with an empty marker entry.
 *
 * @param context    not used
 * @param fields     start/limit pointers of the fields of the current line
 * @param fieldCount not used
 * @param pErrorCode ICU error code; on any failure the program exits
 */
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    /* code point of the most recently committed entry, kept across calls for the ordering check */
    static UChar32 prevCode=0;
    int32_t count;
    char status;

    /* get code point */
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    /* the number must be non-empty and fill the whole field */
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping */
    /* only the first non-white-space character of field 1 is looked at */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        return;
    }

    /* get the mapping */
    /* full[0] receives the length in UChars, full+1 the string, simple the value that u_parseString() stores through its fourth argument */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    /* two UChars are one code point only if the first of them is not a single (non-surrogate) unit */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries */
            /* the current slot is not committed; the previous entry keeps its status 'F' */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries */
            /* copies the length element and the string; the previous entry keeps its status 'S' */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            /* resetting prevCode lets the ordering check below accept this code point again */
            prevCode=0;
            --caseFoldingCount;
        }
        /* store only a marker for special handling for cases like dotless i */
        /*
         * NOTE(review): if the loop above stepped back, caseFoldingCount now
         * indexes the earlier entry for this code point, and only simple and
         * full[0] are reset here; that entry's status field keeps its old
         * value rather than 'I'/'T' - confirm that this is intended.
         */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    /* commit the slot; stop before the array could overflow on a later line */
    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
629 
630 static void
parseCaseFolding(const char * filename,UErrorCode * pErrorCode)631 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
632     char *fields[3][2];
633 
634     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
635         return;
636     }
637 
638     u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
639 }
640 
641 /* parser for UnicodeData.txt ----------------------------------------------- */
642 
643 /* general categories */
/*
 * General category abbreviations as they appear in field 2 of UnicodeData.txt.
 * unicodeDataLineFn() stores the index of the matching entry as the
 * character's gc value.
 * NOTE(review): the order presumably has to match the UCharCategory
 * constants in unicode/uchar.h - confirm before reordering.
 */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    "Mc", "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Co", "Cs",
    "Pd", "Ps", "Pe", "Pc", "Po",
    "Sm", "Sc", "Sk", "So",
    "Pi", "Pf"
};
655 
/*
 * Read cursors into specialCasings[] and caseFoldings[]. unicodeDataLineFn()
 * advances each one when the current code point matches the entry it points
 * to; parseDB() checks afterwards that both reached the end.
 */
static int32_t specialCasingIndex=0, caseFoldingIndex=0;
657 
658 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)659 unicodeDataLineFn(void *context,
660                   char *fields[][2], int32_t fieldCount,
661                   UErrorCode *pErrorCode) {
662     Props p;
663     char *end;
664     static UChar32 prevCode=0;
665     UChar32 value;
666     int32_t i;
667 
668     /* reset the properties */
669     uprv_memset(&p, 0, sizeof(Props));
670 
671     /* get the character code, field 0 */
672     p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
673     if(end<=fields[0][0] || end!=fields[0][1]) {
674         fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
675         *pErrorCode=U_PARSE_ERROR;
676         exit(U_PARSE_ERROR);
677     }
678 
679     /* get general category, field 2 */
680     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
681     if(i>=0) {
682         p.gc=(uint8_t)i;
683     } else {
684         fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
685             fields[2][0], (unsigned long)p.code);
686         *pErrorCode=U_PARSE_ERROR;
687         exit(U_PARSE_ERROR);
688     }
689 
690     /* get canonical combining class, field 3 */
691     value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
692     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
693         fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
694         *pErrorCode=U_PARSE_ERROR;
695         exit(U_PARSE_ERROR);
696     }
697     p.cc=(uint8_t)value;
698 
699     /* get uppercase mapping, field 12 */
700     value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
701     if(end!=fields[12][1]) {
702         fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
703             (unsigned long)p.code);
704         *pErrorCode=U_PARSE_ERROR;
705         exit(U_PARSE_ERROR);
706     }
707     if(value!=0 && value!=p.code) {
708         p.upperCase=value;
709         uset_add(caseSensitive, p.code);
710         uset_add(caseSensitive, value);
711     }
712 
713     /* get lowercase value, field 13 */
714     value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
715     if(end!=fields[13][1]) {
716         fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
717             (unsigned long)p.code);
718         *pErrorCode=U_PARSE_ERROR;
719         exit(U_PARSE_ERROR);
720     }
721     if(value!=0 && value!=p.code) {
722         p.lowerCase=value;
723         uset_add(caseSensitive, p.code);
724         uset_add(caseSensitive, value);
725     }
726 
727     /* get titlecase value, field 14 */
728     value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
729     if(end!=fields[14][1]) {
730         fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
731             (unsigned long)p.code);
732         *pErrorCode=U_PARSE_ERROR;
733         exit(U_PARSE_ERROR);
734     }
735     if(value!=0 && value!=p.code) {
736         p.titleCase=value;
737         uset_add(caseSensitive, p.code);
738         uset_add(caseSensitive, value);
739     }
740 
741     /* set additional properties from previously parsed files */
742     if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
743         p.specialCasing=specialCasings+specialCasingIndex++;
744     } else {
745         p.specialCasing=NULL;
746     }
747     if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
748         p.caseFolding=caseFoldings+caseFoldingIndex++;
749 
750         /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
751         if( p.caseFolding->status=='C' &&
752             p.caseFolding->simple==p.lowerCase
753         ) {
754             p.caseFolding=NULL;
755         }
756     } else {
757         p.caseFolding=NULL;
758     }
759 
760     /* check for non-character code points */
761     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
762         fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
763                 (unsigned long)p.code);
764         *pErrorCode=U_PARSE_ERROR;
765         exit(U_PARSE_ERROR);
766     }
767 
768     /* check that the code points (p.code) are in ascending order */
769     if(p.code<=prevCode && p.code>0) {
770         fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
771                 (unsigned long)p.code, (unsigned long)prevCode);
772         *pErrorCode=U_PARSE_ERROR;
773         exit(U_PARSE_ERROR);
774     }
775 
776     /* properties for a single code point */
777     setProps(&p);
778 
779     prevCode=p.code;
780 }
781 
782 static void
parseDB(const char * filename,UErrorCode * pErrorCode)783 parseDB(const char *filename, UErrorCode *pErrorCode) {
784     char *fields[15][2];
785     UChar32 start, end;
786     int32_t i;
787 
788     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
789         return;
790     }
791 
792     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
793 
794     /* are all sub-properties consumed? */
795     if(specialCasingIndex<specialCasingCount) {
796         fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
797         *pErrorCode=U_PARSE_ERROR;
798         exit(U_PARSE_ERROR);
799     }
800     if(caseFoldingIndex<caseFoldingCount) {
801         fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
802         *pErrorCode=U_PARSE_ERROR;
803         exit(U_PARSE_ERROR);
804     }
805 
806     if(U_FAILURE(*pErrorCode)) {
807         return;
808     }
809 
810     for(i=0;
811         0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
812         ++i
813     ) {
814         addCaseSensitive(start, end);
815     }
816     if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
817         *pErrorCode=U_ZERO_ERROR;
818     }
819 }
820 
821 /*
822  * Hey, Emacs, please set the following:
823  *
824  * Local Variables:
825  * indent-tabs-mode: nil
826  * End:
827  *
828  */
829