• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2004-2008, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  gencase.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2004aug28
14 *   created by: Markus W. Scherer
15 *
16 *   This program reads several of the Unicode character database text files,
17 *   parses them, and extracts the case mapping properties for each character.
18 *   It then writes a binary file containing the properties
19 *   that is designed to be used directly for random-access to
20 *   the properties of each Unicode character.
21 */
22 
23 #include <stdio.h>
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/uset.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "uarrsort.h"
32 #include "unewdata.h"
33 #include "uoptions.h"
34 #include "uparse.h"
35 #include "uprops.h"
36 #include "propsvec.h"
37 #include "gencase.h"
38 
39 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
40 
41 /* data --------------------------------------------------------------------- */
42 
43 UPropsVectors *pv;
44 
45 UBool beVerbose=FALSE, haveCopyright=TRUE;
46 
47 /*
48  * Unicode set collecting the case-sensitive characters;
49  * see uchar.h UCHAR_CASE_SENSITIVE.
50  * Add code points from case mappings/foldings in
51  * the root locale and with default options.
52  */
53 static USet *caseSensitive;
54 
55 /* prototypes --------------------------------------------------------------- */
56 
57 static void
58 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
59 
60 static void
61 parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
62 
63 static void
64 parseDB(const char *filename, UErrorCode *pErrorCode);
65 
66 /* parse files with multiple binary properties ------------------------------ */
67 
68 /* TODO: more common code, move functions to uparse.h|c */
69 
70 /* TODO: similar to genprops/props2.c but not the same */
71 
/*
 * One recognized property name together with the properties-vector update
 * that binariesLineFn() applies (via upvec_setValue()) when a line of the
 * UCD file names this property.
 */
typedef struct Binary {
    const char *propName;   /* token compared against field 1 of each line */
    int32_t vecWord;        /* passed as the column argument of upvec_setValue() */
    uint32_t vecValue;      /* passed as the value argument of upvec_setValue() */
    uint32_t vecMask;       /* passed as the mask argument; 0 is rejected as an internal error */
} Binary;

/*
 * Describes one UCD file that is parsed for "binary" properties:
 * the file's base name and the list of property names to look for in it.
 */
typedef struct Binaries {
    const char *ucdFile;        /* base name handed to writeUCDFilename() */
    const Binary *binaries;     /* array of recognized properties */
    int32_t binariesCount;      /* number of entries in binaries[] */
} Binaries;
85 
86 static const Binary
87 propListNames[]={
88     { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
89 };
90 
91 static const Binaries
92 propListBinaries={
93     "PropList", propListNames, LENGTHOF(propListNames)
94 };
95 
96 static const Binary
97 derCorePropsNames[]={
98     { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
99     { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK }
100 };
101 
102 static const Binaries
103 derCorePropsBinaries={
104     "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
105 };
106 
107 /*
108  * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
109  * We need not distinguish between them because both add to case-ignorable.
110  * We ignore all other Word_Break values.
111  */
112 static const Binary
113 wordBreakNames[]={
114     { "MidLetter",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) },
115     { "MidNumLet",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
116 };
117 
118 static const Binaries
119 wordBreakBinaries={
120     "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
121 };
122 
123 static void U_CALLCONV
binariesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)124 binariesLineFn(void *context,
125                char *fields[][2], int32_t fieldCount,
126                UErrorCode *pErrorCode) {
127     const Binaries *bin;
128     char *s;
129     uint32_t start, end;
130     int32_t i;
131 
132     bin=(const Binaries *)context;
133 
134     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
135     if(U_FAILURE(*pErrorCode)) {
136         fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
137         exit(*pErrorCode);
138     }
139 
140     /* parse binary property name */
141     s=(char *)u_skipWhitespace(fields[1][0]);
142     for(i=0;; ++i) {
143         if(i==bin->binariesCount) {
144             /* ignore unrecognized properties */
145             return;
146         }
147         if(isToken(bin->binaries[i].propName, s)) {
148             break;
149         }
150     }
151 
152     if(bin->binaries[i].vecMask==0) {
153         fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
154                         (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
155         exit(U_INTERNAL_PROGRAM_ERROR);
156     }
157 
158     upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode);
159     if(U_FAILURE(*pErrorCode)) {
160         fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
161                         bin->binaries[i].propName, u_errorName(*pErrorCode));
162         exit(*pErrorCode);
163     }
164 }
165 
166 static void
parseBinariesFile(char * filename,char * basename,const char * suffix,const Binaries * bin,UErrorCode * pErrorCode)167 parseBinariesFile(char *filename, char *basename, const char *suffix,
168                   const Binaries *bin,
169                   UErrorCode *pErrorCode) {
170     char *fields[2][2];
171 
172     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
173         return;
174     }
175 
176     writeUCDFilename(basename, bin->ucdFile, suffix);
177 
178     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
179     if(U_FAILURE(*pErrorCode)) {
180         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
181     }
182 }
183 
184 /* -------------------------------------------------------------------------- */
185 
186 enum
187 {
188     HELP_H,
189     HELP_QUESTION_MARK,
190     VERBOSE,
191     COPYRIGHT,
192     DESTDIR,
193     SOURCEDIR,
194     UNICODE_VERSION,
195     ICUDATADIR,
196     CSOURCE
197 };
198 
199 /* Keep these values in sync with the above enums */
200 static UOption options[]={
201     UOPTION_HELP_H,
202     UOPTION_HELP_QUESTION_MARK,
203     UOPTION_VERBOSE,
204     UOPTION_COPYRIGHT,
205     UOPTION_DESTDIR,
206     UOPTION_SOURCEDIR,
207     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
208     UOPTION_ICUDATADIR,
209     UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
210 };
211 
212 extern int
main(int argc,char * argv[])213 main(int argc, char* argv[]) {
214     char filename[300];
215     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
216     char *basename=NULL;
217     UErrorCode errorCode=U_ZERO_ERROR;
218 
219     U_MAIN_INIT_ARGS(argc, argv);
220 
221     /* preset then read command line options */
222     options[DESTDIR].value=u_getDataDirectory();
223     options[SOURCEDIR].value="";
224     options[UNICODE_VERSION].value="";
225     options[ICUDATADIR].value=u_getDataDirectory();
226     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
227 
228     /* error handling, printing usage message */
229     if(argc<0) {
230         fprintf(stderr,
231             "error in command line argument \"%s\"\n",
232             argv[-argc]);
233     }
234     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
235         /*
236          * Broken into chucks because the C89 standard says the minimum
237          * required supported string length is 509 bytes.
238          */
239         fprintf(stderr,
240             "Usage: %s [-options] [suffix]\n"
241             "\n"
242             "read the UnicodeData.txt file and other Unicode properties files and\n"
243             "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
244             "\n",
245             argv[0]);
246         fprintf(stderr,
247             "Options:\n"
248             "\t-h or -? or --help  this usage text\n"
249             "\t-v or --verbose     verbose output\n"
250             "\t-c or --copyright   include a copyright notice\n"
251             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
252             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
253         fprintf(stderr,
254             "\t-d or --destdir     destination directory, followed by the path\n"
255             "\t-s or --sourcedir   source directory, followed by the path\n"
256             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
257             "\t                    followed by path, defaults to %s\n"
258             "\tsuffix              suffix that is to be appended with a '-'\n"
259             "\t                    to the source file basenames before opening;\n"
260             "\t                    'gencase new' will read UnicodeData-new.txt etc.\n",
261             u_getDataDirectory());
262         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
263     }
264 
265     /* get the options values */
266     beVerbose=options[VERBOSE].doesOccur;
267     haveCopyright=options[COPYRIGHT].doesOccur;
268     srcDir=options[SOURCEDIR].value;
269     destDir=options[DESTDIR].value;
270 
271     if(argc>=2) {
272         suffix=argv[1];
273     } else {
274         suffix=NULL;
275     }
276 
277     if(options[UNICODE_VERSION].doesOccur) {
278         setUnicodeVersion(options[UNICODE_VERSION].value);
279     }
280     /* else use the default dataVersion in store.c */
281 
282     if (options[ICUDATADIR].doesOccur) {
283         u_setDataDirectory(options[ICUDATADIR].value);
284     }
285 
286     /* prepare the filename beginning with the source dir */
287     uprv_strcpy(filename, srcDir);
288     basename=filename+uprv_strlen(filename);
289     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
290         *basename++=U_FILE_SEP_CHAR;
291     }
292 
293     /* initialize */
294     pv=upvec_open(2, &errorCode);
295     caseSensitive=uset_open(1, 0); /* empty set (start>end) */
296 
297     /* process SpecialCasing.txt */
298     writeUCDFilename(basename, "SpecialCasing", suffix);
299     parseSpecialCasing(filename, &errorCode);
300 
301     /* process CaseFolding.txt */
302     writeUCDFilename(basename, "CaseFolding", suffix);
303     parseCaseFolding(filename, &errorCode);
304 
305     /* process additional properties files */
306     *basename=0;
307 
308     parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
309 
310     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
311 
312     if(ucdVersion>=UNI_4_1) {
313         parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
314     }
315 
316     /* process UnicodeData.txt */
317     writeUCDFilename(basename, "UnicodeData", suffix);
318     parseDB(filename, &errorCode);
319 
320     /* process parsed data */
321     makeCaseClosure();
322 
323     makeExceptions();
324 
325     if(U_SUCCESS(errorCode)) {
326         /* write the properties data file */
327         generateData(destDir, options[CSOURCE].doesOccur);
328     }
329 
330     u_cleanup();
331     return errorCode;
332 }
333 
334 U_CFUNC void
writeUCDFilename(char * basename,const char * filename,const char * suffix)335 writeUCDFilename(char *basename, const char *filename, const char *suffix) {
336     int32_t length=(int32_t)uprv_strlen(filename);
337     uprv_strcpy(basename, filename);
338     if(suffix!=NULL) {
339         basename[length++]='-';
340         uprv_strcpy(basename+length, suffix);
341         length+=(int32_t)uprv_strlen(suffix);
342     }
343     uprv_strcpy(basename+length, ".txt");
344 }
345 
346 /* TODO: move to toolutil */
347 U_CFUNC UBool
isToken(const char * token,const char * s)348 isToken(const char *token, const char *s) {
349     const char *z;
350     int32_t j;
351 
352     s=u_skipWhitespace(s);
353     for(j=0;; ++j) {
354         if(token[j]!=0) {
355             if(s[j]!=token[j]) {
356                 break;
357             }
358         } else {
359             z=u_skipWhitespace(s+j);
360             if(*z==';' || *z==0) {
361                 return TRUE;
362             } else {
363                 break;
364             }
365         }
366     }
367 
368     return FALSE;
369 }
370 
371 static int32_t
getTokenIndex(const char * const tokens[],int32_t countTokens,const char * s)372 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
373     const char *t, *z;
374     int32_t i, j;
375 
376     s=u_skipWhitespace(s);
377     for(i=0; i<countTokens; ++i) {
378         t=tokens[i];
379         if(t!=NULL) {
380             for(j=0;; ++j) {
381                 if(t[j]!=0) {
382                     if(s[j]!=t[j]) {
383                         break;
384                     }
385                 } else {
386                     z=u_skipWhitespace(s+j);
387                     if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
388                         return i;
389                     } else {
390                         break;
391                     }
392                 }
393             }
394         }
395     }
396     return -1;
397 }
398 
399 static void
_set_addAll(USet * set,const UChar * s,int32_t length)400 _set_addAll(USet *set, const UChar *s, int32_t length) {
401     UChar32 c;
402     int32_t i;
403 
404     /* needs length>=0 */
405     for(i=0; i<length; /* U16_NEXT advances i */) {
406         U16_NEXT(s, i, length, c);
407         uset_add(set, c);
408     }
409 }
410 
411 /* parser for SpecialCasing.txt --------------------------------------------- */
412 
413 #define MAX_SPECIAL_CASING_COUNT 500
414 
415 static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
416 static int32_t specialCasingCount=0;
417 
418 static void U_CALLCONV
specialCasingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)419 specialCasingLineFn(void *context,
420                     char *fields[][2], int32_t fieldCount,
421                     UErrorCode *pErrorCode) {
422     char *end;
423 
424     /* get code point */
425     specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
426     end=(char *)u_skipWhitespace(end);
427     if(end<=fields[0][0] || end!=fields[0][1]) {
428         fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
429         *pErrorCode=U_PARSE_ERROR;
430         exit(U_PARSE_ERROR);
431     }
432 
433     /* is this a complex mapping? */
434     if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
435         /* there is some condition text in the fifth field */
436         specialCasings[specialCasingCount].isComplex=TRUE;
437 
438         /* do not store any actual mappings for this */
439         specialCasings[specialCasingCount].lowerCase[0]=0;
440         specialCasings[specialCasingCount].upperCase[0]=0;
441         specialCasings[specialCasingCount].titleCase[0]=0;
442     } else {
443         /* just set the "complex" flag and get the case mappings */
444         specialCasings[specialCasingCount].isComplex=FALSE;
445         specialCasings[specialCasingCount].lowerCase[0]=
446             (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
447         specialCasings[specialCasingCount].upperCase[0]=
448             (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
449         specialCasings[specialCasingCount].titleCase[0]=
450             (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
451         if(U_FAILURE(*pErrorCode)) {
452             fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
453             exit(*pErrorCode);
454         }
455 
456         uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
457         _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
458         _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
459         _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
460     }
461 
462     if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
463         fprintf(stderr, "gencase: too many special casing mappings\n");
464         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
465         exit(U_INDEX_OUTOFBOUNDS_ERROR);
466     }
467 }
468 
469 static int32_t U_CALLCONV
compareSpecialCasings(const void * context,const void * left,const void * right)470 compareSpecialCasings(const void *context, const void *left, const void *right) {
471     return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
472 }
473 
474 static void
parseSpecialCasing(const char * filename,UErrorCode * pErrorCode)475 parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
476     char *fields[5][2];
477     int32_t i, j;
478 
479     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
480         return;
481     }
482 
483     u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
484 
485     /* sort the special casing entries by code point */
486     if(specialCasingCount>0) {
487         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
488                        compareSpecialCasings, NULL, FALSE, pErrorCode);
489     }
490     if(U_FAILURE(*pErrorCode)) {
491         return;
492     }
493 
494     /* replace multiple entries for any code point by one "complex" one */
495     j=0;
496     for(i=1; i<specialCasingCount; ++i) {
497         if(specialCasings[i-1].code==specialCasings[i].code) {
498             /* there is a duplicate code point */
499             specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following sorting */
500             specialCasings[i].isComplex=TRUE;       /* make the following one complex */
501             specialCasings[i].lowerCase[0]=0;
502             specialCasings[i].upperCase[0]=0;
503             specialCasings[i].titleCase[0]=0;
504             ++j;
505         }
506     }
507 
508     /* if some entries just were removed, then re-sort */
509     if(j>0) {
510         uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
511                        compareSpecialCasings, NULL, FALSE, pErrorCode);
512         specialCasingCount-=j;
513     }
514     if(U_FAILURE(*pErrorCode)) {
515         return;
516     }
517 
518     /*
519      * Add one complex mapping to caseSensitive that was filtered out above:
520      * Greek final Sigma has a conditional mapping but not locale-sensitive,
521      * and it is taken when lowercasing just U+03A3 alone.
522      * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
523      */
524     uset_add(caseSensitive, 0x3c2);
525 }
526 
527 /* parser for CaseFolding.txt ----------------------------------------------- */
528 
529 #define MAX_CASE_FOLDING_COUNT 2000
530 
531 static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
532 static int32_t caseFoldingCount=0;
533 
534 static void U_CALLCONV
caseFoldingLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)535 caseFoldingLineFn(void *context,
536                   char *fields[][2], int32_t fieldCount,
537                   UErrorCode *pErrorCode) {
538     char *end;
539     static UChar32 prevCode=0;
540     int32_t count;
541     char status;
542 
543     /* get code point */
544     caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
545     end=(char *)u_skipWhitespace(end);
546     if(end<=fields[0][0] || end!=fields[0][1]) {
547         fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
548         *pErrorCode=U_PARSE_ERROR;
549         exit(U_PARSE_ERROR);
550     }
551 
552     /* get the status of this mapping */
553     caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
554     if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
555         fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
556         *pErrorCode=U_PARSE_ERROR;
557         exit(U_PARSE_ERROR);
558     }
559 
560     /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
561     if(status=='L') {
562         return;
563     }
564 
565     /* get the mapping */
566     count=caseFoldings[caseFoldingCount].full[0]=
567         (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
568     if(U_FAILURE(*pErrorCode)) {
569         fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
570         exit(*pErrorCode);
571     }
572 
573     /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
574     if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
575         caseFoldings[caseFoldingCount].simple=0;
576     }
577 
578     /* update the case-sensitive set */
579     if(status!='T') {
580         uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
581         _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
582     }
583 
584     /* check the status */
585     if(status=='S') {
586         /* check if there was a full mapping for this code point before */
587         if( caseFoldingCount>0 &&
588             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
589             caseFoldings[caseFoldingCount-1].status=='F'
590         ) {
591             /* merge the two entries */
592             caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
593             return;
594         }
595     } else if(status=='F') {
596         /* check if there was a simple mapping for this code point before */
597         if( caseFoldingCount>0 &&
598             caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
599             caseFoldings[caseFoldingCount-1].status=='S'
600         ) {
601             /* merge the two entries */
602             uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
603             return;
604         }
605     } else if(status=='I' || status=='T') {
606         /* check if there was a default mapping for this code point before (remove it) */
607         while(caseFoldingCount>0 &&
608               caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
609         ) {
610             prevCode=0;
611             --caseFoldingCount;
612         }
613         /* store only a marker for special handling for cases like dotless i */
614         caseFoldings[caseFoldingCount].simple=0;
615         caseFoldings[caseFoldingCount].full[0]=0;
616     }
617 
618     /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
619     if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
620         fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
621                 (unsigned long)caseFoldings[caseFoldingCount].code,
622                 (unsigned long)prevCode);
623         *pErrorCode=U_PARSE_ERROR;
624         exit(U_PARSE_ERROR);
625     }
626     prevCode=caseFoldings[caseFoldingCount].code;
627 
628     if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
629         fprintf(stderr, "gencase: too many case folding mappings\n");
630         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
631         exit(U_INDEX_OUTOFBOUNDS_ERROR);
632     }
633 }
634 
635 static void
parseCaseFolding(const char * filename,UErrorCode * pErrorCode)636 parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
637     char *fields[3][2];
638 
639     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
640         return;
641     }
642 
643     u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
644 }
645 
646 /* parser for UnicodeData.txt ----------------------------------------------- */
647 
648 /* general categories */
649 const char *const
650 genCategoryNames[U_CHAR_CATEGORY_COUNT]={
651     "Cn",
652     "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
653     "Mc", "Nd", "Nl", "No",
654     "Zs", "Zl", "Zp",
655     "Cc", "Cf", "Co", "Cs",
656     "Pd", "Ps", "Pe", "Pc", "Po",
657     "Sm", "Sc", "Sk", "So",
658     "Pi", "Pf"
659 };
660 
661 static int32_t specialCasingIndex=0, caseFoldingIndex=0;
662 
663 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)664 unicodeDataLineFn(void *context,
665                   char *fields[][2], int32_t fieldCount,
666                   UErrorCode *pErrorCode) {
667     Props p;
668     char *end;
669     static UChar32 prevCode=0;
670     UChar32 value;
671     int32_t i;
672 
673     /* reset the properties */
674     uprv_memset(&p, 0, sizeof(Props));
675 
676     /* get the character code, field 0 */
677     p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
678     if(end<=fields[0][0] || end!=fields[0][1]) {
679         fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
680         *pErrorCode=U_PARSE_ERROR;
681         exit(U_PARSE_ERROR);
682     }
683 
684     /* get general category, field 2 */
685     i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
686     if(i>=0) {
687         p.gc=(uint8_t)i;
688     } else {
689         fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
690             fields[2][0], (unsigned long)p.code);
691         *pErrorCode=U_PARSE_ERROR;
692         exit(U_PARSE_ERROR);
693     }
694 
695     /* get canonical combining class, field 3 */
696     value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
697     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
698         fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
699         *pErrorCode=U_PARSE_ERROR;
700         exit(U_PARSE_ERROR);
701     }
702     p.cc=(uint8_t)value;
703 
704     /* get uppercase mapping, field 12 */
705     value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
706     if(end!=fields[12][1]) {
707         fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
708             (unsigned long)p.code);
709         *pErrorCode=U_PARSE_ERROR;
710         exit(U_PARSE_ERROR);
711     }
712     if(value!=0 && value!=p.code) {
713         p.upperCase=value;
714         uset_add(caseSensitive, p.code);
715         uset_add(caseSensitive, value);
716     }
717 
718     /* get lowercase value, field 13 */
719     value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
720     if(end!=fields[13][1]) {
721         fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
722             (unsigned long)p.code);
723         *pErrorCode=U_PARSE_ERROR;
724         exit(U_PARSE_ERROR);
725     }
726     if(value!=0 && value!=p.code) {
727         p.lowerCase=value;
728         uset_add(caseSensitive, p.code);
729         uset_add(caseSensitive, value);
730     }
731 
732     /* get titlecase value, field 14 */
733     value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
734     if(end!=fields[14][1]) {
735         fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
736             (unsigned long)p.code);
737         *pErrorCode=U_PARSE_ERROR;
738         exit(U_PARSE_ERROR);
739     }
740     if(value!=0 && value!=p.code) {
741         p.titleCase=value;
742         uset_add(caseSensitive, p.code);
743         uset_add(caseSensitive, value);
744     }
745 
746     /* set additional properties from previously parsed files */
747     if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
748         p.specialCasing=specialCasings+specialCasingIndex++;
749     } else {
750         p.specialCasing=NULL;
751     }
752     if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
753         p.caseFolding=caseFoldings+caseFoldingIndex++;
754 
755         /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
756         if( p.caseFolding->status=='C' &&
757             p.caseFolding->simple==p.lowerCase
758         ) {
759             p.caseFolding=NULL;
760         }
761     } else {
762         p.caseFolding=NULL;
763     }
764 
765     /* check for non-character code points */
766     if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
767         fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
768                 (unsigned long)p.code);
769         *pErrorCode=U_PARSE_ERROR;
770         exit(U_PARSE_ERROR);
771     }
772 
773     /* check that the code points (p.code) are in ascending order */
774     if(p.code<=prevCode && p.code>0) {
775         fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
776                 (unsigned long)p.code, (unsigned long)prevCode);
777         *pErrorCode=U_PARSE_ERROR;
778         exit(U_PARSE_ERROR);
779     }
780 
781     /* properties for a single code point */
782     setProps(&p);
783 
784     prevCode=p.code;
785 }
786 
787 static void
parseDB(const char * filename,UErrorCode * pErrorCode)788 parseDB(const char *filename, UErrorCode *pErrorCode) {
789     char *fields[15][2];
790     UChar32 start, end;
791     int32_t i;
792 
793     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
794         return;
795     }
796 
797     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
798 
799     /* are all sub-properties consumed? */
800     if(specialCasingIndex<specialCasingCount) {
801         fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
802         *pErrorCode=U_PARSE_ERROR;
803         exit(U_PARSE_ERROR);
804     }
805     if(caseFoldingIndex<caseFoldingCount) {
806         fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
807         *pErrorCode=U_PARSE_ERROR;
808         exit(U_PARSE_ERROR);
809     }
810 
811     if(U_FAILURE(*pErrorCode)) {
812         return;
813     }
814 
815     for(i=0;
816         0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
817         ++i
818     ) {
819         addCaseSensitive(start, end);
820     }
821     if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
822         *pErrorCode=U_ZERO_ERROR;
823     }
824 }
825 
826 /*
827  * Hey, Emacs, please set the following:
828  *
829  * Local Variables:
830  * indent-tabs-mode: nil
831  * End:
832  *
833  */
834