• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2001-2005, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  gennorm.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2001may25
14 *   created by: Markus W. Scherer
15 *
16 *   This program reads the Unicode character database text file,
17 *   parses it, and extracts the data for normalization.
18 *   It then preprocesses it and writes a binary file for efficient use
19 *   in various Unicode text normalization processes.
20 */
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include "unicode/utypes.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
27 #include "unicode/putil.h"
28 #include "unicode/uclean.h"
29 #include "unicode/udata.h"
30 #include "unicode/uset.h"
31 #include "cmemory.h"
32 #include "cstring.h"
33 #include "unewdata.h"
34 #include "uoptions.h"
35 #include "uparse.h"
36 #include "unormimp.h"
37 
38 U_CDECL_BEGIN
39 #include "gennorm.h"
40 U_CDECL_END
41 
42 UBool beVerbose=FALSE, haveCopyright=TRUE;
43 
44 /* prototypes --------------------------------------------------------------- */
45 
46 static void
47 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);
48 
49 static void
50 parseDB(const char *filename, UErrorCode *pErrorCode);
51 
52 /* -------------------------------------------------------------------------- */
53 
54 enum {
55     HELP_H,
56     HELP_QUESTION_MARK,
57     VERBOSE,
58     COPYRIGHT,
59     DESTDIR,
60     SOURCEDIR,
61     UNICODE_VERSION,
62     ICUDATADIR,
63     CSOURCE,
64     STORE_FLAGS
65 };
66 
67 static UOption options[]={
68     UOPTION_HELP_H,
69     UOPTION_HELP_QUESTION_MARK,
70     UOPTION_VERBOSE,
71     UOPTION_COPYRIGHT,
72     UOPTION_DESTDIR,
73     UOPTION_SOURCEDIR,
74     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
75     UOPTION_ICUDATADIR,
76     UOPTION_DEF("csource", 'C', UOPT_NO_ARG),
77     UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG)
78 };
79 
80 extern int
main(int argc,char * argv[])81 main(int argc, char* argv[]) {
82 #if !UCONFIG_NO_NORMALIZATION
83     char filename[300];
84 #endif
85     const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
86     char *basename=NULL;
87     UErrorCode errorCode=U_ZERO_ERROR;
88 
89     U_MAIN_INIT_ARGS(argc, argv);
90 
91     /* preset then read command line options */
92     options[4].value=u_getDataDirectory();
93     options[5].value="";
94     options[6].value="3.0.0";
95     options[ICUDATADIR].value=u_getDataDirectory();
96     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
97 
98     /* error handling, printing usage message */
99     if(argc<0) {
100         fprintf(stderr,
101             "error in command line argument \"%s\"\n",
102             argv[-argc]);
103     }
104     if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
105         /*
106          * Broken into chucks because the C89 standard says the minimum
107          * required supported string length is 509 bytes.
108          */
109         fprintf(stderr,
110             "Usage: %s [-options] [suffix]\n"
111             "\n"
112             "Read the UnicodeData.txt file and other Unicode properties files and\n"
113             "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
114             "\n",
115             argv[0]);
116         fprintf(stderr,
117             "Options:\n"
118             "\t-h or -? or --help  this usage text\n"
119             "\t-v or --verbose     verbose output\n"
120             "\t-c or --copyright   include a copyright notice\n"
121             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
122             "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
123         fprintf(stderr,
124             "\t-p or --prune flags Prune for data modularization:\n"
125             "\t                    Determine what data is to be stored.\n"
126             "\t        0 (zero) stores minimal data (only for NFD)\n"
127             "\t        lowercase letters turn off data, uppercase turn on (use with 0)\n");
128         fprintf(stderr,
129             "\t        k: compatibility decompositions (NFKC, NFKD)\n"
130             "\t        c: composition data (NFC, NFKC)\n"
131             "\t        f: FCD data (will be generated at load time)\n"
132             "\t        a: auxiliary data (canonical closure etc.)\n"
133             "\t        x: exclusion sets (Unicode 3.2-level normalization)\n");
134         fprintf(stderr,
135             "\t-d or --destdir     destination directory, followed by the path\n"
136             "\t-s or --sourcedir   source directory, followed by the path\n"
137             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
138             "\t                    followed by path, defaults to <%s>\n"
139             "\tsuffix              suffix that is to be appended with a '-'\n"
140             "\t                    to the source file basenames before opening;\n"
141             "\t                    'gennorm new' will read UnicodeData-new.txt etc.\n",
142             u_getDataDirectory());
143         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
144     }
145 
146     /* get the options values */
147     beVerbose=options[2].doesOccur;
148     haveCopyright=options[3].doesOccur;
149     srcDir=options[5].value;
150     destDir=options[4].value;
151 
152     if(argc>=2) {
153         suffix=argv[1];
154     } else {
155         suffix=NULL;
156     }
157 
158 #if UCONFIG_NO_NORMALIZATION
159 
160     fprintf(stderr,
161         "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
162         " because UCONFIG_NO_NORMALIZATION is set, \n"
163         "see icu/source/common/unicode/uconfig.h\n");
164     generateData(destDir, options[CSOURCE].doesOccur);
165 
166 #else
167 
168     setUnicodeVersion(options[6].value);
169 
170     if (options[ICUDATADIR].doesOccur) {
171         u_setDataDirectory(options[ICUDATADIR].value);
172     }
173 
174     if(options[STORE_FLAGS].doesOccur) {
175         const char *s=options[STORE_FLAGS].value;
176         char c;
177 
178         while((c=*s++)!=0) {
179             switch(c) {
180             case '0':
181                 gStoreFlags=0;  /* store minimal data (only for NFD) */
182                 break;
183 
184             /* lowercase letters: omit data */
185             case 'k':
186                 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPAT);
187                 break;
188             case 'c':
189                 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPOSITION);
190                 break;
191             case 'f':
192                 gStoreFlags&=~U_MASK(UGENNORM_STORE_FCD);
193                 break;
194             case 'a':
195                 gStoreFlags&=~U_MASK(UGENNORM_STORE_AUX);
196                 break;
197             case 'x':
198                 gStoreFlags&=~U_MASK(UGENNORM_STORE_EXCLUSIONS);
199                 break;
200 
201             /* uppercase letters: include data (use with 0) */
202             case 'K':
203                 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPAT);
204                 break;
205             case 'C':
206                 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPOSITION);
207                 break;
208             case 'F':
209                 gStoreFlags|=U_MASK(UGENNORM_STORE_FCD);
210                 break;
211             case 'A':
212                 gStoreFlags|=U_MASK(UGENNORM_STORE_AUX);
213                 break;
214             case 'X':
215                 gStoreFlags|=U_MASK(UGENNORM_STORE_EXCLUSIONS);
216                 break;
217 
218             default:
219                 fprintf(stderr, "ignoring undefined prune flag '%c'\n", c);
220                 break;
221             }
222         }
223     }
224 
225     /*
226      * Verify that we can work with properties
227      * but don't call u_init() because that needs unorm.icu which we are just
228      * going to build here.
229      */
230     {
231         U_STRING_DECL(ideo, "[:Ideographic:]", 15);
232         USet *set;
233 
234         U_STRING_INIT(ideo, "[:Ideographic:]", 15);
235         set=uset_openPattern(ideo, -1, &errorCode);
236         if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) {
237             fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
238             exit(errorCode);
239         }
240         uset_close(set);
241     }
242 
243     /* prepare the filename beginning with the source dir */
244     uprv_strcpy(filename, srcDir);
245     basename=filename+uprv_strlen(filename);
246     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
247         *basename++=U_FILE_SEP_CHAR;
248     }
249 
250     /* initialize */
251     init();
252 
253     /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
254     if(suffix==NULL) {
255         uprv_strcpy(basename, "DerivedNormalizationProps.txt");
256     } else {
257         uprv_strcpy(basename, "DerivedNormalizationProps");
258         basename[30]='-';
259         uprv_strcpy(basename+31, suffix);
260         uprv_strcat(basename+31, ".txt");
261     }
262     parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
263     if(U_FAILURE(errorCode)) {
264         /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
265         if(suffix==NULL) {
266             uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
267         } else {
268             uprv_strcpy(basename, "DerivedNormalizationProperties");
269             basename[30]='-';
270             uprv_strcpy(basename+31, suffix);
271             uprv_strcat(basename+31, ".txt");
272         }
273         parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
274     }
275 
276     /* process UnicodeData.txt */
277     if(suffix==NULL) {
278         uprv_strcpy(basename, "UnicodeData.txt");
279     } else {
280         uprv_strcpy(basename, "UnicodeData");
281         basename[11]='-';
282         uprv_strcpy(basename+12, suffix);
283         uprv_strcat(basename+12, ".txt");
284     }
285     parseDB(filename, &errorCode);
286 
287     /* process parsed data */
288     if(U_SUCCESS(errorCode)) {
289         processData();
290 
291         /* write the properties data file */
292         generateData(destDir, options[CSOURCE].doesOccur);
293 
294         cleanUpData();
295     }
296 
297 #endif
298 
299     return errorCode;
300 }
301 
302 #if !UCONFIG_NO_NORMALIZATION
303 
304 /* parser for DerivedNormalizationProperties.txt ---------------------------- */
305 
306 static void U_CALLCONV
derivedNormalizationPropertiesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)307 derivedNormalizationPropertiesLineFn(void *context,
308                                      char *fields[][2], int32_t fieldCount,
309                                      UErrorCode *pErrorCode) {
310     UChar string[32];
311     char *s;
312     uint32_t start, end;
313     int32_t count;
314     uint8_t qcFlags;
315 
316     /* get code point range */
317     count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
318     if(U_FAILURE(*pErrorCode)) {
319         fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
320         exit(*pErrorCode);
321     }
322 
323     /* ignore hangul - handle explicitly */
324     if(start==0xac00) {
325         return;
326     }
327 
328     /* get property - ignore unrecognized ones */
329     s=(char *)u_skipWhitespace(fields[1][0]);
330     if(*s=='N' && s[1]=='F') {
331         /* quick check flag */
332         qcFlags=0x11;
333         s+=2;
334         if(*s=='K') {
335             qcFlags<<=1;
336             ++s;
337         }
338 
339         if(*s=='C' && s[1]=='_') {
340             s+=2;
341         } else if(*s=='D' && s[1]=='_') {
342             qcFlags<<=2;
343             s+=2;
344         } else {
345             return;
346         }
347 
348         if(0==uprv_strncmp(s, "NO", 2)) {
349             qcFlags&=0xf;
350         } else if(0==uprv_strncmp(s, "MAYBE", 5)) {
351             qcFlags&=0x30;
352         } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
353             /*
354              * Unicode 4.0.1:
355              * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
356              */
357             /* start of the field */
358             s=(char *)u_skipWhitespace(s+1);
359             if(*s=='N') {
360                 qcFlags&=0xf;
361             } else if(*s=='M') {
362                 qcFlags&=0x30;
363             } else {
364                 return; /* do nothing for "Yes" because it's the default value */
365             }
366         } else {
367             return; /* do nothing for "Yes" because it's the default value */
368         }
369 
370         /* set this flag for all code points in this range */
371         while(start<=end) {
372             setQCFlags(start++, qcFlags);
373         }
374     } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
375         /* full composition exclusion */
376         while(start<=end) {
377             setCompositionExclusion(start++);
378         }
379     } else if(
380         ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') ||
381         (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';'))
382 
383     ) {
384         /* FC_NFKC_Closure, parse field 2 to get the string */
385         char *t;
386 
387         /* start of the field */
388         s=(char *)u_skipWhitespace(s+1);
389 
390         /* find the end of the field */
391         for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
392         *t=0;
393 
394         string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
395         if(U_FAILURE(*pErrorCode)) {
396             fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
397             exit(*pErrorCode);
398         }
399         while(start<=end) {
400             setFNC(start++, string);
401         }
402     }
403 }
404 
405 static void
parseDerivedNormalizationProperties(const char * filename,UErrorCode * pErrorCode,UBool reportError)406 parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
407     char *fields[2][2];
408 
409     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
410         return;
411     }
412 
413     u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
414     if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
415         fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
416         exit(*pErrorCode);
417     }
418 }
419 
420 /* parser for UnicodeData.txt ----------------------------------------------- */
421 
422 static void U_CALLCONV
unicodeDataLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)423 unicodeDataLineFn(void *context,
424                   char *fields[][2], int32_t fieldCount,
425                   UErrorCode *pErrorCode) {
426     uint32_t decomp[40];
427     Norm norm;
428     const char *s;
429     char *end;
430     uint32_t code, value;
431     int32_t length;
432     UBool isCompat, something=FALSE;
433 
434     /* ignore First and Last entries for ranges */
435     if( *fields[1][0]=='<' &&
436         (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
437         (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
438     ) {
439         return;
440     }
441 
442     /* reset the properties */
443     uprv_memset(&norm, 0, sizeof(Norm));
444 
445     /*
446      * The combiningIndex must not be initialized to 0 because 0 is the
447      * combiningIndex of the first forward-combining character.
448      */
449     norm.combiningIndex=0xffff;
450 
451     /* get the character code, field 0 */
452     code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
453     if(end<=fields[0][0] || end!=fields[0][1]) {
454         fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
455         *pErrorCode=U_PARSE_ERROR;
456         exit(U_PARSE_ERROR);
457     }
458 
459     /* get canonical combining class, field 3 */
460     value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
461     if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
462         fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
463         *pErrorCode=U_PARSE_ERROR;
464         exit(U_PARSE_ERROR);
465     }
466     if(value>0) {
467         norm.udataCC=(uint8_t)value;
468         something=TRUE;
469     }
470 
471     /* get the decomposition, field 5 */
472     if(fields[5][0]<fields[5][1]) {
473         if(*(s=fields[5][0])=='<') {
474             ++s;
475             isCompat=TRUE;
476 
477             /* skip and ignore the compatibility type name */
478             do {
479                 if(s==fields[5][1]) {
480                     /* missing '>' */
481                     fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
482                     *pErrorCode=U_PARSE_ERROR;
483                     exit(U_PARSE_ERROR);
484                 }
485             } while(*s++!='>');
486         } else {
487             isCompat=FALSE;
488         }
489 
490         /* parse the decomposition string */
491         length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
492         if(U_FAILURE(*pErrorCode)) {
493             fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
494                     (long)code, u_errorName(*pErrorCode));
495             exit(*pErrorCode);
496         }
497 
498         /* store the string */
499         if(length>0) {
500             something=TRUE;
501             if(isCompat) {
502                 norm.lenNFKD=(uint8_t)length;
503                 norm.nfkd=decomp;
504             } else {
505                 if(length>2) {
506                     fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
507                             (long)code, (long)length);
508                     *pErrorCode=U_PARSE_ERROR;
509                     exit(U_PARSE_ERROR);
510                 }
511                 norm.lenNFD=(uint8_t)length;
512                 norm.nfd=decomp;
513             }
514         }
515     }
516 
517     /* check for non-character code points */
518     if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
519         fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
520                 (long)code);
521         *pErrorCode=U_PARSE_ERROR;
522         exit(U_PARSE_ERROR);
523     }
524 
525     if(something) {
526         /* there are normalization values, so store them */
527 #if 0
528         if(beVerbose) {
529             printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
530                    (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
531         }
532 #endif
533         storeNorm(code, &norm);
534     }
535 }
536 
537 static void
parseDB(const char * filename,UErrorCode * pErrorCode)538 parseDB(const char *filename, UErrorCode *pErrorCode) {
539     char *fields[15][2];
540 
541     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
542         return;
543     }
544 
545     u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
546     if(U_FAILURE(*pErrorCode)) {
547         fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
548         exit(*pErrorCode);
549     }
550 }
551 
552 #endif /* #if !UCONFIG_NO_NORMALIZATION */
553 
554 /*
555  * Hey, Emacs, please set the following:
556  *
557  * Local Variables:
558  * indent-tabs-mode: nil
559  * End:
560  *
561  */
562