• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2009-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  gennorm2.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009nov25
14 *   created by: Markus W. Scherer
15 *
16 *   This program reads text files that define Unicode normalization,
17 *   parses them, and builds a binary data file.
18 */
19 
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include "unicode/errorcode.h"
27 #include "unicode/localpointer.h"
28 #include "unicode/putil.h"
29 #include "unicode/uchar.h"
30 #include "unicode/unistr.h"
31 #include "charstr.h"
32 #include "normalizer2impl.h"
33 #include "toolutil.h"
34 #include "uoptions.h"
35 #include "uparse.h"
36 
37 #if UCONFIG_NO_NORMALIZATION
38 #include "unewdata.h"
39 #endif
40 
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot bind
 * differently depending on the surrounding expression.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
42 
43 U_NAMESPACE_BEGIN
44 
45 UBool beVerbose=FALSE, haveCopyright=TRUE;
46 
47 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
48 
49 #if !UCONFIG_NO_NORMALIZATION
50 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
51 #endif
52 
53 /* -------------------------------------------------------------------------- */
54 
/*
 * Indexes into the options[] table.
 * The values are spelled out because each one must equal the position
 * of the corresponding entry in options[].
 */
enum {
    HELP_H=0,
    HELP_QUESTION_MARK=1,
    VERBOSE=2,
    COPYRIGHT=3,
    SOURCEDIR=4,
    OUTPUT_FILENAME=5,
    UNICODE_VERSION=6,
    OPT_FAST=7
};
65 
66 static UOption options[]={
67     UOPTION_HELP_H,
68     UOPTION_HELP_QUESTION_MARK,
69     UOPTION_VERBOSE,
70     UOPTION_COPYRIGHT,
71     UOPTION_SOURCEDIR,
72     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
73     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
74     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
75 };
76 
77 extern "C" int
main(int argc,char * argv[])78 main(int argc, char* argv[]) {
79     U_MAIN_INIT_ARGS(argc, argv);
80 
81     /* preset then read command line options */
82     options[SOURCEDIR].value="";
83     options[UNICODE_VERSION].value=U_UNICODE_VERSION;
84     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
85 
86     /* error handling, printing usage message */
87     if(argc<0) {
88         fprintf(stderr,
89             "error in command line argument \"%s\"\n",
90             argv[-argc]);
91     }
92     if(!options[OUTPUT_FILENAME].doesOccur) {
93         argc=-1;
94     }
95     if( argc<2 ||
96         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
97     ) {
98         /*
99          * Broken into chunks because the C89 standard says the minimum
100          * required supported string length is 509 bytes.
101          */
102         fprintf(stderr,
103             "Usage: %s [-options] infiles+ -o outputfilename\n"
104             "\n"
105             "Reads the infiles with normalization data and\n"
106             "creates a binary file (outputfilename) with the data.\n"
107             "\n",
108             argv[0]);
109         fprintf(stderr,
110             "Options:\n"
111             "\t-h or -? or --help  this usage text\n"
112             "\t-v or --verbose     verbose output\n"
113             "\t-c or --copyright   include a copyright notice\n"
114             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
115         fprintf(stderr,
116             "\t-s or --sourcedir   source directory, followed by the path\n"
117             "\t-o or --output      output filename\n");
118         fprintf(stderr,
119             "\t      --fast        optimize the .nrm file for fast normalization,\n"
120             "\t                    which might increase its size  (Writes fully decomposed\n"
121             "\t                    regular mappings instead of delta mappings.\n"
122             "\t                    You should measure the runtime speed to make sure that\n"
123             "\t                    this is a good trade-off.)\n");
124         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
125     }
126 
127     beVerbose=options[VERBOSE].doesOccur;
128     haveCopyright=options[COPYRIGHT].doesOccur;
129 
130     IcuToolErrorCode errorCode("gennorm2/main()");
131 
132 #if UCONFIG_NO_NORMALIZATION
133 
134     fprintf(stderr,
135         "gennorm2 writes a dummy binary data file "
136         "because UCONFIG_NO_NORMALIZATION is set, \n"
137         "see icu/source/common/unicode/uconfig.h\n");
138     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
139     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
140     // return U_UNSUPPORTED_ERROR;
141     return 0;
142 
143 #else
144 
145     LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
146     errorCode.assertSuccess();
147 
148     builder->setUnicodeVersion(options[UNICODE_VERSION].value);
149 
150     if(options[OPT_FAST].doesOccur) {
151         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
152     }
153 
154     // prepare the filename beginning with the source dir
155     CharString filename(options[SOURCEDIR].value, errorCode);
156     int32_t pathLength=filename.length();
157     if( pathLength>0 &&
158         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
159         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
160     ) {
161         filename.append(U_FILE_SEP_CHAR, errorCode);
162         pathLength=filename.length();
163     }
164 
165     for(int i=1; i<argc; ++i) {
166         printf("gennorm2: processing %s\n", argv[i]);
167         filename.append(argv[i], errorCode);
168         LocalStdioFilePointer f(fopen(filename.data(), "r"));
169         if(f==NULL) {
170             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
171             exit(U_FILE_ACCESS_ERROR);
172         }
173         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
174         parseFile(f.getAlias(), *builder);
175         filename.truncate(pathLength);
176     }
177 
178     builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
179 
180     return errorCode.get();
181 
182 #endif
183 }
184 
185 #if !UCONFIG_NO_NORMALIZATION
186 
parseFile(FILE * f,Normalizer2DataBuilder & builder)187 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
188     IcuToolErrorCode errorCode("gennorm2/parseFile()");
189     char line[300];
190     uint32_t startCP, endCP;
191     while(NULL!=fgets(line, (int)sizeof(line), f)) {
192         char *comment=(char *)strchr(line, '#');
193         if(comment!=NULL) {
194             *comment=0;
195         }
196         u_rtrim(line);
197         if(line[0]==0) {
198             continue;  // skip empty and comment-only lines
199         }
200         if(line[0]=='*') {
201             continue;  // reserved syntax
202         }
203         const char *delimiter;
204         int32_t rangeLength=
205             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
206         if(errorCode.isFailure()) {
207             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
208             exit(errorCode.reset());
209         }
210         delimiter=u_skipWhitespace(delimiter);
211         if(*delimiter==':') {
212             const char *s=u_skipWhitespace(delimiter+1);
213             char *end;
214             unsigned long value=strtoul(s, &end, 10);
215             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
216                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
217                 exit(U_PARSE_ERROR);
218             }
219             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
220                 builder.setCC(c, (uint8_t)value);
221             }
222             continue;
223         }
224         if(*delimiter=='-') {
225             if(*u_skipWhitespace(delimiter+1)!=0) {
226                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
227                 exit(U_PARSE_ERROR);
228             }
229             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
230                 builder.removeMapping(c);
231             }
232             continue;
233         }
234         if(*delimiter=='=' || *delimiter=='>') {
235             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
236             int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
237             if(errorCode.isFailure()) {
238                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
239                 exit(errorCode.reset());
240             }
241             UnicodeString mapping(FALSE, uchars, length);
242             if(*delimiter=='=') {
243                 if(rangeLength!=1) {
244                     fprintf(stderr,
245                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
246                             line);
247                     exit(U_PARSE_ERROR);
248                 }
249                 builder.setRoundTripMapping((UChar32)startCP, mapping);
250             } else {
251                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
252                     builder.setOneWayMapping(c, mapping);
253                 }
254             }
255             continue;
256         }
257         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
258         exit(U_PARSE_ERROR);
259     }
260 }
261 
262 #endif // !UCONFIG_NO_NORMALIZATION
263 
264 U_NAMESPACE_END
265 
266 /*
267  * Hey, Emacs, please set the following:
268  *
269  * Local Variables:
270  * indent-tabs-mode: nil
271  * End:
272  *
273  */
274