• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2009-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  gennorm2.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009nov25
14 *   created by: Markus W. Scherer
15 *
16 *   This program reads text files that define Unicode normalization,
17 *   parses them, and builds a binary data file.
18 */
19 
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include "unicode/errorcode.h"
27 #include "unicode/localpointer.h"
28 #include "unicode/putil.h"
29 #include "unicode/uchar.h"
30 #include "unicode/unistr.h"
31 #include "charstr.h"
32 #include "normalizer2impl.h"
33 #include "toolutil.h"
34 #include "uoptions.h"
35 #include "uparse.h"
36 
37 #if UCONFIG_NO_NORMALIZATION
38 #include "unewdata.h"
39 #endif
40 
// Number of elements of a true array (not a pointer), as an int32_t.
// The whole expansion is parenthesized so that the cast cannot combine
// unexpectedly with operators around the macro invocation.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
42 
43 U_NAMESPACE_BEGIN
44 
45 UBool beVerbose=FALSE, haveCopyright=TRUE;
46 
47 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
48 
49 #if !UCONFIG_NO_NORMALIZATION
50 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
51 #endif
52 
53 /* -------------------------------------------------------------------------- */
54 
// Indexes into the options[] array below.
// The enumerators are used as positional indexes, so their order
// has to stay in sync with the order of the options[] initializers.
enum {
    HELP_H=0,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    SOURCEDIR,
    OUTPUT_FILENAME,
    UNICODE_VERSION,
    OPT_FAST
};
65 
66 static UOption options[]={
67     UOPTION_HELP_H,
68     UOPTION_HELP_QUESTION_MARK,
69     UOPTION_VERBOSE,
70     UOPTION_COPYRIGHT,
71     UOPTION_SOURCEDIR,
72     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
73     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
74     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
75 };
76 
77 extern "C" int
main(int argc,char * argv[])78 main(int argc, char* argv[]) {
79     U_MAIN_INIT_ARGS(argc, argv);
80 
81     /* preset then read command line options */
82     options[SOURCEDIR].value="";
83     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
84 
85     /* error handling, printing usage message */
86     if(argc<0) {
87         fprintf(stderr,
88             "error in command line argument \"%s\"\n",
89             argv[-argc]);
90     }
91     if(!options[OUTPUT_FILENAME].doesOccur) {
92         argc=-1;
93     }
94     if( argc<2 ||
95         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
96     ) {
97         /*
98          * Broken into chunks because the C89 standard says the minimum
99          * required supported string length is 509 bytes.
100          */
101         fprintf(stderr,
102             "Usage: %s [-options] infiles+ -o outputfilename\n"
103             "\n"
104             "Reads the infiles with normalization data and\n"
105             "creates a binary file (outputfilename) with the data.\n"
106             "\n",
107             argv[0]);
108         fprintf(stderr,
109             "Options:\n"
110             "\t-h or -? or --help  this usage text\n"
111             "\t-v or --verbose     verbose output\n"
112             "\t-c or --copyright   include a copyright notice\n"
113             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
114         fprintf(stderr,
115             "\t-s or --sourcedir   source directory, followed by the path\n"
116             "\t-o or --output      output filename\n");
117         fprintf(stderr,
118             "\t      --fast        optimize the .nrm file for fast normalization,\n"
119             "\t                    which might increase its size  (Writes fully decomposed\n"
120             "\t                    regular mappings instead of delta mappings.\n"
121             "\t                    You should measure the runtime speed to make sure that\n"
122             "\t                    this is a good trade-off.)\n");
123         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
124     }
125 
126     beVerbose=options[VERBOSE].doesOccur;
127     haveCopyright=options[COPYRIGHT].doesOccur;
128 
129     IcuToolErrorCode errorCode("gennorm2/main()");
130 
131 #if UCONFIG_NO_NORMALIZATION
132 
133     fprintf(stderr,
134         "gennorm2 writes a dummy binary data file "
135         "because UCONFIG_NO_NORMALIZATION is set, \n"
136         "see icu/source/common/unicode/uconfig.h\n");
137     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
138     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
139     // return U_UNSUPPORTED_ERROR;
140     return 0;
141 
142 #else
143 
144     LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
145     errorCode.assertSuccess();
146 
147     if(options[UNICODE_VERSION].doesOccur) {
148         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
149     }
150 
151     if(options[OPT_FAST].doesOccur) {
152         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
153     }
154 
155     // prepare the filename beginning with the source dir
156     CharString filename(options[SOURCEDIR].value, errorCode);
157     int32_t pathLength=filename.length();
158     if( pathLength>0 &&
159         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
160         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
161     ) {
162         filename.append(U_FILE_SEP_CHAR, errorCode);
163         pathLength=filename.length();
164     }
165 
166     for(int i=1; i<argc; ++i) {
167         printf("gennorm2: processing %s\n", argv[i]);
168         filename.append(argv[i], errorCode);
169         LocalStdioFilePointer f(fopen(filename.data(), "r"));
170         if(f==NULL) {
171             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
172             exit(U_FILE_ACCESS_ERROR);
173         }
174         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
175         parseFile(f.getAlias(), *builder);
176         filename.truncate(pathLength);
177     }
178 
179     builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
180 
181     return errorCode.get();
182 
183 #endif
184 }
185 
186 #if !UCONFIG_NO_NORMALIZATION
187 
parseFile(FILE * f,Normalizer2DataBuilder & builder)188 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
189     IcuToolErrorCode errorCode("gennorm2/parseFile()");
190     char line[300];
191     uint32_t startCP, endCP;
192     while(NULL!=fgets(line, (int)sizeof(line), f)) {
193         char *comment=(char *)strchr(line, '#');
194         if(comment!=NULL) {
195             *comment=0;
196         }
197         u_rtrim(line);
198         if(line[0]==0) {
199             continue;  // skip empty and comment-only lines
200         }
201         if(line[0]=='*') {
202             const char *s=u_skipWhitespace(line+1);
203             if(0==strncmp(s, "Unicode", 7)) {
204                 s=u_skipWhitespace(s+7);
205                 builder.setUnicodeVersion(s);
206             }
207             continue;  // reserved syntax
208         }
209         const char *delimiter;
210         int32_t rangeLength=
211             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
212         if(errorCode.isFailure()) {
213             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
214             exit(errorCode.reset());
215         }
216         delimiter=u_skipWhitespace(delimiter);
217         if(*delimiter==':') {
218             const char *s=u_skipWhitespace(delimiter+1);
219             char *end;
220             unsigned long value=strtoul(s, &end, 10);
221             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
222                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
223                 exit(U_PARSE_ERROR);
224             }
225             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
226                 builder.setCC(c, (uint8_t)value);
227             }
228             continue;
229         }
230         if(*delimiter=='-') {
231             if(*u_skipWhitespace(delimiter+1)!=0) {
232                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
233                 exit(U_PARSE_ERROR);
234             }
235             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
236                 builder.removeMapping(c);
237             }
238             continue;
239         }
240         if(*delimiter=='=' || *delimiter=='>') {
241             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
242             int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
243             if(errorCode.isFailure()) {
244                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
245                 exit(errorCode.reset());
246             }
247             UnicodeString mapping(FALSE, uchars, length);
248             if(*delimiter=='=') {
249                 if(rangeLength!=1) {
250                     fprintf(stderr,
251                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
252                             line);
253                     exit(U_PARSE_ERROR);
254                 }
255                 builder.setRoundTripMapping((UChar32)startCP, mapping);
256             } else {
257                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
258                     builder.setOneWayMapping(c, mapping);
259                 }
260             }
261             continue;
262         }
263         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
264         exit(U_PARSE_ERROR);
265     }
266 }
267 
268 #endif // !UCONFIG_NO_NORMALIZATION
269 
270 U_NAMESPACE_END
271 
272 /*
273  * Hey, Emacs, please set the following:
274  *
275  * Local Variables:
276  * indent-tabs-mode: nil
277  * End:
278  *
279  */
280