• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2009-2014, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  gennorm2.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009nov25
14 *   created by: Markus W. Scherer
15 *
16 *   This program reads text files that define Unicode normalization,
17 *   parses them, and builds a binary data file.
18 */
19 
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include "unicode/errorcode.h"
27 #include "unicode/localpointer.h"
28 #include "unicode/putil.h"
29 #include "unicode/uchar.h"
30 #include "unicode/unistr.h"
31 #include "charstr.h"
32 #include "normalizer2impl.h"
33 #include "toolutil.h"
34 #include "uoptions.h"
35 #include "uparse.h"
36 
37 #if UCONFIG_NO_NORMALIZATION
38 #include "unewdata.h"
39 #endif
40 
41 U_NAMESPACE_BEGIN
42 
43 UBool beVerbose=FALSE, haveCopyright=TRUE;
44 
45 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
46 
47 #if !UCONFIG_NO_NORMALIZATION
48 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
49 #endif
50 
51 /* -------------------------------------------------------------------------- */
52 
/*
 * Indexes into the options[] table below.
 * The enumerators must stay in the same order as the table entries.
 */
enum {
    HELP_H=0,             /* -h */
    HELP_QUESTION_MARK,   /* -? */
    VERBOSE,              /* -v / --verbose */
    COPYRIGHT,            /* -c / --copyright */
    SOURCEDIR,            /* -s / --sourcedir */
    OUTPUT_FILENAME,      /* -o / --output */
    UNICODE_VERSION,      /* -u / --unicode */
    WRITE_C_SOURCE,       /* --csource */
    OPT_FAST              /* --fast */
};
64 
65 static UOption options[]={
66     UOPTION_HELP_H,
67     UOPTION_HELP_QUESTION_MARK,
68     UOPTION_VERBOSE,
69     UOPTION_COPYRIGHT,
70     UOPTION_SOURCEDIR,
71     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
72     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
73     UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
74     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
75 };
76 
77 extern "C" int
main(int argc,char * argv[])78 main(int argc, char* argv[]) {
79     U_MAIN_INIT_ARGS(argc, argv);
80 
81     /* preset then read command line options */
82     options[SOURCEDIR].value="";
83     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
84 
85     /* error handling, printing usage message */
86     if(argc<0) {
87         fprintf(stderr,
88             "error in command line argument \"%s\"\n",
89             argv[-argc]);
90     }
91     if(!options[OUTPUT_FILENAME].doesOccur) {
92         argc=-1;
93     }
94     if( argc<2 ||
95         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
96     ) {
97         /*
98          * Broken into chunks because the C89 standard says the minimum
99          * required supported string length is 509 bytes.
100          */
101         fprintf(stderr,
102             "Usage: %s [-options] infiles+ -o outputfilename\n"
103             "\n"
104             "Reads the infiles with normalization data and\n"
105             "creates a binary or C source file (outputfilename) with the data.\n"
106             "\n",
107             argv[0]);
108         fprintf(stderr,
109             "Options:\n"
110             "\t-h or -? or --help  this usage text\n"
111             "\t-v or --verbose     verbose output\n"
112             "\t-c or --copyright   include a copyright notice\n"
113             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
114         fprintf(stderr,
115             "\t-s or --sourcedir   source directory, followed by the path\n"
116             "\t-o or --output      output filename\n"
117             "\t      --csource     writes a C source file with initializers\n");
118         fprintf(stderr,
119             "\t      --fast        optimize the data for fast normalization,\n"
120             "\t                    which might increase its size  (Writes fully decomposed\n"
121             "\t                    regular mappings instead of delta mappings.\n"
122             "\t                    You should measure the runtime speed to make sure that\n"
123             "\t                    this is a good trade-off.)\n");
124         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
125     }
126 
127     beVerbose=options[VERBOSE].doesOccur;
128     haveCopyright=options[COPYRIGHT].doesOccur;
129 
130     IcuToolErrorCode errorCode("gennorm2/main()");
131 
132 #if UCONFIG_NO_NORMALIZATION
133 
134     fprintf(stderr,
135         "gennorm2 writes a dummy binary data file "
136         "because UCONFIG_NO_NORMALIZATION is set, \n"
137         "see icu/source/common/unicode/uconfig.h\n");
138     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
139     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
140     // return U_UNSUPPORTED_ERROR;
141     return 0;
142 
143 #else
144 
145     LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
146     errorCode.assertSuccess();
147 
148     if(options[UNICODE_VERSION].doesOccur) {
149         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
150     }
151 
152     if(options[OPT_FAST].doesOccur) {
153         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
154     }
155 
156     // prepare the filename beginning with the source dir
157     CharString filename(options[SOURCEDIR].value, errorCode);
158     int32_t pathLength=filename.length();
159     if( pathLength>0 &&
160         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
161         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
162     ) {
163         filename.append(U_FILE_SEP_CHAR, errorCode);
164         pathLength=filename.length();
165     }
166 
167     for(int i=1; i<argc; ++i) {
168         printf("gennorm2: processing %s\n", argv[i]);
169         filename.append(argv[i], errorCode);
170         LocalStdioFilePointer f(fopen(filename.data(), "r"));
171         if(f==NULL) {
172             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
173             exit(U_FILE_ACCESS_ERROR);
174         }
175         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
176         parseFile(f.getAlias(), *builder);
177         filename.truncate(pathLength);
178     }
179 
180     if(options[WRITE_C_SOURCE].doesOccur) {
181         builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
182     } else {
183         builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
184     }
185 
186     return errorCode.get();
187 
188 #endif
189 }
190 
191 #if !UCONFIG_NO_NORMALIZATION
192 
parseFile(FILE * f,Normalizer2DataBuilder & builder)193 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
194     IcuToolErrorCode errorCode("gennorm2/parseFile()");
195     char line[300];
196     uint32_t startCP, endCP;
197     while(NULL!=fgets(line, (int)sizeof(line), f)) {
198         char *comment=(char *)strchr(line, '#');
199         if(comment!=NULL) {
200             *comment=0;
201         }
202         u_rtrim(line);
203         if(line[0]==0) {
204             continue;  // skip empty and comment-only lines
205         }
206         if(line[0]=='*') {
207             const char *s=u_skipWhitespace(line+1);
208             if(0==strncmp(s, "Unicode", 7)) {
209                 s=u_skipWhitespace(s+7);
210                 builder.setUnicodeVersion(s);
211             }
212             continue;  // reserved syntax
213         }
214         const char *delimiter;
215         int32_t rangeLength=
216             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
217         if(errorCode.isFailure()) {
218             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
219             exit(errorCode.reset());
220         }
221         delimiter=u_skipWhitespace(delimiter);
222         if(*delimiter==':') {
223             const char *s=u_skipWhitespace(delimiter+1);
224             char *end;
225             unsigned long value=strtoul(s, &end, 10);
226             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
227                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
228                 exit(U_PARSE_ERROR);
229             }
230             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
231                 builder.setCC(c, (uint8_t)value);
232             }
233             continue;
234         }
235         if(*delimiter=='-') {
236             if(*u_skipWhitespace(delimiter+1)!=0) {
237                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
238                 exit(U_PARSE_ERROR);
239             }
240             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
241                 builder.removeMapping(c);
242             }
243             continue;
244         }
245         if(*delimiter=='=' || *delimiter=='>') {
246             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
247             int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
248             if(errorCode.isFailure()) {
249                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
250                 exit(errorCode.reset());
251             }
252             UnicodeString mapping(FALSE, uchars, length);
253             if(*delimiter=='=') {
254                 if(rangeLength!=1) {
255                     fprintf(stderr,
256                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
257                             line);
258                     exit(U_PARSE_ERROR);
259                 }
260                 builder.setRoundTripMapping((UChar32)startCP, mapping);
261             } else {
262                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
263                     builder.setOneWayMapping(c, mapping);
264                 }
265             }
266             continue;
267         }
268         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
269         exit(U_PARSE_ERROR);
270     }
271 }
272 
273 #endif // !UCONFIG_NO_NORMALIZATION
274 
275 U_NAMESPACE_END
276 
277 /*
278  * Hey, Emacs, please set the following:
279  *
280  * Local Variables:
281  * indent-tabs-mode: nil
282  * End:
283  *
284  */
285