1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: gennorm2.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009nov25
14 * created by: Markus W. Scherer
15 *
16 * This program reads text files that define Unicode normalization,
17 * parses them, and builds a binary data file.
18 */
19
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include "unicode/errorcode.h"
27 #include "unicode/localpointer.h"
28 #include "unicode/putil.h"
29 #include "unicode/uchar.h"
30 #include "unicode/unistr.h"
31 #include "charstr.h"
32 #include "normalizer2impl.h"
33 #include "toolutil.h"
34 #include "uoptions.h"
35 #include "uparse.h"
36
37 #if UCONFIG_NO_NORMALIZATION
38 #include "unewdata.h"
39 #endif
40
41 U_NAMESPACE_BEGIN
42
43 UBool beVerbose=FALSE, haveCopyright=TRUE;
44
45 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
46
47 #if !UCONFIG_NO_NORMALIZATION
48 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
49 #endif
50
51 /* -------------------------------------------------------------------------- */
52
// Indexes into options[]. The enumerators must stay in the same order as the
// initializers of that array.
enum {
    HELP_H,             // -h
    HELP_QUESTION_MARK, // -?
    VERBOSE,            // -v / --verbose
    COPYRIGHT,          // -c / --copyright
    SOURCEDIR,          // -s / --sourcedir
    OUTPUT_FILENAME,    // -o / --output
    UNICODE_VERSION,    // -u / --unicode
    WRITE_C_SOURCE,     // --csource
    OPT_FAST            // --fast
};
64
65 static UOption options[]={
66 UOPTION_HELP_H,
67 UOPTION_HELP_QUESTION_MARK,
68 UOPTION_VERBOSE,
69 UOPTION_COPYRIGHT,
70 UOPTION_SOURCEDIR,
71 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
72 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
73 UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
74 UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
75 };
76
77 extern "C" int
main(int argc,char * argv[])78 main(int argc, char* argv[]) {
79 U_MAIN_INIT_ARGS(argc, argv);
80
81 /* preset then read command line options */
82 options[SOURCEDIR].value="";
83 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
84
85 /* error handling, printing usage message */
86 if(argc<0) {
87 fprintf(stderr,
88 "error in command line argument \"%s\"\n",
89 argv[-argc]);
90 }
91 if(!options[OUTPUT_FILENAME].doesOccur) {
92 argc=-1;
93 }
94 if( argc<2 ||
95 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
96 ) {
97 /*
98 * Broken into chunks because the C89 standard says the minimum
99 * required supported string length is 509 bytes.
100 */
101 fprintf(stderr,
102 "Usage: %s [-options] infiles+ -o outputfilename\n"
103 "\n"
104 "Reads the infiles with normalization data and\n"
105 "creates a binary or C source file (outputfilename) with the data.\n"
106 "\n",
107 argv[0]);
108 fprintf(stderr,
109 "Options:\n"
110 "\t-h or -? or --help this usage text\n"
111 "\t-v or --verbose verbose output\n"
112 "\t-c or --copyright include a copyright notice\n"
113 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
114 fprintf(stderr,
115 "\t-s or --sourcedir source directory, followed by the path\n"
116 "\t-o or --output output filename\n"
117 "\t --csource writes a C source file with initializers\n");
118 fprintf(stderr,
119 "\t --fast optimize the data for fast normalization,\n"
120 "\t which might increase its size (Writes fully decomposed\n"
121 "\t regular mappings instead of delta mappings.\n"
122 "\t You should measure the runtime speed to make sure that\n"
123 "\t this is a good trade-off.)\n");
124 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
125 }
126
127 beVerbose=options[VERBOSE].doesOccur;
128 haveCopyright=options[COPYRIGHT].doesOccur;
129
130 IcuToolErrorCode errorCode("gennorm2/main()");
131
132 #if UCONFIG_NO_NORMALIZATION
133
134 fprintf(stderr,
135 "gennorm2 writes a dummy binary data file "
136 "because UCONFIG_NO_NORMALIZATION is set, \n"
137 "see icu/source/common/unicode/uconfig.h\n");
138 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
139 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
140 // return U_UNSUPPORTED_ERROR;
141 return 0;
142
143 #else
144
145 LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
146 errorCode.assertSuccess();
147
148 if(options[UNICODE_VERSION].doesOccur) {
149 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
150 }
151
152 if(options[OPT_FAST].doesOccur) {
153 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
154 }
155
156 // prepare the filename beginning with the source dir
157 CharString filename(options[SOURCEDIR].value, errorCode);
158 int32_t pathLength=filename.length();
159 if( pathLength>0 &&
160 filename[pathLength-1]!=U_FILE_SEP_CHAR &&
161 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
162 ) {
163 filename.append(U_FILE_SEP_CHAR, errorCode);
164 pathLength=filename.length();
165 }
166
167 for(int i=1; i<argc; ++i) {
168 printf("gennorm2: processing %s\n", argv[i]);
169 filename.append(argv[i], errorCode);
170 LocalStdioFilePointer f(fopen(filename.data(), "r"));
171 if(f==NULL) {
172 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
173 exit(U_FILE_ACCESS_ERROR);
174 }
175 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
176 parseFile(f.getAlias(), *builder);
177 filename.truncate(pathLength);
178 }
179
180 if(options[WRITE_C_SOURCE].doesOccur) {
181 builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
182 } else {
183 builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
184 }
185
186 return errorCode.get();
187
188 #endif
189 }
190
191 #if !UCONFIG_NO_NORMALIZATION
192
parseFile(FILE * f,Normalizer2DataBuilder & builder)193 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
194 IcuToolErrorCode errorCode("gennorm2/parseFile()");
195 char line[300];
196 uint32_t startCP, endCP;
197 while(NULL!=fgets(line, (int)sizeof(line), f)) {
198 char *comment=(char *)strchr(line, '#');
199 if(comment!=NULL) {
200 *comment=0;
201 }
202 u_rtrim(line);
203 if(line[0]==0) {
204 continue; // skip empty and comment-only lines
205 }
206 if(line[0]=='*') {
207 const char *s=u_skipWhitespace(line+1);
208 if(0==strncmp(s, "Unicode", 7)) {
209 s=u_skipWhitespace(s+7);
210 builder.setUnicodeVersion(s);
211 }
212 continue; // reserved syntax
213 }
214 const char *delimiter;
215 int32_t rangeLength=
216 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
217 if(errorCode.isFailure()) {
218 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
219 exit(errorCode.reset());
220 }
221 delimiter=u_skipWhitespace(delimiter);
222 if(*delimiter==':') {
223 const char *s=u_skipWhitespace(delimiter+1);
224 char *end;
225 unsigned long value=strtoul(s, &end, 10);
226 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
227 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
228 exit(U_PARSE_ERROR);
229 }
230 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
231 builder.setCC(c, (uint8_t)value);
232 }
233 continue;
234 }
235 if(*delimiter=='-') {
236 if(*u_skipWhitespace(delimiter+1)!=0) {
237 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
238 exit(U_PARSE_ERROR);
239 }
240 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
241 builder.removeMapping(c);
242 }
243 continue;
244 }
245 if(*delimiter=='=' || *delimiter=='>') {
246 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
247 int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
248 if(errorCode.isFailure()) {
249 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
250 exit(errorCode.reset());
251 }
252 UnicodeString mapping(FALSE, uchars, length);
253 if(*delimiter=='=') {
254 if(rangeLength!=1) {
255 fprintf(stderr,
256 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
257 line);
258 exit(U_PARSE_ERROR);
259 }
260 builder.setRoundTripMapping((UChar32)startCP, mapping);
261 } else {
262 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
263 builder.setOneWayMapping(c, mapping);
264 }
265 }
266 continue;
267 }
268 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
269 exit(U_PARSE_ERROR);
270 }
271 }
272
273 #endif // !UCONFIG_NO_NORMALIZATION
274
275 U_NAMESPACE_END
276
277 /*
278 * Hey, Emacs, please set the following:
279 *
280 * Local Variables:
281 * indent-tabs-mode: nil
282 * End:
283 *
284 */
285