1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: gennorm2.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009nov25
14 * created by: Markus W. Scherer
15 *
16 * This program reads text files that define Unicode normalization,
17 * parses them, and builds a binary data file.
18 */
19
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include "unicode/errorcode.h"
27 #include "unicode/localpointer.h"
28 #include "unicode/putil.h"
29 #include "unicode/uchar.h"
30 #include "unicode/unistr.h"
31 #include "charstr.h"
32 #include "normalizer2impl.h"
33 #include "toolutil.h"
34 #include "uoptions.h"
35 #include "uparse.h"
36
37 #if UCONFIG_NO_NORMALIZATION
38 #include "unewdata.h"
39 #endif
40
// Number of elements of a true array (not a pointer), as an int32_t.
// The whole expansion is parenthesized so that the macro behaves as a single
// primary expression wherever it is used (e.g. as the operand of sizeof or
// next to postfix operators).
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
42
43 U_NAMESPACE_BEGIN
44
45 UBool beVerbose=FALSE, haveCopyright=TRUE;
46
47 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
48
49 #if !UCONFIG_NO_NORMALIZATION
50 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
51 #endif
52
53 /* -------------------------------------------------------------------------- */
54
// Indexes into options[]. main() looks options up by these constants, so the
// values must follow the order of the initializers of options[].
enum {
    HELP_H=0,
    HELP_QUESTION_MARK=1,
    VERBOSE=2,
    COPYRIGHT=3,
    SOURCEDIR=4,
    OUTPUT_FILENAME=5,
    UNICODE_VERSION=6,
    OPT_FAST=7
};
65
66 static UOption options[]={
67 UOPTION_HELP_H,
68 UOPTION_HELP_QUESTION_MARK,
69 UOPTION_VERBOSE,
70 UOPTION_COPYRIGHT,
71 UOPTION_SOURCEDIR,
72 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
73 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
74 UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
75 };
76
77 extern "C" int
main(int argc,char * argv[])78 main(int argc, char* argv[]) {
79 U_MAIN_INIT_ARGS(argc, argv);
80
81 /* preset then read command line options */
82 options[SOURCEDIR].value="";
83 options[UNICODE_VERSION].value=U_UNICODE_VERSION;
84 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
85
86 /* error handling, printing usage message */
87 if(argc<0) {
88 fprintf(stderr,
89 "error in command line argument \"%s\"\n",
90 argv[-argc]);
91 }
92 if(!options[OUTPUT_FILENAME].doesOccur) {
93 argc=-1;
94 }
95 if( argc<2 ||
96 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
97 ) {
98 /*
99 * Broken into chunks because the C89 standard says the minimum
100 * required supported string length is 509 bytes.
101 */
102 fprintf(stderr,
103 "Usage: %s [-options] infiles+ -o outputfilename\n"
104 "\n"
105 "Reads the infiles with normalization data and\n"
106 "creates a binary file (outputfilename) with the data.\n"
107 "\n",
108 argv[0]);
109 fprintf(stderr,
110 "Options:\n"
111 "\t-h or -? or --help this usage text\n"
112 "\t-v or --verbose verbose output\n"
113 "\t-c or --copyright include a copyright notice\n"
114 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
115 fprintf(stderr,
116 "\t-s or --sourcedir source directory, followed by the path\n"
117 "\t-o or --output output filename\n");
118 fprintf(stderr,
119 "\t --fast optimize the .nrm file for fast normalization,\n"
120 "\t which might increase its size (Writes fully decomposed\n"
121 "\t regular mappings instead of delta mappings.\n"
122 "\t You should measure the runtime speed to make sure that\n"
123 "\t this is a good trade-off.)\n");
124 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
125 }
126
127 beVerbose=options[VERBOSE].doesOccur;
128 haveCopyright=options[COPYRIGHT].doesOccur;
129
130 IcuToolErrorCode errorCode("gennorm2/main()");
131
132 #if UCONFIG_NO_NORMALIZATION
133
134 fprintf(stderr,
135 "gennorm2 writes a dummy binary data file "
136 "because UCONFIG_NO_NORMALIZATION is set, \n"
137 "see icu/source/common/unicode/uconfig.h\n");
138 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
139 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
140 // return U_UNSUPPORTED_ERROR;
141 return 0;
142
143 #else
144
145 LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
146 errorCode.assertSuccess();
147
148 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
149
150 if(options[OPT_FAST].doesOccur) {
151 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
152 }
153
154 // prepare the filename beginning with the source dir
155 CharString filename(options[SOURCEDIR].value, errorCode);
156 int32_t pathLength=filename.length();
157 if( pathLength>0 &&
158 filename[pathLength-1]!=U_FILE_SEP_CHAR &&
159 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
160 ) {
161 filename.append(U_FILE_SEP_CHAR, errorCode);
162 pathLength=filename.length();
163 }
164
165 for(int i=1; i<argc; ++i) {
166 printf("gennorm2: processing %s\n", argv[i]);
167 filename.append(argv[i], errorCode);
168 LocalStdioFilePointer f(fopen(filename.data(), "r"));
169 if(f==NULL) {
170 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
171 exit(U_FILE_ACCESS_ERROR);
172 }
173 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
174 parseFile(f.getAlias(), *builder);
175 filename.truncate(pathLength);
176 }
177
178 builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
179
180 return errorCode.get();
181
182 #endif
183 }
184
185 #if !UCONFIG_NO_NORMALIZATION
186
parseFile(FILE * f,Normalizer2DataBuilder & builder)187 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
188 IcuToolErrorCode errorCode("gennorm2/parseFile()");
189 char line[300];
190 uint32_t startCP, endCP;
191 while(NULL!=fgets(line, (int)sizeof(line), f)) {
192 char *comment=(char *)strchr(line, '#');
193 if(comment!=NULL) {
194 *comment=0;
195 }
196 u_rtrim(line);
197 if(line[0]==0) {
198 continue; // skip empty and comment-only lines
199 }
200 if(line[0]=='*') {
201 continue; // reserved syntax
202 }
203 const char *delimiter;
204 int32_t rangeLength=
205 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
206 if(errorCode.isFailure()) {
207 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
208 exit(errorCode.reset());
209 }
210 delimiter=u_skipWhitespace(delimiter);
211 if(*delimiter==':') {
212 const char *s=u_skipWhitespace(delimiter+1);
213 char *end;
214 unsigned long value=strtoul(s, &end, 10);
215 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
216 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
217 exit(U_PARSE_ERROR);
218 }
219 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
220 builder.setCC(c, (uint8_t)value);
221 }
222 continue;
223 }
224 if(*delimiter=='-') {
225 if(*u_skipWhitespace(delimiter+1)!=0) {
226 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
227 exit(U_PARSE_ERROR);
228 }
229 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
230 builder.removeMapping(c);
231 }
232 continue;
233 }
234 if(*delimiter=='=' || *delimiter=='>') {
235 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
236 int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
237 if(errorCode.isFailure()) {
238 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
239 exit(errorCode.reset());
240 }
241 UnicodeString mapping(FALSE, uchars, length);
242 if(*delimiter=='=') {
243 if(rangeLength!=1) {
244 fprintf(stderr,
245 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
246 line);
247 exit(U_PARSE_ERROR);
248 }
249 builder.setRoundTripMapping((UChar32)startCP, mapping);
250 } else {
251 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
252 builder.setOneWayMapping(c, mapping);
253 }
254 }
255 continue;
256 }
257 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
258 exit(U_PARSE_ERROR);
259 }
260 }
261
262 #endif // !UCONFIG_NO_NORMALIZATION
263
264 U_NAMESPACE_END
265
266 /*
267 * Hey, Emacs, please set the following:
268 *
269 * Local Variables:
270 * indent-tabs-mode: nil
271 * End:
272 *
273 */
274