1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: gennorm2.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov25
16 * created by: Markus W. Scherer
17 *
18 * This program reads text files that define Unicode normalization,
19 * parses them, and builds a binary data file.
20 */
21
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
24
25 #include <fstream>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string>
29 #include <string.h>
30 #include "unicode/errorcode.h"
31 #include "unicode/localpointer.h"
32 #include "unicode/putil.h"
33 #include "unicode/uchar.h"
34 #include "unicode/unistr.h"
35 #include "charstr.h"
36 #include "normalizer2impl.h"
37 #include "toolutil.h"
38 #include "uoptions.h"
39 #include "uparse.h"
40
41 #if UCONFIG_NO_NORMALIZATION
42 #include "unewdata.h"
43 #endif
44
45 U_NAMESPACE_BEGIN
46
47 UBool beVerbose=FALSE, haveCopyright=TRUE;
48
49 #if !UCONFIG_NO_NORMALIZATION
50 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
51 #endif
52
53 /* -------------------------------------------------------------------------- */
54
// Indexes into the options[] table; the order here must stay in sync
// with the order of the entries in that table.
enum {
    HELP_H = 0,             // -h
    HELP_QUESTION_MARK,     // -?
    VERBOSE,                // -v / --verbose
    COPYRIGHT,              // -c / --copyright
    SOURCEDIR,              // -s / --sourcedir
    OUTPUT_FILENAME,        // -o / --output
    UNICODE_VERSION,        // -u / --unicode
    WRITE_C_SOURCE,         // --csource
    WRITE_COMBINED_DATA,    // --combined
    OPT_FAST                // --fast
};
67
68 static UOption options[]={
69 UOPTION_HELP_H,
70 UOPTION_HELP_QUESTION_MARK,
71 UOPTION_VERBOSE,
72 UOPTION_COPYRIGHT,
73 UOPTION_SOURCEDIR,
74 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
75 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
76 UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
77 UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
78 UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
79 };
80
81 extern "C" int
main(int argc,char * argv[])82 main(int argc, char* argv[]) {
83 U_MAIN_INIT_ARGS(argc, argv);
84
85 /* preset then read command line options */
86 options[SOURCEDIR].value="";
87 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
88
89 /* error handling, printing usage message */
90 if(argc<0) {
91 fprintf(stderr,
92 "error in command line argument \"%s\"\n",
93 argv[-argc]);
94 }
95 if(!options[OUTPUT_FILENAME].doesOccur) {
96 argc=-1;
97 }
98 if( argc<2 ||
99 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
100 ) {
101 fprintf(stderr,
102 "Usage: %s [-options] infiles+ -o outputfilename\n"
103 "\n"
104 "Reads the infiles with normalization data and\n"
105 "creates a binary file, or a C source file (--csource), with the data,\n"
106 "or writes a data file with the combined data (--combined).\n"
107 "See https://unicode-org.github.io/icu/userguide/transforms/normalization#data-file-syntax\n"
108 "\n"
109 "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
110 "\n"
111 "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
112 "in input-file syntax to the outputfilename.\n"
113 "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
114 "(Useful for computing minimal incremental mapping data files.)\n"
115 "\n",
116 argv[0], argv[0]);
117 fprintf(stderr,
118 "Options:\n"
119 "\t-h or -? or --help this usage text\n"
120 "\t-v or --verbose verbose output\n"
121 "\t-c or --copyright include a copyright notice\n"
122 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
123 fprintf(stderr,
124 "\t-s or --sourcedir source directory, followed by the path\n"
125 "\t-o or --output output filename\n"
126 "\t --csource writes a C source file with initializers\n"
127 "\t --combined writes a .txt file (input-file syntax) with the\n"
128 "\t combined data from all of the input files\n");
129 fprintf(stderr,
130 "\t --fast optimize the data for fast normalization,\n"
131 "\t which might increase its size (Writes fully decomposed\n"
132 "\t regular mappings instead of delta mappings.\n"
133 "\t You should measure the runtime speed to make sure that\n"
134 "\t this is a good trade-off.)\n");
135 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
136 }
137
138 beVerbose=options[VERBOSE].doesOccur;
139 haveCopyright=options[COPYRIGHT].doesOccur;
140
141 IcuToolErrorCode errorCode("gennorm2/main()");
142
143 #if UCONFIG_NO_NORMALIZATION
144
145 fprintf(stderr,
146 "gennorm2 writes a dummy binary data file "
147 "because UCONFIG_NO_NORMALIZATION is set, \n"
148 "see icu/source/common/unicode/uconfig.h\n");
149 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
150 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
151 // return U_UNSUPPORTED_ERROR;
152 return 0;
153
154 #else
155
156 LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
157 LocalPointer<Normalizer2DataBuilder> b2;
158 LocalPointer<Normalizer2DataBuilder> diff;
159 Normalizer2DataBuilder *builder = b1.getAlias();
160 errorCode.assertSuccess();
161
162 if(options[UNICODE_VERSION].doesOccur) {
163 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
164 }
165
166 if(options[OPT_FAST].doesOccur) {
167 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
168 }
169
170 // prepare the filename beginning with the source dir
171 CharString filename(options[SOURCEDIR].value, errorCode);
172 int32_t pathLength=filename.length();
173 if( pathLength>0 &&
174 filename[pathLength-1]!=U_FILE_SEP_CHAR &&
175 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
176 ) {
177 filename.append(U_FILE_SEP_CHAR, errorCode);
178 pathLength=filename.length();
179 }
180
181 bool doMinus = false;
182 for(int i=1; i<argc; ++i) {
183 printf("gennorm2: processing %s\n", argv[i]);
184 if(strcmp(argv[i], "minus") == 0) {
185 if(doMinus) {
186 fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
187 exit(U_ILLEGAL_ARGUMENT_ERROR);
188 }
189 // Data from previous input files has been collected in b1.
190 // Collect data from further input files in b2.
191 b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
192 diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
193 errorCode.assertSuccess();
194 builder = b2.getAlias();
195 if(options[UNICODE_VERSION].doesOccur) {
196 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
197 }
198 if(options[OPT_FAST].doesOccur) {
199 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
200 }
201 doMinus = true;
202 continue;
203 }
204 filename.append(argv[i], errorCode);
205 std::ifstream f(filename.data());
206 if(f.fail()) {
207 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
208 exit(U_FILE_ACCESS_ERROR);
209 }
210 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
211 parseFile(f, *builder);
212 filename.truncate(pathLength);
213 }
214
215 if(doMinus) {
216 Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
217 diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
218 } else if(options[WRITE_COMBINED_DATA].doesOccur) {
219 builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
220 } else if(options[WRITE_C_SOURCE].doesOccur) {
221 builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
222 } else {
223 builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
224 }
225
226 return errorCode.get();
227
228 #endif
229 }
230
231 #if !UCONFIG_NO_NORMALIZATION
232
parseFile(std::ifstream & f,Normalizer2DataBuilder & builder)233 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
234 IcuToolErrorCode errorCode("gennorm2/parseFile()");
235 std::string lineString;
236 uint32_t startCP, endCP;
237 while(std::getline(f, lineString)) {
238 if (lineString.empty()) {
239 continue; // skip empty lines.
240 }
241 #if (U_CPLUSPLUS_VERSION >= 11)
242 char *line = &lineString.front();
243 #else
244 char *line = &lineString.at(0);
245 #endif
246 char *comment=(char *)strchr(line, '#');
247 if(comment!=NULL) {
248 *comment=0;
249 }
250 u_rtrim(line);
251 if(line[0]==0) {
252 continue; // skip empty and comment-only lines
253 }
254 if(line[0]=='*') {
255 const char *s=u_skipWhitespace(line+1);
256 if(0==strncmp(s, "Unicode", 7)) {
257 s=u_skipWhitespace(s+7);
258 builder.setUnicodeVersion(s);
259 }
260 continue; // reserved syntax
261 }
262 const char *delimiter;
263 int32_t rangeLength=
264 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
265 if(errorCode.isFailure()) {
266 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
267 exit(errorCode.reset());
268 }
269 if (endCP >= 0xd800 && startCP <= 0xdfff) {
270 fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n",
271 line);
272 exit(U_ILLEGAL_ARGUMENT_ERROR);
273 }
274 delimiter=u_skipWhitespace(delimiter);
275 if(*delimiter==':') {
276 const char *s=u_skipWhitespace(delimiter+1);
277 char *end;
278 unsigned long value=strtoul(s, &end, 10);
279 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
280 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
281 exit(U_PARSE_ERROR);
282 }
283 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
284 builder.setCC(c, (uint8_t)value);
285 }
286 continue;
287 }
288 if(*delimiter=='-') {
289 if(*u_skipWhitespace(delimiter+1)!=0) {
290 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
291 exit(U_PARSE_ERROR);
292 }
293 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
294 builder.removeMapping(c);
295 }
296 continue;
297 }
298 if(*delimiter=='=' || *delimiter=='>') {
299 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
300 int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
301 if(errorCode.isFailure()) {
302 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
303 exit(errorCode.reset());
304 }
305 UnicodeString mapping(FALSE, uchars, length);
306 if(*delimiter=='=') {
307 if(rangeLength!=1) {
308 fprintf(stderr,
309 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
310 line);
311 exit(U_PARSE_ERROR);
312 }
313 builder.setRoundTripMapping((UChar32)startCP, mapping);
314 } else {
315 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
316 builder.setOneWayMapping(c, mapping);
317 }
318 }
319 continue;
320 }
321 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
322 exit(U_PARSE_ERROR);
323 }
324 }
325
326 #endif // !UCONFIG_NO_NORMALIZATION
327
328 U_NAMESPACE_END
329
330 /*
331 * Hey, Emacs, please set the following:
332 *
333 * Local Variables:
334 * indent-tabs-mode: nil
335 * End:
336 *
337 */
338