1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: gennorm2.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov25
16 * created by: Markus W. Scherer
17 *
18 * This program reads text files that define Unicode normalization,
19 * parses them, and builds a binary data file.
20 */
21
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
24
25 #include <fstream>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string>
29 #include <string.h>
30 #include "unicode/errorcode.h"
31 #include "unicode/localpointer.h"
32 #include "unicode/putil.h"
33 #include "unicode/uchar.h"
34 #include "unicode/unistr.h"
35 #include "charstr.h"
36 #include "normalizer2impl.h"
37 #include "toolutil.h"
38 #include "uoptions.h"
39 #include "uparse.h"
40
41 #if UCONFIG_NO_NORMALIZATION
42 #include "unewdata.h"
43 #endif
44
45 U_NAMESPACE_BEGIN
46
47 UBool beVerbose=false, haveCopyright=true;
48
49 #if !UCONFIG_NO_NORMALIZATION
50 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
51 #endif
52
53 /* -------------------------------------------------------------------------- */
54
// Indexes into the options[] table; the order here must match the order of
// the entries in that table.
enum {
    HELP_H = 0,               // -h
    HELP_QUESTION_MARK = 1,   // -?
    VERBOSE = 2,
    COPYRIGHT = 3,
    SOURCEDIR = 4,
    OUTPUT_FILENAME = 5,      // --output / -o
    UNICODE_VERSION = 6,      // --unicode / -u
    WRITE_C_SOURCE = 7,       // --csource
    WRITE_COMBINED_DATA = 8,  // --combined
    OPT_FAST = 9              // --fast
};
67
68 static UOption options[]={
69 UOPTION_HELP_H,
70 UOPTION_HELP_QUESTION_MARK,
71 UOPTION_VERBOSE,
72 UOPTION_COPYRIGHT,
73 UOPTION_SOURCEDIR,
74 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
75 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
76 UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
77 UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
78 UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
79 };
80
81 extern "C" int
main(int argc,char * argv[])82 main(int argc, char* argv[]) {
83 U_MAIN_INIT_ARGS(argc, argv);
84
85 /* preset then read command line options */
86 options[SOURCEDIR].value="";
87 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
88
89 /* error handling, printing usage message */
90 if(argc<0) {
91 fprintf(stderr,
92 "error in command line argument \"%s\"\n",
93 argv[-argc]);
94 }
95 if(!options[OUTPUT_FILENAME].doesOccur) {
96 argc=-1;
97 }
98 if( argc<2 ||
99 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
100 ) {
101 fprintf(stderr,
102 "Usage: %s [-options] infiles+ -o outputfilename\n"
103 "\n"
104 "Reads the infiles with normalization data and\n"
105 "creates a binary file, or a C source file (--csource), with the data,\n"
106 "or writes a data file with the combined data (--combined).\n"
107 "See https://unicode-org.github.io/icu/userguide/transforms/normalization#data-file-syntax\n"
108 "\n"
109 "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
110 "\n"
111 "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
112 "in input-file syntax to the outputfilename.\n"
113 "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
114 "(Useful for computing minimal incremental mapping data files.)\n"
115 "\n",
116 argv[0], argv[0]);
117 fprintf(stderr,
118 "Options:\n"
119 "\t-h or -? or --help this usage text\n"
120 "\t-v or --verbose verbose output\n"
121 "\t-c or --copyright include a copyright notice\n"
122 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
123 fprintf(stderr,
124 "\t-s or --sourcedir source directory, followed by the path\n"
125 "\t-o or --output output filename\n"
126 "\t --csource writes a C source file with initializers\n"
127 "\t --combined writes a .txt file (input-file syntax) with the\n"
128 "\t combined data from all of the input files\n");
129 fprintf(stderr,
130 "\t --fast optimize the data for fast normalization,\n"
131 "\t which might increase its size (Writes fully decomposed\n"
132 "\t regular mappings instead of delta mappings.\n"
133 "\t You should measure the runtime speed to make sure that\n"
134 "\t this is a good trade-off.)\n");
135 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
136 }
137
138 beVerbose=options[VERBOSE].doesOccur;
139 haveCopyright=options[COPYRIGHT].doesOccur;
140
141 IcuToolErrorCode errorCode("gennorm2/main()");
142
143 #if UCONFIG_NO_NORMALIZATION
144
145 fprintf(stderr,
146 "gennorm2 writes a dummy binary data file "
147 "because UCONFIG_NO_NORMALIZATION is set, \n"
148 "see icu/source/common/unicode/uconfig.h\n");
149 udata_createDummy(nullptr, nullptr, options[OUTPUT_FILENAME].value, errorCode);
150 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
151 // return U_UNSUPPORTED_ERROR;
152 return 0;
153
154 #else
155
156 LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
157 LocalPointer<Normalizer2DataBuilder> b2;
158 LocalPointer<Normalizer2DataBuilder> diff;
159 Normalizer2DataBuilder *builder = b1.getAlias();
160 errorCode.assertSuccess();
161
162 if(options[UNICODE_VERSION].doesOccur) {
163 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
164 }
165
166 if(options[OPT_FAST].doesOccur) {
167 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
168 }
169
170 // prepare the filename beginning with the source dir
171 CharString filename(options[SOURCEDIR].value, errorCode);
172 int32_t pathLength=filename.length();
173 if( pathLength>0 &&
174 filename[pathLength-1]!=U_FILE_SEP_CHAR &&
175 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
176 ) {
177 filename.append(U_FILE_SEP_CHAR, errorCode);
178 pathLength=filename.length();
179 }
180
181 bool doMinus = false;
182 for(int i=1; i<argc; ++i) {
183 printf("gennorm2: processing %s\n", argv[i]);
184 if(strcmp(argv[i], "minus") == 0) {
185 if(doMinus) {
186 fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
187 exit(U_ILLEGAL_ARGUMENT_ERROR);
188 }
189 // Data from previous input files has been collected in b1.
190 // Collect data from further input files in b2.
191 b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
192 diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
193 errorCode.assertSuccess();
194 builder = b2.getAlias();
195 if(options[UNICODE_VERSION].doesOccur) {
196 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
197 }
198 if(options[OPT_FAST].doesOccur) {
199 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
200 }
201 doMinus = true;
202 continue;
203 }
204 filename.append(argv[i], errorCode);
205 std::ifstream f(filename.data());
206 if(f.fail()) {
207 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
208 exit(U_FILE_ACCESS_ERROR);
209 }
210 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
211 parseFile(f, *builder);
212 filename.truncate(pathLength);
213 }
214
215 if(doMinus) {
216 Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
217 diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
218 } else if(options[WRITE_COMBINED_DATA].doesOccur) {
219 builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
220 } else if(options[WRITE_C_SOURCE].doesOccur) {
221 builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
222 } else {
223 builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
224 }
225
226 return errorCode.get();
227
228 #endif
229 }
230
231 #if !UCONFIG_NO_NORMALIZATION
232
parseFile(std::ifstream & f,Normalizer2DataBuilder & builder)233 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
234 IcuToolErrorCode errorCode("gennorm2/parseFile()");
235 std::string lineString;
236 uint32_t startCP, endCP;
237 while(std::getline(f, lineString)) {
238 if (lineString.empty()) {
239 continue; // skip empty lines.
240 }
241 char *line = &lineString.front();
242 char *comment=(char *)strchr(line, '#');
243 if(comment!=nullptr) {
244 *comment=0;
245 }
246 u_rtrim(line);
247 if(line[0]==0) {
248 continue; // skip empty and comment-only lines
249 }
250 if(line[0]=='*') {
251 const char *s=u_skipWhitespace(line+1);
252 if(0==strncmp(s, "Unicode", 7)) {
253 s=u_skipWhitespace(s+7);
254 builder.setUnicodeVersion(s);
255 }
256 continue; // reserved syntax
257 }
258 const char *delimiter;
259 int32_t rangeLength=
260 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
261 if(errorCode.isFailure()) {
262 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
263 exit(errorCode.reset());
264 }
265 if (endCP >= 0xd800 && startCP <= 0xdfff) {
266 fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n",
267 line);
268 exit(U_ILLEGAL_ARGUMENT_ERROR);
269 }
270 delimiter=u_skipWhitespace(delimiter);
271 if(*delimiter==':') {
272 const char *s=u_skipWhitespace(delimiter+1);
273 char *end;
274 unsigned long value=strtoul(s, &end, 10);
275 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
276 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
277 exit(U_PARSE_ERROR);
278 }
279 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
280 builder.setCC(c, (uint8_t)value);
281 }
282 continue;
283 }
284 if(*delimiter=='-') {
285 if(*u_skipWhitespace(delimiter+1)!=0) {
286 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
287 exit(U_PARSE_ERROR);
288 }
289 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
290 builder.removeMapping(c);
291 }
292 continue;
293 }
294 if(*delimiter=='=' || *delimiter=='>') {
295 char16_t uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
296 int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), nullptr, errorCode);
297 if(errorCode.isFailure()) {
298 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
299 exit(errorCode.reset());
300 }
301 UnicodeString mapping(false, uchars, length);
302 if(*delimiter=='=') {
303 if(rangeLength!=1) {
304 fprintf(stderr,
305 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
306 line);
307 exit(U_PARSE_ERROR);
308 }
309 builder.setRoundTripMapping((UChar32)startCP, mapping);
310 } else {
311 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
312 builder.setOneWayMapping(c, mapping);
313 }
314 }
315 continue;
316 }
317 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
318 exit(U_PARSE_ERROR);
319 }
320 }
321
322 #endif // !UCONFIG_NO_NORMALIZATION
323
324 U_NAMESPACE_END
325
326 /*
327 * Hey, Emacs, please set the following:
328 *
329 * Local Variables:
330 * indent-tabs-mode: nil
331 * End:
332 *
333 */
334