• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  gennorm2.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov25
16 *   created by: Markus W. Scherer
17 *
18 *   This program reads text files that define Unicode normalization,
19 *   parses them, and builds a binary data file.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
24 
25 #include <fstream>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string>
29 #include <string.h>
30 #include "unicode/errorcode.h"
31 #include "unicode/localpointer.h"
32 #include "unicode/putil.h"
33 #include "unicode/uchar.h"
34 #include "unicode/unistr.h"
35 #include "charstr.h"
36 #include "normalizer2impl.h"
37 #include "toolutil.h"
38 #include "uoptions.h"
39 #include "uparse.h"
40 
41 #if UCONFIG_NO_NORMALIZATION
42 #include "unewdata.h"
43 #endif
44 
45 U_NAMESPACE_BEGIN
46 
47 UBool beVerbose=false, haveCopyright=true;
48 
49 #if !UCONFIG_NO_NORMALIZATION
50 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
51 #endif
52 
53 /* -------------------------------------------------------------------------- */
54 
// Indexes into options[]. The order of these constants must stay in sync
// with the order of the initializers in options[].
enum {
    HELP_H,               // -h / --help
    HELP_QUESTION_MARK,   // -?
    VERBOSE,              // -v / --verbose
    COPYRIGHT,            // -c / --copyright
    SOURCEDIR,            // -s / --sourcedir
    OUTPUT_FILENAME,      // -o / --output
    UNICODE_VERSION,      // -u / --unicode
    WRITE_C_SOURCE,       // --csource
    WRITE_COMBINED_DATA,  // --combined
    OPT_FAST              // --fast
};
67 
68 static UOption options[]={
69     UOPTION_HELP_H,
70     UOPTION_HELP_QUESTION_MARK,
71     UOPTION_VERBOSE,
72     UOPTION_COPYRIGHT,
73     UOPTION_SOURCEDIR,
74     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
75     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
76     UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
77     UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
78     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
79 };
80 
81 extern "C" int
main(int argc,char * argv[])82 main(int argc, char* argv[]) {
83     U_MAIN_INIT_ARGS(argc, argv);
84 
85     /* preset then read command line options */
86     options[SOURCEDIR].value="";
87     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
88 
89     /* error handling, printing usage message */
90     if(argc<0) {
91         fprintf(stderr,
92             "error in command line argument \"%s\"\n",
93             argv[-argc]);
94     }
95     if(!options[OUTPUT_FILENAME].doesOccur) {
96         argc=-1;
97     }
98     if( argc<2 ||
99         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
100     ) {
101         fprintf(stderr,
102             "Usage: %s [-options] infiles+ -o outputfilename\n"
103             "\n"
104             "Reads the infiles with normalization data and\n"
105             "creates a binary file, or a C source file (--csource), with the data,\n"
106             "or writes a data file with the combined data (--combined).\n"
107             "See https://unicode-org.github.io/icu/userguide/transforms/normalization#data-file-syntax\n"
108             "\n"
109             "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
110             "\n"
111             "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
112             "in input-file syntax to the outputfilename.\n"
113             "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
114             "(Useful for computing minimal incremental mapping data files.)\n"
115             "\n",
116             argv[0], argv[0]);
117         fprintf(stderr,
118             "Options:\n"
119             "\t-h or -? or --help  this usage text\n"
120             "\t-v or --verbose     verbose output\n"
121             "\t-c or --copyright   include a copyright notice\n"
122             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
123         fprintf(stderr,
124             "\t-s or --sourcedir   source directory, followed by the path\n"
125             "\t-o or --output      output filename\n"
126             "\t      --csource     writes a C source file with initializers\n"
127             "\t      --combined    writes a .txt file (input-file syntax) with the\n"
128             "\t                    combined data from all of the input files\n");
129         fprintf(stderr,
130             "\t      --fast        optimize the data for fast normalization,\n"
131             "\t                    which might increase its size  (Writes fully decomposed\n"
132             "\t                    regular mappings instead of delta mappings.\n"
133             "\t                    You should measure the runtime speed to make sure that\n"
134             "\t                    this is a good trade-off.)\n");
135         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
136     }
137 
138     beVerbose=options[VERBOSE].doesOccur;
139     haveCopyright=options[COPYRIGHT].doesOccur;
140 
141     IcuToolErrorCode errorCode("gennorm2/main()");
142 
143 #if UCONFIG_NO_NORMALIZATION
144 
145     fprintf(stderr,
146         "gennorm2 writes a dummy binary data file "
147         "because UCONFIG_NO_NORMALIZATION is set, \n"
148         "see icu/source/common/unicode/uconfig.h\n");
149     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
150     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
151     // return U_UNSUPPORTED_ERROR;
152     return 0;
153 
154 #else
155 
156     LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
157     LocalPointer<Normalizer2DataBuilder> b2;
158     LocalPointer<Normalizer2DataBuilder> diff;
159     Normalizer2DataBuilder *builder = b1.getAlias();
160     errorCode.assertSuccess();
161 
162     if(options[UNICODE_VERSION].doesOccur) {
163         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
164     }
165 
166     if(options[OPT_FAST].doesOccur) {
167         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
168     }
169 
170     // prepare the filename beginning with the source dir
171     CharString filename(options[SOURCEDIR].value, errorCode);
172     int32_t pathLength=filename.length();
173     if( pathLength>0 &&
174         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
175         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
176     ) {
177         filename.append(U_FILE_SEP_CHAR, errorCode);
178         pathLength=filename.length();
179     }
180 
181     bool doMinus = false;
182     for(int i=1; i<argc; ++i) {
183         printf("gennorm2: processing %s\n", argv[i]);
184         if(strcmp(argv[i], "minus") == 0) {
185             if(doMinus) {
186                 fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
187                 exit(U_ILLEGAL_ARGUMENT_ERROR);
188             }
189             // Data from previous input files has been collected in b1.
190             // Collect data from further input files in b2.
191             b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
192             diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
193             errorCode.assertSuccess();
194             builder = b2.getAlias();
195             if(options[UNICODE_VERSION].doesOccur) {
196                 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
197             }
198             if(options[OPT_FAST].doesOccur) {
199                 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
200             }
201             doMinus = true;
202             continue;
203         }
204         filename.append(argv[i], errorCode);
205         std::ifstream f(filename.data());
206         if(f.fail()) {
207             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
208             exit(U_FILE_ACCESS_ERROR);
209         }
210         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
211         parseFile(f, *builder);
212         filename.truncate(pathLength);
213     }
214 
215     if(doMinus) {
216         Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
217         diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
218     } else if(options[WRITE_COMBINED_DATA].doesOccur) {
219         builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
220     } else if(options[WRITE_C_SOURCE].doesOccur) {
221         builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
222     } else {
223         builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
224     }
225 
226     return errorCode.get();
227 
228 #endif
229 }
230 
231 #if !UCONFIG_NO_NORMALIZATION
232 
parseFile(std::ifstream & f,Normalizer2DataBuilder & builder)233 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
234     IcuToolErrorCode errorCode("gennorm2/parseFile()");
235     std::string lineString;
236     uint32_t startCP, endCP;
237     while(std::getline(f, lineString)) {
238         if (lineString.empty()) {
239             continue;  // skip empty lines.
240         }
241 #if (U_CPLUSPLUS_VERSION >= 11)
242         char *line = &lineString.front();
243 #else
244         char *line = &lineString.at(0);
245 #endif
246         char *comment=(char *)strchr(line, '#');
247         if(comment!=NULL) {
248             *comment=0;
249         }
250         u_rtrim(line);
251         if(line[0]==0) {
252             continue;  // skip empty and comment-only lines
253         }
254         if(line[0]=='*') {
255             const char *s=u_skipWhitespace(line+1);
256             if(0==strncmp(s, "Unicode", 7)) {
257                 s=u_skipWhitespace(s+7);
258                 builder.setUnicodeVersion(s);
259             }
260             continue;  // reserved syntax
261         }
262         const char *delimiter;
263         int32_t rangeLength=
264             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
265         if(errorCode.isFailure()) {
266             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
267             exit(errorCode.reset());
268         }
269         if (endCP >= 0xd800 && startCP <= 0xdfff) {
270                 fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n",
271                         line);
272                 exit(U_ILLEGAL_ARGUMENT_ERROR);
273         }
274         delimiter=u_skipWhitespace(delimiter);
275         if(*delimiter==':') {
276             const char *s=u_skipWhitespace(delimiter+1);
277             char *end;
278             unsigned long value=strtoul(s, &end, 10);
279             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
280                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
281                 exit(U_PARSE_ERROR);
282             }
283             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
284                 builder.setCC(c, (uint8_t)value);
285             }
286             continue;
287         }
288         if(*delimiter=='-') {
289             if(*u_skipWhitespace(delimiter+1)!=0) {
290                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
291                 exit(U_PARSE_ERROR);
292             }
293             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
294                 builder.removeMapping(c);
295             }
296             continue;
297         }
298         if(*delimiter=='=' || *delimiter=='>') {
299             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
300             int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
301             if(errorCode.isFailure()) {
302                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
303                 exit(errorCode.reset());
304             }
305             UnicodeString mapping(false, uchars, length);
306             if(*delimiter=='=') {
307                 if(rangeLength!=1) {
308                     fprintf(stderr,
309                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
310                             line);
311                     exit(U_PARSE_ERROR);
312                 }
313                 builder.setRoundTripMapping((UChar32)startCP, mapping);
314             } else {
315                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
316                     builder.setOneWayMapping(c, mapping);
317                 }
318             }
319             continue;
320         }
321         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
322         exit(U_PARSE_ERROR);
323     }
324 }
325 
326 #endif // !UCONFIG_NO_NORMALIZATION
327 
328 U_NAMESPACE_END
329 
330 /*
331  * Hey, Emacs, please set the following:
332  *
333  * Local Variables:
334  * indent-tabs-mode: nil
335  * End:
336  *
337  */
338