• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  gennorm2.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov25
16 *   created by: Markus W. Scherer
17 *
18 *   This program reads text files that define Unicode normalization,
19 *   parses them, and builds a binary data file.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
24 
25 #include <fstream>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string>
29 #include <string.h>
30 #include "unicode/errorcode.h"
31 #include "unicode/localpointer.h"
32 #include "unicode/putil.h"
33 #include "unicode/uchar.h"
34 #include "unicode/unistr.h"
35 #include "charstr.h"
36 #include "normalizer2impl.h"
37 #include "toolutil.h"
38 #include "uoptions.h"
39 #include "uparse.h"
40 
41 #if UCONFIG_NO_NORMALIZATION
42 #include "unewdata.h"
43 #endif
44 
45 U_NAMESPACE_BEGIN
46 
47 UBool beVerbose=false, haveCopyright=true;
48 
49 #if !UCONFIG_NO_NORMALIZATION
50 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
51 #endif
52 
53 /* -------------------------------------------------------------------------- */
54 
55 enum {
56     HELP_H,
57     HELP_QUESTION_MARK,
58     VERBOSE,
59     COPYRIGHT,
60     SOURCEDIR,
61     OUTPUT_FILENAME,
62     UNICODE_VERSION,
63     WRITE_C_SOURCE,
64     WRITE_COMBINED_DATA,
65     OPT_FAST
66 };
67 
68 static UOption options[]={
69     UOPTION_HELP_H,
70     UOPTION_HELP_QUESTION_MARK,
71     UOPTION_VERBOSE,
72     UOPTION_COPYRIGHT,
73     UOPTION_SOURCEDIR,
74     UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
75     UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
76     UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
77     UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
78     UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
79 };
80 
81 extern "C" int
main(int argc,char * argv[])82 main(int argc, char* argv[]) {
83     U_MAIN_INIT_ARGS(argc, argv);
84 
85     /* preset then read command line options */
86     options[SOURCEDIR].value="";
87     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
88 
89     /* error handling, printing usage message */
90     if(argc<0) {
91         fprintf(stderr,
92             "error in command line argument \"%s\"\n",
93             argv[-argc]);
94     }
95     if(!options[OUTPUT_FILENAME].doesOccur) {
96         argc=-1;
97     }
98     if( argc<2 ||
99         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
100     ) {
101         fprintf(stderr,
102             "Usage: %s [-options] infiles+ -o outputfilename\n"
103             "\n"
104             "Reads the infiles with normalization data and\n"
105             "creates a binary file, or a C source file (--csource), with the data,\n"
106             "or writes a data file with the combined data (--combined).\n"
107             "See https://unicode-org.github.io/icu/userguide/transforms/normalization#data-file-syntax\n"
108             "\n"
109             "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
110             "\n"
111             "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
112             "in input-file syntax to the outputfilename.\n"
113             "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
114             "(Useful for computing minimal incremental mapping data files.)\n"
115             "\n",
116             argv[0], argv[0]);
117         fprintf(stderr,
118             "Options:\n"
119             "\t-h or -? or --help  this usage text\n"
120             "\t-v or --verbose     verbose output\n"
121             "\t-c or --copyright   include a copyright notice\n"
122             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
123         fprintf(stderr,
124             "\t-s or --sourcedir   source directory, followed by the path\n"
125             "\t-o or --output      output filename\n"
126             "\t      --csource     writes a C source file with initializers\n"
127             "\t      --combined    writes a .txt file (input-file syntax) with the\n"
128             "\t                    combined data from all of the input files\n");
129         fprintf(stderr,
130             "\t      --fast        optimize the data for fast normalization,\n"
131             "\t                    which might increase its size  (Writes fully decomposed\n"
132             "\t                    regular mappings instead of delta mappings.\n"
133             "\t                    You should measure the runtime speed to make sure that\n"
134             "\t                    this is a good trade-off.)\n");
135         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
136     }
137 
138     beVerbose=options[VERBOSE].doesOccur;
139     haveCopyright=options[COPYRIGHT].doesOccur;
140 
141     IcuToolErrorCode errorCode("gennorm2/main()");
142 
143 #if UCONFIG_NO_NORMALIZATION
144 
145     fprintf(stderr,
146         "gennorm2 writes a dummy binary data file "
147         "because UCONFIG_NO_NORMALIZATION is set, \n"
148         "see icu/source/common/unicode/uconfig.h\n");
149     udata_createDummy(nullptr, nullptr, options[OUTPUT_FILENAME].value, errorCode);
150     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
151     // return U_UNSUPPORTED_ERROR;
152     return 0;
153 
154 #else
155 
156     LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
157     LocalPointer<Normalizer2DataBuilder> b2;
158     LocalPointer<Normalizer2DataBuilder> diff;
159     Normalizer2DataBuilder *builder = b1.getAlias();
160     errorCode.assertSuccess();
161 
162     if(options[UNICODE_VERSION].doesOccur) {
163         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
164     }
165 
166     if(options[OPT_FAST].doesOccur) {
167         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
168     }
169 
170     // prepare the filename beginning with the source dir
171     CharString filename(options[SOURCEDIR].value, errorCode);
172     int32_t pathLength=filename.length();
173     if( pathLength>0 &&
174         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
175         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
176     ) {
177         filename.append(U_FILE_SEP_CHAR, errorCode);
178         pathLength=filename.length();
179     }
180 
181     bool doMinus = false;
182     for(int i=1; i<argc; ++i) {
183         printf("gennorm2: processing %s\n", argv[i]);
184         if(strcmp(argv[i], "minus") == 0) {
185             if(doMinus) {
186                 fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
187                 exit(U_ILLEGAL_ARGUMENT_ERROR);
188             }
189             // Data from previous input files has been collected in b1.
190             // Collect data from further input files in b2.
191             b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
192             diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
193             errorCode.assertSuccess();
194             builder = b2.getAlias();
195             if(options[UNICODE_VERSION].doesOccur) {
196                 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
197             }
198             if(options[OPT_FAST].doesOccur) {
199                 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
200             }
201             doMinus = true;
202             continue;
203         }
204         filename.append(argv[i], errorCode);
205         std::ifstream f(filename.data());
206         if(f.fail()) {
207             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
208             exit(U_FILE_ACCESS_ERROR);
209         }
210         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
211         parseFile(f, *builder);
212         filename.truncate(pathLength);
213     }
214 
215     if(doMinus) {
216         Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
217         diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
218     } else if(options[WRITE_COMBINED_DATA].doesOccur) {
219         builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
220     } else if(options[WRITE_C_SOURCE].doesOccur) {
221         builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
222     } else {
223         builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
224     }
225 
226     return errorCode.get();
227 
228 #endif
229 }
230 
231 #if !UCONFIG_NO_NORMALIZATION
232 
parseFile(std::ifstream & f,Normalizer2DataBuilder & builder)233 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
234     IcuToolErrorCode errorCode("gennorm2/parseFile()");
235     std::string lineString;
236     uint32_t startCP, endCP;
237     while(std::getline(f, lineString)) {
238         if (lineString.empty()) {
239             continue;  // skip empty lines.
240         }
241         char *line = &lineString.front();
242         char *comment=(char *)strchr(line, '#');
243         if(comment!=nullptr) {
244             *comment=0;
245         }
246         u_rtrim(line);
247         if(line[0]==0) {
248             continue;  // skip empty and comment-only lines
249         }
250         if(line[0]=='*') {
251             const char *s=u_skipWhitespace(line+1);
252             if(0==strncmp(s, "Unicode", 7)) {
253                 s=u_skipWhitespace(s+7);
254                 builder.setUnicodeVersion(s);
255             }
256             continue;  // reserved syntax
257         }
258         const char *delimiter;
259         int32_t rangeLength=
260             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
261         if(errorCode.isFailure()) {
262             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
263             exit(errorCode.reset());
264         }
265         if (endCP >= 0xd800 && startCP <= 0xdfff) {
266                 fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n",
267                         line);
268                 exit(U_ILLEGAL_ARGUMENT_ERROR);
269         }
270         delimiter=u_skipWhitespace(delimiter);
271         if(*delimiter==':') {
272             const char *s=u_skipWhitespace(delimiter+1);
273             char *end;
274             unsigned long value=strtoul(s, &end, 10);
275             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
276                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
277                 exit(U_PARSE_ERROR);
278             }
279             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
280                 builder.setCC(c, (uint8_t)value);
281             }
282             continue;
283         }
284         if(*delimiter=='-') {
285             if(*u_skipWhitespace(delimiter+1)!=0) {
286                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
287                 exit(U_PARSE_ERROR);
288             }
289             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
290                 builder.removeMapping(c);
291             }
292             continue;
293         }
294         if(*delimiter=='=' || *delimiter=='>') {
295             char16_t uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
296             int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), nullptr, errorCode);
297             if(errorCode.isFailure()) {
298                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
299                 exit(errorCode.reset());
300             }
301             UnicodeString mapping(false, uchars, length);
302             if(*delimiter=='=') {
303                 if(rangeLength!=1) {
304                     fprintf(stderr,
305                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
306                             line);
307                     exit(U_PARSE_ERROR);
308                 }
309                 builder.setRoundTripMapping((UChar32)startCP, mapping);
310             } else {
311                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
312                     builder.setOneWayMapping(c, mapping);
313                 }
314             }
315             continue;
316         }
317         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
318         exit(U_PARSE_ERROR);
319     }
320 }
321 
322 #endif // !UCONFIG_NO_NORMALIZATION
323 
324 U_NAMESPACE_END
325 
326 /*
327  * Hey, Emacs, please set the following:
328  *
329  * Local Variables:
330  * indent-tabs-mode: nil
331  * End:
332  *
333  */
334