1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2003-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: gensprep.c
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2003-02-06
16 * created by: Ram Viswanadha
17 *
18 * This program reads the Profile.txt files,
19 * parses them, and extracts the data for StringPrep profile.
20 * It then preprocesses it and writes a binary file for efficient use
21 * in various StringPrep conversion processes.
22 */
23
24 #define USPREP_TYPE_NAMES_ARRAY 1
25
26 #include <stdio.h>
27 #include <stdlib.h>
28
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "toolutil.h"
32 #include "unewdata.h"
33 #include "uoptions.h"
34 #include "uparse.h"
35 #include "sprpimpl.h"
36
37 #include "unicode/uclean.h"
38 #include "unicode/udata.h"
39 #include "unicode/utypes.h"
40 #include "unicode/putil.h"
41
42
43 U_CDECL_BEGIN
44 #include "gensprep.h"
45 U_CDECL_END
46
/*
 * Tool-wide flags. main() overwrites both from the command line:
 * beVerbose from -v/--verbose and haveCopyright from -c/--copyright.
 * They have external linkage; presumably other gensprep sources read them
 * (NOTE(review): confirm against gensprep.h).
 */
UBool beVerbose=FALSE, haveCopyright=TRUE;

/* File name that main() appends to the normalization-corrections directory. */
#define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt"

/*
 * Directive keywords recognized after a leading '@' in a profile file.
 * Each *_LEN value is the length of the keyword string next to it and is used
 * for the prefix comparison in strprepProfileLineFn().
 */
#define NORMALIZE_DIRECTIVE "normalize"
#define NORMALIZE_DIRECTIVE_LEN 9
#define CHECK_BIDI_DIRECTIVE "check-bidi"
#define CHECK_BIDI_DIRECTIVE_LEN 10
55
56 /* prototypes --------------------------------------------------------------- */
57
58 static void
59 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode);
60
61 static void
62 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode);
63
64
65 /* -------------------------------------------------------------------------- */
66
/*
 * Command-line option table passed to u_parseArgs() in main().
 * Entries are accessed by position through the index enum defined right after
 * this table, so the order of the entries must stay in sync with that enum.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_SOURCEDIR,
    UOPTION_ICUDATADIR,
    UOPTION_BUNDLE_NAME,
    /* -n: force normalization; the argument is the NormalizationCorrections.txt directory */
    { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 },
    /* -m: NormalizationCorrections.txt directory, used when normalization is not forced with -n */
    { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 },
    /* -k: turn on the BiDi check flag; takes no argument */
    { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0},
    /* -u: Unicode version string handed to setUnicodeVersion() */
    { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
};
81
/*
 * Indexes into options[]. The enumerators must be listed in exactly the same
 * order as the entries of the options[] table.
 */
enum{
    HELP,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    DESTDIR,
    SOURCEDIR,
    ICUDATADIR,
    BUNDLE_NAME,
    NORMALIZE,            /* options[] entry "normalization" (-n) */
    NORM_CORRECTION_DIR,  /* options[] entry "norm-correction" (-m) */
    CHECK_BIDI,           /* options[] entry "check-bidi" (-k) */
    UNICODE_VERSION       /* options[] entry "unicode" (-u) */
};
96
printHelp(int argc,char * argv[])97 static int printHelp(int argc, char* argv[]){
98 /*
99 * Broken into chucks because the C89 standard says the minimum
100 * required supported string length is 509 bytes.
101 */
102 fprintf(stderr,
103 "Usage: %s [-options] [file_name]\n"
104 "\n"
105 "Read the files specified and\n"
106 "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n"
107 "\n",
108 argv[0]);
109 fprintf(stderr,
110 "Options:\n"
111 "\t-h or -? or --help print this usage text\n"
112 "\t-v or --verbose verbose output\n"
113 "\t-c or --copyright include a copyright notice\n");
114 fprintf(stderr,
115 "\t-d or --destdir destination directory, followed by the path\n"
116 "\t-s or --sourcedir source directory of ICU data, followed by the path\n"
117 "\t-b or --bundle-name generate the output data file with the name specified\n"
118 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
119 "\t followed by path, defaults to %s\n",
120 u_getDataDirectory());
121 fprintf(stderr,
122 "\t-n or --normalize turn on the option for normalization and include mappings\n"
123 "\t from NormalizationCorrections.txt from the given path,\n"
124 "\t e.g: /test/icu/source/data/unidata\n");
125 fprintf(stderr,
126 "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n"
127 "\t when the input file contains a normalization directive.\n"
128 "\t unlike -n/--normalize, this option does not force the\n"
129 "\t normalization.\n");
130 fprintf(stderr,
131 "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n"
132 "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n"
133 );
134 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
135 }
136
137
138 extern int
main(int argc,char * argv[])139 main(int argc, char* argv[]) {
140 #if !UCONFIG_NO_IDNA
141 char* filename = NULL;
142 #endif
143 const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL;
144 const char *bundleName=NULL, *inputFileName = NULL;
145 char *basename=NULL;
146 int32_t sprepOptions = 0;
147
148 UErrorCode errorCode=U_ZERO_ERROR;
149
150 U_MAIN_INIT_ARGS(argc, argv);
151
152 /* preset then read command line options */
153 options[DESTDIR].value=u_getDataDirectory();
154 options[SOURCEDIR].value="";
155 options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */
156 options[BUNDLE_NAME].value = DATA_NAME;
157 options[NORMALIZE].value = "";
158
159 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
160
161 /* error handling, printing usage message */
162 if(argc<0) {
163 fprintf(stderr,
164 "error in command line argument \"%s\"\n",
165 argv[-argc]);
166 }
167 if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
168 return printHelp(argc, argv);
169
170 }
171
172 /* get the options values */
173 beVerbose=options[VERBOSE].doesOccur;
174 haveCopyright=options[COPYRIGHT].doesOccur;
175 srcDir=options[SOURCEDIR].value;
176 destDir=options[DESTDIR].value;
177 bundleName = options[BUNDLE_NAME].value;
178 if(options[NORMALIZE].doesOccur) {
179 icuUniDataDir = options[NORMALIZE].value;
180 } else {
181 icuUniDataDir = options[NORM_CORRECTION_DIR].value;
182 }
183
184 if(argc<2) {
185 /* print the help message */
186 return printHelp(argc, argv);
187 } else {
188 inputFileName = argv[1];
189 }
190 if(!options[UNICODE_VERSION].doesOccur){
191 return printHelp(argc, argv);
192 }
193 if(options[ICUDATADIR].doesOccur) {
194 u_setDataDirectory(options[ICUDATADIR].value);
195 }
196 #if UCONFIG_NO_IDNA
197
198 fprintf(stderr,
199 "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
200 " because UCONFIG_NO_IDNA is set, \n"
201 "see icu/source/common/unicode/uconfig.h\n");
202 generateData(destDir, bundleName);
203
204 #else
205
206 setUnicodeVersion(options[UNICODE_VERSION].value);
207 filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + uprv_strlen(inputFileName) + (icuUniDataDir == NULL ? 0 : uprv_strlen(icuUniDataDir)) + 40); /* hopefully this should be enough */
208
209 /* prepare the filename beginning with the source dir */
210 if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){
211 filename[0] = '.';
212 filename[1] = U_FILE_SEP_CHAR;
213 uprv_strcpy(filename+2,srcDir);
214 }else{
215 uprv_strcpy(filename, srcDir);
216 }
217
218 basename=filename+uprv_strlen(filename);
219 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
220 *basename++=U_FILE_SEP_CHAR;
221 }
222
223 /* initialize */
224 init();
225
226 /* process the file */
227 uprv_strcpy(basename,inputFileName);
228 parseMappings(filename,FALSE, &errorCode);
229 if(U_FAILURE(errorCode)) {
230 fprintf(stderr, "Could not open file %s for reading. Error: %s \n", filename, u_errorName(errorCode));
231 return errorCode;
232 }
233
234 if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */
235 /* set up directory for NormalizationCorrections.txt */
236 uprv_strcpy(filename,icuUniDataDir);
237 basename=filename+uprv_strlen(filename);
238 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
239 *basename++=U_FILE_SEP_CHAR;
240 }
241
242 *basename++=U_FILE_SEP_CHAR;
243 uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME);
244
245 parseNormalizationCorrections(filename,&errorCode);
246 if(U_FAILURE(errorCode)){
247 fprintf(stderr,"Could not open file %s for reading \n", filename);
248 return errorCode;
249 }
250 sprepOptions |= _SPREP_NORMALIZATION_ON;
251 }
252
253 if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */
254 sprepOptions |= _SPREP_CHECK_BIDI_ON;
255 }
256
257 setOptions(sprepOptions);
258
259 /* process parsed data */
260 if(U_SUCCESS(errorCode)) {
261 /* write the data file */
262 generateData(destDir, bundleName);
263
264 cleanUpData();
265 }
266
267 uprv_free(filename);
268
269 u_cleanup();
270
271 #endif
272
273 return errorCode;
274 }
275
276 #if !UCONFIG_NO_IDNA
277
278 static void U_CALLCONV
normalizationCorrectionsLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)279 normalizationCorrectionsLineFn(void *context,
280 char *fields[][2], int32_t fieldCount,
281 UErrorCode *pErrorCode) {
282 (void)context; // suppress compiler warnings about unused variable
283 (void)fieldCount; // suppress compiler warnings about unused variable
284 uint32_t mapping[40];
285 char *end, *s;
286 uint32_t code;
287 int32_t length;
288 UVersionInfo version;
289 UVersionInfo thisVersion;
290
291 /* get the character code, field 0 */
292 code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
293 if(U_FAILURE(*pErrorCode)) {
294 fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]);
295 exit(*pErrorCode);
296 }
297 /* Original (erroneous) decomposition */
298 s = fields[1][0];
299
300 /* parse the mapping string */
301 length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode);
302
303 /* ignore corrected decomposition */
304
305 u_versionFromString(version,fields[3][0] );
306 u_versionFromString(thisVersion, "3.2.0");
307
308
309
310 if(U_FAILURE(*pErrorCode)) {
311 fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n",
312 (long)code, u_errorName(*pErrorCode));
313 exit(*pErrorCode);
314 }
315
316 /* store the mapping */
317 if( version[0] > thisVersion[0] ||
318 ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1]))
319 ){
320 storeMapping(code,mapping, length, USPREP_MAP, pErrorCode);
321 }
322 setUnicodeVersionNC(version);
323 }
324
325 static void
parseNormalizationCorrections(const char * filename,UErrorCode * pErrorCode)326 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) {
327 char *fields[4][2];
328
329 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
330 return;
331 }
332
333 u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode);
334
335 /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */
336
337 if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) {
338 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
339 exit(*pErrorCode);
340 }
341 }
342
343 static void U_CALLCONV
strprepProfileLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)344 strprepProfileLineFn(void *context,
345 char *fields[][2], int32_t fieldCount,
346 UErrorCode *pErrorCode) {
347 (void)fieldCount; // suppress compiler warnings about unused variable
348 uint32_t mapping[40];
349 char *end, *map;
350 uint32_t code;
351 int32_t length;
352 /*UBool* mapWithNorm = (UBool*) context;*/
353 const char* typeName;
354 uint32_t rangeStart=0,rangeEnd =0;
355 const char* filename = (const char*) context;
356 const char *s;
357
358 s = u_skipWhitespace(fields[0][0]);
359 if (*s == '@') {
360 /* special directive */
361 s++;
362 length = (int32_t)(fields[0][1] - s);
363 if (length >= NORMALIZE_DIRECTIVE_LEN
364 && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) {
365 options[NORMALIZE].doesOccur = TRUE;
366 return;
367 }
368 else if (length >= CHECK_BIDI_DIRECTIVE_LEN
369 && uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) {
370 options[CHECK_BIDI].doesOccur = TRUE;
371 return;
372 }
373 else {
374 fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]);
375 }
376 }
377
378 typeName = fields[2][0];
379 map = fields[1][0];
380
381 if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){
382
383 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
384 if(U_FAILURE(*pErrorCode)){
385 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
386 return;
387 }
388
389 /* store the range */
390 storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode);
391
392 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){
393
394 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
395 if(U_FAILURE(*pErrorCode)){
396 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
397 return;
398 }
399
400 /* store the range */
401 storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode);
402
403 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){
404
405 /* get the character code, field 0 */
406 code=(uint32_t)uprv_strtoul(s, &end, 16);
407 if(end<=s || end!=fields[0][1]) {
408 fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]);
409 *pErrorCode=U_PARSE_ERROR;
410 exit(U_PARSE_ERROR);
411 }
412
413 /* parse the mapping string */
414 length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode);
415
416 /* store the mapping */
417 storeMapping(code,mapping, length,USPREP_MAP, pErrorCode);
418
419 }else{
420 *pErrorCode = U_INVALID_FORMAT_ERROR;
421 }
422
423 if(U_FAILURE(*pErrorCode)) {
424 fprintf(stderr, "gensprep error parsing %s line %s at %s. Error: %s\n",filename,
425 fields[0][0],fields[2][0],u_errorName(*pErrorCode));
426 exit(*pErrorCode);
427 }
428
429 }
430
431 static void
parseMappings(const char * filename,UBool reportError,UErrorCode * pErrorCode)432 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) {
433 char *fields[3][2];
434
435 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
436 return;
437 }
438
439 u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode);
440
441 /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/
442
443 if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
444 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
445 exit(*pErrorCode);
446 }
447 }
448
449
450 #endif /* #if !UCONFIG_NO_IDNA */
451
452 /*
453 * Hey, Emacs, please set the following:
454 *
455 * Local Variables:
456 * indent-tabs-mode: nil
457 * End:
458 *
459 */
460