1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /******************************************************************************
4 * Copyright (C) 2008-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *******************************************************************************
7 */
8 #include "unicode/utypes.h"
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include "unicode/utypes.h"
13 #include "unicode/putil.h"
14 #include "cmemory.h"
15 #include "cstring.h"
16 #include "filestrm.h"
17 #include "toolutil.h"
18 #include "unicode/uclean.h"
19 #include "unewdata.h"
20 #include "putilimp.h"
21 #include "pkg_gencmn.h"
22
23 #define STRING_STORE_SIZE 200000
24
25 #define COMMON_DATA_NAME U_ICUDATA_NAME
26 #define DATA_TYPE "dat"
27
28 /* ICU package data file format (.dat files) ------------------------------- ***
29
30 Description of the data format after the usual ICU data file header
31 (UDataInfo etc.).
32
33 Format version 1
34
35 A .dat package file contains a simple Table of Contents of item names,
36 followed by the items themselves:
37
38 1. ToC table
39
40 uint32_t count; - number of items
41 UDataOffsetTOCEntry entry[count]; - pair of uint32_t values per item:
42 uint32_t nameOffset; - offset of the item name
43 uint32_t dataOffset; - offset of the item data
44 both are byte offsets from the beginning of the data
45
46 2. item name strings
47
48 All item names are stored as char * strings in one block between the ToC table
49 and the data items.
50
51 3. data items
52
53 The data items are stored following the item names block.
54 Each data item is 16-aligned.
55 The data items are stored in the sorted order of their names.
56
Therefore, the top of the name strings block is the offset of the first item,
the length of the last item is the difference between its offset and
the .dat file length, and the length of every earlier item is the difference
between its offset and the offset of the next item.
61
62 ----------------------------------------------------------------------------- */
63
/*
 * UDataInfo cf. udata.h
 *
 * Header descriptor handed to udata_create() when the .dat file is written.
 * The positional initializers follow the UDataInfo field order declared in
 * udata.h (not visible here); the trailing comments name the three
 * four-byte arrays.
 */
static const UDataInfo dataInfo={
    sizeof(UDataInfo),          /* size of this descriptor */
    0,

    /* properties of the platform this tool is built for */
    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,

    {0x43, 0x6d, 0x6e, 0x44},   /* dataFormat="CmnD" */
    {1, 0, 0, 0},               /* formatVersion */
    {3, 0, 0, 0}                /* dataVersion */
};
78
/*
 * Per-file size limit, copied from createCommonDataFile()'s max_size argument.
 * When nonzero, addFile() does not add files longer than this.
 */
static uint32_t maxSize;

/* Fixed-size pool from which allocString() hands out string space. */
static char stringStore[STRING_STORE_SIZE];
/*
 * stringTop: offset of the first unused byte in stringStore.
 * basenameTotal: running sum of the basename lengths (terminating NULs
 * included) that addFile() accumulates when a .dat file is being built.
 */
static uint32_t stringTop=0, basenameTotal=0;

/* One item from the input file list. */
typedef struct {
    /*
     * pathname: when building a .dat file, the path returned by pathToFullPath();
     *           when building a ToC source file, the entry point symbol derived
     *           from basename ('.', '-' and '/' replaced with '_').
     * basename: "<package name><tree separator><listed file name>", with
     *           directory separators mapped by fixDirToTreePath().
     */
    char *pathname, *basename;
    /*
     * basenameLength: bytes occupied by basename including its NUL (.dat mode only).
     * basenameOffset, fileOffset: offsets written into the .dat table of contents.
     * fileSize: size reported by T_FileStream_size() (.dat mode only).
     */
    uint32_t basenameLength, basenameOffset, fileSize, fileOffset;
} File;

/* Number of File slots by which addFile() grows the files array. */
#define CHUNK_FILE_COUNT 256
/* Growable array of listed items: fileCount entries in use, fileMax allocated. */
static File *files = NULL;
static uint32_t fileCount=0;
static uint32_t fileMax = 0;


/*
 * Optional prefix written in front of each data symbol name in the generated
 * .c file.
 * NOTE(review): nothing in the visible code assigns it, so it is NULL here.
 */
static char *symPrefix = NULL;

/* Size of the buffer used to read one line of the input file list. */
#define LINE_BUFFER_SIZE 512
98 /* prototypes --------------------------------------------------------------- */
99
100 static void
101 addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose);
102
103 static char *
104 allocString(uint32_t length);
105
106 static int
107 compareFiles(const void *file1, const void *file2);
108
109 static char *
110 pathToFullPath(const char *path, const char *source);
111
112 /* map non-tree separator (such as '\') to tree separator ('/') inplace. */
113 static void
114 fixDirToTreePath(char *s);
115 /* -------------------------------------------------------------------------- */
116
117 U_CAPI void U_EXPORT2
createCommonDataFile(const char * destDir,const char * name,const char * entrypointName,const char * type,const char * source,const char * copyRight,const char * dataFile,uint32_t max_size,UBool sourceTOC,UBool verbose,char * gencmnFileName)118 createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight,
119 const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName) {
120 static char buffer[4096];
121 char *line;
122 char *linePtr;
123 char *s = NULL;
124 UErrorCode errorCode=U_ZERO_ERROR;
125 uint32_t i, fileOffset, basenameOffset, length, nread;
126 FileStream *in, *file;
127
128 line = (char *)uprv_malloc(sizeof(char) * LINE_BUFFER_SIZE);
129 if (line == NULL) {
130 fprintf(stderr, "gencmn: unable to allocate memory for line buffer of size %d\n", LINE_BUFFER_SIZE);
131 exit(U_MEMORY_ALLOCATION_ERROR);
132 }
133
134 linePtr = line;
135
136 maxSize = max_size;
137
138 if (destDir == NULL) {
139 destDir = u_getDataDirectory();
140 }
141 if (name == NULL) {
142 name = COMMON_DATA_NAME;
143 }
144 if (type == NULL) {
145 type = DATA_TYPE;
146 }
147 if (source == NULL) {
148 source = ".";
149 }
150
151 if (dataFile == NULL) {
152 in = T_FileStream_stdin();
153 } else {
154 in = T_FileStream_open(dataFile, "r");
155 if(in == NULL) {
156 fprintf(stderr, "gencmn: unable to open input file %s\n", dataFile);
157 exit(U_FILE_ACCESS_ERROR);
158 }
159 }
160
161 if (verbose) {
162 if(sourceTOC) {
163 printf("generating %s_%s.c (table of contents source file)\n", name, type);
164 } else {
165 printf("generating %s.%s (common data file with table of contents)\n", name, type);
166 }
167 }
168
169 /* read the list of files and get their lengths */
170 while((s != NULL && *s != 0) || (s=T_FileStream_readLine(in, (line=linePtr),
171 LINE_BUFFER_SIZE))!=NULL) {
172 /* remove trailing newline characters and parse space separated items */
173 if (s != NULL && *s != 0) {
174 line=s;
175 } else {
176 s=line;
177 }
178 while(*s!=0) {
179 if(*s==' ') {
180 *s=0;
181 ++s;
182 break;
183 } else if(*s=='\r' || *s=='\n') {
184 *s=0;
185 break;
186 }
187 ++s;
188 }
189
190 /* check for comment */
191
192 if (*line == '#') {
193 continue;
194 }
195
196 /* add the file */
197 #if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
198 {
199 char *t;
200 while((t = uprv_strchr(line,U_FILE_ALT_SEP_CHAR))) {
201 *t = U_FILE_SEP_CHAR;
202 }
203 }
204 #endif
205 addFile(getLongPathname(line), name, source, sourceTOC, verbose);
206 }
207
208 uprv_free(linePtr);
209
210 if(in!=T_FileStream_stdin()) {
211 T_FileStream_close(in);
212 }
213
214 if(fileCount==0) {
215 fprintf(stderr, "gencmn: no files listed in %s\n", dataFile == NULL ? "<stdin>" : dataFile);
216 return;
217 }
218
219 /* sort the files by basename */
220 qsort(files, fileCount, sizeof(File), compareFiles);
221
222 if(!sourceTOC) {
223 UNewDataMemory *out;
224
225 /* determine the offsets of all basenames and files in this common one */
226 basenameOffset=4+8*fileCount;
227 fileOffset=(basenameOffset+(basenameTotal+15))&~0xf;
228 for(i=0; i<fileCount; ++i) {
229 files[i].fileOffset=fileOffset;
230 fileOffset+=(files[i].fileSize+15)&~0xf;
231 files[i].basenameOffset=basenameOffset;
232 basenameOffset+=files[i].basenameLength;
233 }
234
235 /* create the output file */
236 out=udata_create(destDir, type, name,
237 &dataInfo,
238 copyRight == NULL ? U_COPYRIGHT_STRING : copyRight,
239 &errorCode);
240 if(U_FAILURE(errorCode)) {
241 fprintf(stderr, "gencmn: udata_create(-d %s -n %s -t %s) failed - %s\n",
242 destDir, name, type,
243 u_errorName(errorCode));
244 exit(errorCode);
245 }
246
247 /* write the table of contents */
248 udata_write32(out, fileCount);
249 for(i=0; i<fileCount; ++i) {
250 udata_write32(out, files[i].basenameOffset);
251 udata_write32(out, files[i].fileOffset);
252 }
253
254 /* write the basenames */
255 for(i=0; i<fileCount; ++i) {
256 udata_writeString(out, files[i].basename, files[i].basenameLength);
257 }
258 length=4+8*fileCount+basenameTotal;
259
260 /* copy the files */
261 for(i=0; i<fileCount; ++i) {
262 /* pad to 16-align the next file */
263 length&=0xf;
264 if(length!=0) {
265 udata_writePadding(out, 16-length);
266 }
267
268 if (verbose) {
269 printf("adding %s (%ld byte%s)\n", files[i].pathname, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
270 }
271
272 /* copy the next file */
273 file=T_FileStream_open(files[i].pathname, "rb");
274 if(file==NULL) {
275 fprintf(stderr, "gencmn: unable to open listed file %s\n", files[i].pathname);
276 exit(U_FILE_ACCESS_ERROR);
277 }
278 for(nread = 0;;) {
279 length=T_FileStream_read(file, buffer, sizeof(buffer));
280 if(length <= 0) {
281 break;
282 }
283 nread += length;
284 udata_writeBlock(out, buffer, length);
285 }
286 T_FileStream_close(file);
287 length=files[i].fileSize;
288
289 if (nread != files[i].fileSize) {
290 fprintf(stderr, "gencmn: unable to read %s properly (got %ld/%ld byte%s)\n", files[i].pathname, (long)nread, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
291 exit(U_FILE_ACCESS_ERROR);
292 }
293 }
294
295 /* pad to 16-align the last file (cleaner, avoids growing .dat files in icuswap) */
296 length&=0xf;
297 if(length!=0) {
298 udata_writePadding(out, 16-length);
299 }
300
301 /* finish */
302 udata_finish(out, &errorCode);
303 if(U_FAILURE(errorCode)) {
304 fprintf(stderr, "gencmn: udata_finish() failed - %s\n", u_errorName(errorCode));
305 exit(errorCode);
306 }
307 } else {
308 /* write a .c source file with the table of contents */
309 char *filename;
310 FileStream *out;
311
312 /* create the output filename */
313 filename=s=buffer;
314 uprv_strcpy(filename, destDir);
315 s=filename+uprv_strlen(filename);
316 if(s>filename && *(s-1)!=U_FILE_SEP_CHAR) {
317 *s++=U_FILE_SEP_CHAR;
318 }
319 uprv_strcpy(s, name);
320 if(*(type)!=0) {
321 s+=uprv_strlen(s);
322 *s++='_';
323 uprv_strcpy(s, type);
324 }
325 s+=uprv_strlen(s);
326 uprv_strcpy(s, ".c");
327
328 /* open the output file */
329 out=T_FileStream_open(filename, "w");
330 if (gencmnFileName != NULL) {
331 uprv_strcpy(gencmnFileName, filename);
332 }
333 if(out==NULL) {
334 fprintf(stderr, "gencmn: unable to open .c output file %s\n", filename);
335 exit(U_FILE_ACCESS_ERROR);
336 }
337
338 /* write the source file */
339 sprintf(buffer,
340 "/*\n"
341 " * ICU common data table of contents for %s.%s\n"
342 " * Automatically generated by icu/source/tools/gencmn/gencmn .\n"
343 " */\n\n"
344 "#include \"unicode/utypes.h\"\n"
345 "#include \"unicode/udata.h\"\n"
346 "\n"
347 "/* external symbol declarations for data (%d files) */\n",
348 name, type, fileCount);
349 T_FileStream_writeLine(out, buffer);
350
351 sprintf(buffer, "extern const char\n %s%s[]", symPrefix?symPrefix:"", files[0].pathname);
352 T_FileStream_writeLine(out, buffer);
353 for(i=1; i<fileCount; ++i) {
354 sprintf(buffer, ",\n %s%s[]", symPrefix?symPrefix:"", files[i].pathname);
355 T_FileStream_writeLine(out, buffer);
356 }
357 T_FileStream_writeLine(out, ";\n\n");
358
359 sprintf(
360 buffer,
361 "U_EXPORT struct {\n"
362 " uint16_t headerSize;\n"
363 " uint8_t magic1, magic2;\n"
364 " UDataInfo info;\n"
365 " char padding[%lu];\n"
366 " uint32_t count, reserved;\n"
367 " struct {\n"
368 " const char *name;\n"
369 " const void *data;\n"
370 " } toc[%lu];\n"
371 "} U_EXPORT2 %s_dat = {\n"
372 " 32, 0xda, 0x27, {\n"
373 " %lu, 0,\n"
374 " %u, %u, %u, 0,\n"
375 " {0x54, 0x6f, 0x43, 0x50},\n"
376 " {1, 0, 0, 0},\n"
377 " {0, 0, 0, 0}\n"
378 " },\n"
379 " \"\", %lu, 0, {\n",
380 (unsigned long)32-4-sizeof(UDataInfo),
381 (unsigned long)fileCount,
382 entrypointName,
383 (unsigned long)sizeof(UDataInfo),
384 U_IS_BIG_ENDIAN,
385 U_CHARSET_FAMILY,
386 U_SIZEOF_UCHAR,
387 (unsigned long)fileCount
388 );
389 T_FileStream_writeLine(out, buffer);
390
391 sprintf(buffer, " { \"%s\", %s%s }", files[0].basename, symPrefix?symPrefix:"", files[0].pathname);
392 T_FileStream_writeLine(out, buffer);
393 for(i=1; i<fileCount; ++i) {
394 sprintf(buffer, ",\n { \"%s\", %s%s }", files[i].basename, symPrefix?symPrefix:"", files[i].pathname);
395 T_FileStream_writeLine(out, buffer);
396 }
397
398 T_FileStream_writeLine(out, "\n }\n};\n");
399 T_FileStream_close(out);
400
401 uprv_free(symPrefix);
402 }
403 }
404
405 static void
addFile(const char * filename,const char * name,const char * source,UBool sourceTOC,UBool verbose)406 addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose) {
407 char *s;
408 uint32_t length;
409 char *fullPath = NULL;
410
411 if(fileCount==fileMax) {
412 fileMax += CHUNK_FILE_COUNT;
413 files = uprv_realloc(files, fileMax*sizeof(files[0])); /* note: never freed. */
414 if(files==NULL) {
415 fprintf(stderr, "pkgdata/gencmn: Could not allocate %u bytes for %d files\n", (unsigned int)(fileMax*sizeof(files[0])), fileCount);
416 exit(U_MEMORY_ALLOCATION_ERROR);
417 }
418 }
419
420 if(!sourceTOC) {
421 FileStream *file;
422
423 if(uprv_pathIsAbsolute(filename)) {
424 fprintf(stderr, "gencmn: Error: absolute path encountered. Old style paths are not supported. Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, filename);
425 exit(U_ILLEGAL_ARGUMENT_ERROR);
426 }
427 fullPath = pathToFullPath(filename, source);
428 /* store the pathname */
429 length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
430 s=allocString(length);
431 uprv_strcpy(s, name);
432 uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
433 uprv_strcat(s, filename);
434
435 /* get the basename */
436 fixDirToTreePath(s);
437 files[fileCount].basename=s;
438 files[fileCount].basenameLength=length;
439
440 files[fileCount].pathname=fullPath;
441
442 basenameTotal+=length;
443
444 /* try to open the file */
445 file=T_FileStream_open(fullPath, "rb");
446 if(file==NULL) {
447 fprintf(stderr, "gencmn: unable to open listed file %s\n", fullPath);
448 exit(U_FILE_ACCESS_ERROR);
449 }
450
451 /* get the file length */
452 length=T_FileStream_size(file);
453 if(T_FileStream_error(file) || length<=20) {
454 fprintf(stderr, "gencmn: unable to get length of listed file %s\n", fullPath);
455 exit(U_FILE_ACCESS_ERROR);
456 }
457
458 T_FileStream_close(file);
459
460 /* do not add files that are longer than maxSize */
461 if(maxSize && length>maxSize) {
462 if (verbose) {
463 printf("%s ignored (size %ld > %ld)\n", fullPath, (long)length, (long)maxSize);
464 }
465 return;
466 }
467 files[fileCount].fileSize=length;
468 } else {
469 char *t;
470 /* get and store the basename */
471 /* need to include the package name */
472 length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
473 s=allocString(length);
474 uprv_strcpy(s, name);
475 uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
476 uprv_strcat(s, filename);
477 fixDirToTreePath(s);
478 files[fileCount].basename=s;
479 /* turn the basename into an entry point name and store in the pathname field */
480 t=files[fileCount].pathname=allocString(length);
481 while(--length>0) {
482 if(*s=='.' || *s=='-' || *s=='/') {
483 *t='_';
484 } else {
485 *t=*s;
486 }
487 ++s;
488 ++t;
489 }
490 *t=0;
491 }
492 ++fileCount;
493 }
494
495 static char *
allocString(uint32_t length)496 allocString(uint32_t length) {
497 uint32_t top=stringTop+length;
498 char *p;
499
500 if(top>STRING_STORE_SIZE) {
501 fprintf(stderr, "gencmn: out of memory\n");
502 exit(U_MEMORY_ALLOCATION_ERROR);
503 }
504 p=stringStore+stringTop;
505 stringTop=top;
506 return p;
507 }
508
509 static char *
pathToFullPath(const char * path,const char * source)510 pathToFullPath(const char *path, const char *source) {
511 int32_t length;
512 int32_t newLength;
513 char *fullPath;
514 int32_t n;
515
516 length = (uint32_t)(uprv_strlen(path) + 1);
517 newLength = (length + 1 + (int32_t)uprv_strlen(source));
518 fullPath = uprv_malloc(newLength);
519 if(source != NULL) {
520 uprv_strcpy(fullPath, source);
521 uprv_strcat(fullPath, U_FILE_SEP_STRING);
522 } else {
523 fullPath[0] = 0;
524 }
525 n = (int32_t)uprv_strlen(fullPath);
526 fullPath[n] = 0; /* Suppress compiler warning for unused variable n */
527 /* when conditional code below is not compiled. */
528 uprv_strcat(fullPath, path);
529
530 #if (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
531 #if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR)
532 /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
533 for(;fullPath[n];n++) {
534 if(fullPath[n] == U_FILE_ALT_SEP_CHAR) {
535 fullPath[n] = U_FILE_SEP_CHAR;
536 }
537 }
538 #endif
539 #endif
540 #if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
541 /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
542 for(;fullPath[n];n++) {
543 if(fullPath[n] == U_TREE_ENTRY_SEP_CHAR) {
544 fullPath[n] = U_FILE_SEP_CHAR;
545 }
546 }
547 #endif
548 return fullPath;
549 }
550
551 static int
compareFiles(const void * file1,const void * file2)552 compareFiles(const void *file1, const void *file2) {
553 /* sort by basename */
554 return uprv_strcmp(((File *)file1)->basename, ((File *)file2)->basename);
555 }
556
/*
 * Convert directory separators in s to the tree separator, in place.
 * Both U_FILE_SEP_CHAR and U_FILE_ALT_SEP_CHAR are mapped to
 * U_TREE_ENTRY_SEP_CHAR; each mapping is compiled only where the
 * characters actually differ, so on platforms where all three are the
 * same this function does nothing.
 */
static void
fixDirToTreePath(char *s)
{
#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) || ((U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR))
    char *cursor;

    for(cursor=s; *cursor!=0; ++cursor) {
#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
        if(*cursor == U_FILE_SEP_CHAR) {
            *cursor = U_TREE_ENTRY_SEP_CHAR;
            continue;
        }
#endif
#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
        if(*cursor == U_FILE_ALT_SEP_CHAR) {
            *cursor = U_TREE_ENTRY_SEP_CHAR;
        }
#endif
    }
#else
    (void)s;    /* nothing to convert on this platform */
#endif
}
574