1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************************
5 *
6 * Copyright (C) 1998-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ********************************************************************************
10 *
11 *
12 * makeconv.cpp:
13 * tool creating a binary (compressed) representation of the conversion mapping
14 * table (IBM NLTC ucmap format).
15 *
16 * 05/04/2000 helena Added fallback mapping into the picture...
17 * 06/29/2000 helena Major rewrite of the callback APIs.
18 */
19
20 #include <stdio.h>
21 #include "unicode/putil.h"
22 #include "unicode/ucnv_err.h"
23 #include "charstr.h"
24 #include "ucnv_bld.h"
25 #include "ucnv_imp.h"
26 #include "ucnv_cnv.h"
27 #include "cstring.h"
28 #include "cmemory.h"
29 #include "uinvchar.h"
30 #include "filestrm.h"
31 #include "toolutil.h"
32 #include "uoptions.h"
33 #include "unicode/udata.h"
34 #include "unewdata.h"
35 #include "uparse.h"
36 #include "ucm.h"
37 #include "makeconv.h"
38 #include "genmbcs.h"
39
40 #define DEBUG 0
41
42 typedef struct ConvData {
43 UCMFile *ucm;
44 NewConverter *cnvData, *extData;
45 UConverterSharedData sharedData;
46 UConverterStaticData staticData;
47 } ConvData;
48
49 static void
initConvData(ConvData * data)50 initConvData(ConvData *data) {
51 uprv_memset(data, 0, sizeof(ConvData));
52 data->sharedData.structSize=sizeof(UConverterSharedData);
53 data->staticData.structSize=sizeof(UConverterStaticData);
54 data->sharedData.staticData=&data->staticData;
55 }
56
57 static void
cleanupConvData(ConvData * data)58 cleanupConvData(ConvData *data) {
59 if(data!=NULL) {
60 if(data->cnvData!=NULL) {
61 data->cnvData->close(data->cnvData);
62 data->cnvData=NULL;
63 }
64 if(data->extData!=NULL) {
65 data->extData->close(data->extData);
66 data->extData=NULL;
67 }
68 ucm_close(data->ucm);
69 data->ucm=NULL;
70 }
71 }
72
/*
 * from ucnvstat.c - static prototypes of data-based converters
 *
 * readHeader() indexes this array with the conversion type parsed from the
 * .ucm header and copies default header values from the entry, if it is
 * not NULL.
 */
U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];
77
/*
 * Globals - verbosity and build options.
 * main() sets all of them from the command line.
 */
/* --verbose: print progress messages */
UBool VERBOSE = FALSE;
/* --quiet: suppress the "claims to be" converter name warning in main() */
UBool QUIET = FALSE;
/* --small: not read in this file; presumably used by the table builders - TODO confirm */
UBool SMALL = FALSE;
/* --ignore-siso-check: passed to ucm_processStates() in readFile() */
UBool IGNORE_SISO_CHECK = FALSE;
85
/*
 * Read the .ucm file converterName (and, for an extension-only file, its
 * base file) and prepare the table builders in data.
 */
static void
createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);

/*
 * Set up the UNewData and write the converter.
 */
static void
writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);

/*
 * Whether writeConverterData() passes U_COPYRIGHT_STRING to udata_create().
 * main() overwrites this with the state of the --copyright option.
 */
UBool haveCopyright=TRUE;
96
/*
 * Data header description passed to udata_create() for every .cnv file.
 * main() also prints formatVersion[0] and formatVersion[1] as the tool
 * version, and overwrites dataVersion with the ICU version at startup.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,

    {0x63, 0x6e, 0x76, 0x74},     /* dataFormat="cnvt" */
    {6, 2, 0, 0},                 /* formatVersion */
    {0, 0, 0, 0}                  /* dataVersion (calculated at runtime) */
};
110
111 static void
writeConverterData(ConvData * data,const char * cnvName,const char * cnvDir,UErrorCode * status)112 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
113 {
114 UNewDataMemory *mem = NULL;
115 uint32_t sz2;
116 uint32_t size = 0;
117 int32_t tableType;
118
119 if(U_FAILURE(*status))
120 {
121 return;
122 }
123
124 tableType=TABLE_NONE;
125 if(data->cnvData!=NULL) {
126 tableType|=TABLE_BASE;
127 }
128 if(data->extData!=NULL) {
129 tableType|=TABLE_EXT;
130 }
131
132 mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
133
134 if(U_FAILURE(*status))
135 {
136 fprintf(stderr, "Couldn't create the udata %s.%s: %s\n",
137 cnvName,
138 "cnv",
139 u_errorName(*status));
140 return;
141 }
142
143 if(VERBOSE)
144 {
145 printf("- Opened udata %s.%s\n", cnvName, "cnv");
146 }
147
148
149 /* all read only, clean, platform independent data. Mmmm. :) */
150 udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
151 size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */
152 /* Now, write the table */
153 if(tableType&TABLE_BASE) {
154 size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
155 }
156 if(tableType&TABLE_EXT) {
157 size += data->extData->write(data->extData, &data->staticData, mem, tableType);
158 }
159
160 sz2 = udata_finish(mem, status);
161 if(size != sz2)
162 {
163 fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size);
164 *status=U_INTERNAL_PROGRAM_ERROR;
165 }
166 if(VERBOSE)
167 {
168 printf("- Wrote %u bytes to the udata.\n", (int)sz2);
169 }
170 }
171
/*
 * Indexes into options[] below.
 * The enumerators must stay in the same order as the array entries because
 * main() looks up options by these indexes.
 */
enum {
    OPT_HELP_H,
    OPT_HELP_QUESTION_MARK,
    OPT_COPYRIGHT,
    OPT_VERSION,
    OPT_DESTDIR,
    OPT_VERBOSE,
    OPT_SMALL,
    OPT_IGNORE_SISO_CHECK,
    OPT_QUIET,

    OPT_COUNT
};

/*
 * Command line options, parsed by u_parseArgs() in main().
 * --small and --ignore-siso-check are declared without an argument.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_COPYRIGHT,
    UOPTION_VERSION,
    UOPTION_DESTDIR,
    UOPTION_VERBOSE,
    { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    UOPTION_QUIET,
};
197
main(int argc,char * argv[])198 int main(int argc, char* argv[])
199 {
200 ConvData data;
201 char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
202
203 U_MAIN_INIT_ARGS(argc, argv);
204
205 /* Set up the ICU version number */
206 UVersionInfo icuVersion;
207 u_getVersion(icuVersion);
208 uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
209
210 /* preset then read command line options */
211 options[OPT_DESTDIR].value=u_getDataDirectory();
212 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
213
214 /* error handling, printing usage message */
215 if(argc<0) {
216 fprintf(stderr,
217 "error in command line argument \"%s\"\n",
218 argv[-argc]);
219 } else if(argc<2) {
220 argc=-1;
221 }
222 if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
223 FILE *stdfile=argc<0 ? stderr : stdout;
224 fprintf(stdfile,
225 "usage: %s [-options] files...\n"
226 "\tread .ucm codepage mapping files and write .cnv files\n"
227 "options:\n"
228 "\t-h or -? or --help this usage text\n"
229 "\t-V or --version show a version message\n"
230 "\t-c or --copyright include a copyright notice\n"
231 "\t-d or --destdir destination directory, followed by the path\n"
232 "\t-v or --verbose Turn on verbose output\n"
233 "\t-q or --quiet do not display warnings and progress\n",
234 argv[0]);
235 fprintf(stdfile,
236 "\t --small Generate smaller .cnv files. They will be\n"
237 "\t significantly smaller but may not be compatible with\n"
238 "\t older versions of ICU and will require heap memory\n"
239 "\t allocation when loaded.\n"
240 "\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n");
241 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
242 }
243
244 if(options[OPT_VERSION].doesOccur) {
245 printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
246 dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
247 printf("%s\n", U_COPYRIGHT_STRING);
248 exit(0);
249 }
250
251 /* get the options values */
252 haveCopyright = options[OPT_COPYRIGHT].doesOccur;
253 const char *destdir = options[OPT_DESTDIR].value;
254 VERBOSE = options[OPT_VERBOSE].doesOccur;
255 QUIET = options[OPT_QUIET].doesOccur;
256 SMALL = options[OPT_SMALL].doesOccur;
257
258 if (options[OPT_IGNORE_SISO_CHECK].doesOccur) {
259 IGNORE_SISO_CHECK = TRUE;
260 }
261
262 icu::CharString outFileName;
263 UErrorCode err = U_ZERO_ERROR;
264 if (destdir != NULL && *destdir != 0) {
265 outFileName.append(destdir, err).ensureEndsWithFileSeparator(err);
266 if (U_FAILURE(err)) {
267 return err;
268 }
269 }
270 int32_t outBasenameStart = outFileName.length();
271
272 #if DEBUG
273 {
274 int i;
275 printf("makeconv: processing %d files...\n", argc - 1);
276 for(i=1; i<argc; ++i) {
277 printf("%s ", argv[i]);
278 }
279 printf("\n");
280 fflush(stdout);
281 }
282 #endif
283
284 UBool printFilename = (UBool) (argc > 2 || VERBOSE);
285 for (++argv; --argc; ++argv)
286 {
287 UErrorCode localError = U_ZERO_ERROR;
288 const char *arg = getLongPathname(*argv);
289
290 /*produces the right destination path for display*/
291 outFileName.truncate(outBasenameStart);
292 if (outBasenameStart != 0)
293 {
294 /* find the last file sepator */
295 const char *basename = findBasename(arg);
296 outFileName.append(basename, localError);
297 }
298 else
299 {
300 outFileName.append(arg, localError);
301 }
302 if (U_FAILURE(localError)) {
303 return localError;
304 }
305
306 /*removes the extension if any is found*/
307 int32_t lastDotIndex = outFileName.lastIndexOf('.');
308 if (lastDotIndex >= outBasenameStart) {
309 outFileName.truncate(lastDotIndex);
310 }
311
312 /* the basename without extension is the converter name */
313 if ((outFileName.length() - outBasenameStart) >= UPRV_LENGTHOF(cnvName)) {
314 fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart);
315 return U_BUFFER_OVERFLOW_ERROR;
316 }
317 uprv_strcpy(cnvName, outFileName.data() + outBasenameStart);
318
319 /*Adds the target extension*/
320 outFileName.append(CONVERTER_FILE_EXTENSION, localError);
321 if (U_FAILURE(localError)) {
322 return localError;
323 }
324
325 #if DEBUG
326 printf("makeconv: processing %s ...\n", arg);
327 fflush(stdout);
328 #endif
329 initConvData(&data);
330 createConverter(&data, arg, &localError);
331
332 if (U_FAILURE(localError))
333 {
334 /* if an error is found, print out an error msg and keep going */
335 fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n",
336 outFileName.data(), arg, u_errorName(localError));
337 if(U_SUCCESS(err)) {
338 err = localError;
339 }
340 }
341 else
342 {
343 /* Insure the static data name matches the file name */
344 /* Changed to ignore directory and only compare base name
345 LDH 1/2/08*/
346 char *p;
347 p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */
348
349 if(p == NULL) /* OK, try alternate */
350 {
351 p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
352 if(p == NULL)
353 {
354 p=cnvName; /* If no separators, no problem */
355 }
356 }
357 else
358 {
359 p++; /* If found separator, don't include it in compare */
360 }
361 if(uprv_stricmp(p,data.staticData.name) && !QUIET)
362 {
363 fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
364 cnvName, CONVERTER_FILE_EXTENSION,
365 data.staticData.name);
366 }
367
368 uprv_strcpy((char*)data.staticData.name, cnvName);
369
370 if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
371 fprintf(stderr,
372 "Error: A converter name must contain only invariant characters.\n"
373 "%s is not a valid converter name.\n",
374 data.staticData.name);
375 if(U_SUCCESS(err)) {
376 err = U_INVALID_TABLE_FORMAT;
377 }
378 }
379
380 localError = U_ZERO_ERROR;
381 writeConverterData(&data, cnvName, destdir, &localError);
382
383 if(U_FAILURE(localError))
384 {
385 /* if an error is found, print out an error msg and keep going*/
386 fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg,
387 u_errorName(localError));
388 if(U_SUCCESS(err)) {
389 err = localError;
390 }
391 }
392 else if (printFilename)
393 {
394 puts(outFileName.data() + outBasenameStart);
395 }
396 }
397 fflush(stdout);
398 fflush(stderr);
399
400 cleanupConvData(&data);
401 }
402
403 return err;
404 }
405
406 static void
getPlatformAndCCSIDFromName(const char * name,int8_t * pPlatform,int32_t * pCCSID)407 getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) {
408 if( (name[0]=='i' || name[0]=='I') &&
409 (name[1]=='b' || name[1]=='B') &&
410 (name[2]=='m' || name[2]=='M')
411 ) {
412 name+=3;
413 if(*name=='-') {
414 ++name;
415 }
416 *pPlatform=UCNV_IBM;
417 *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10);
418 } else {
419 *pPlatform=UCNV_UNKNOWN;
420 *pCCSID=0;
421 }
422 }
423
424 static void
readHeader(ConvData * data,FileStream * convFile,UErrorCode * pErrorCode)425 readHeader(ConvData *data,
426 FileStream* convFile,
427 UErrorCode *pErrorCode) {
428 char line[1024];
429 char *s, *key, *value;
430 const UConverterStaticData *prototype;
431 UConverterStaticData *staticData;
432
433 if(U_FAILURE(*pErrorCode)) {
434 return;
435 }
436
437 staticData=&data->staticData;
438 staticData->platform=UCNV_IBM;
439 staticData->subCharLen=0;
440
441 while(T_FileStream_readLine(convFile, line, sizeof(line))) {
442 /* basic parsing and handling of state-related items */
443 if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
444 continue;
445 }
446
447 /* stop at the beginning of the mapping section */
448 if(uprv_strcmp(line, "CHARMAP")==0) {
449 break;
450 }
451
452 /* collect the information from the header field, ignore unknown keys */
453 if(uprv_strcmp(key, "code_set_name")==0) {
454 if(*value!=0) {
455 uprv_strcpy((char *)staticData->name, value);
456 getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
457 }
458 } else if(uprv_strcmp(key, "subchar")==0) {
459 uint8_t bytes[UCNV_EXT_MAX_BYTES];
460 int8_t length;
461
462 s=value;
463 length=ucm_parseBytes(bytes, line, (const char **)&s);
464 if(1<=length && length<=4 && *s==0) {
465 staticData->subCharLen=length;
466 uprv_memcpy(staticData->subChar, bytes, length);
467 } else {
468 fprintf(stderr, "error: illegal <subchar> %s\n", value);
469 *pErrorCode=U_INVALID_TABLE_FORMAT;
470 return;
471 }
472 } else if(uprv_strcmp(key, "subchar1")==0) {
473 uint8_t bytes[UCNV_EXT_MAX_BYTES];
474
475 s=value;
476 if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) {
477 staticData->subChar1=bytes[0];
478 } else {
479 fprintf(stderr, "error: illegal <subchar1> %s\n", value);
480 *pErrorCode=U_INVALID_TABLE_FORMAT;
481 return;
482 }
483 }
484 }
485
486 /* copy values from the UCMFile to the static data */
487 staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
488 staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
489 staticData->conversionType=data->ucm->states.conversionType;
490
491 if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
492 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
493 *pErrorCode=U_INVALID_TABLE_FORMAT;
494 return;
495 }
496
497 /*
498 * Now that we know the type, copy any 'default' values from the table.
499 * We need not check the type any further because the parser only
500 * recognizes what we have prototypes for.
501 *
502 * For delta (extension-only) tables, copy values from the base file
503 * instead, see createConverter().
504 */
505 if(data->ucm->baseName[0]==0) {
506 prototype=ucnv_converterStaticData[staticData->conversionType];
507 if(prototype!=NULL) {
508 if(staticData->name[0]==0) {
509 uprv_strcpy((char *)staticData->name, prototype->name);
510 }
511
512 if(staticData->codepage==0) {
513 staticData->codepage=prototype->codepage;
514 }
515
516 if(staticData->platform==0) {
517 staticData->platform=prototype->platform;
518 }
519
520 if(staticData->minBytesPerChar==0) {
521 staticData->minBytesPerChar=prototype->minBytesPerChar;
522 }
523
524 if(staticData->maxBytesPerChar==0) {
525 staticData->maxBytesPerChar=prototype->maxBytesPerChar;
526 }
527
528 if(staticData->subCharLen==0) {
529 staticData->subCharLen=prototype->subCharLen;
530 if(prototype->subCharLen>0) {
531 uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
532 }
533 }
534 }
535 }
536
537 if(data->ucm->states.outputType<0) {
538 data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1;
539 }
540
541 if( staticData->subChar1!=0 &&
542 (staticData->minBytesPerChar>1 ||
543 (staticData->conversionType!=UCNV_MBCS &&
544 staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
545 ) {
546 fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
547 *pErrorCode=U_INVALID_TABLE_FORMAT;
548 }
549 }
550
551 /* return TRUE if a base table was read, FALSE for an extension table */
552 static UBool
readFile(ConvData * data,const char * converterName,UErrorCode * pErrorCode)553 readFile(ConvData *data, const char* converterName,
554 UErrorCode *pErrorCode) {
555 char line[1024];
556 char *end;
557 FileStream *convFile;
558
559 UCMStates *baseStates;
560 UBool dataIsBase;
561
562 if(U_FAILURE(*pErrorCode)) {
563 return FALSE;
564 }
565
566 data->ucm=ucm_open();
567
568 convFile=T_FileStream_open(converterName, "r");
569 if(convFile==NULL) {
570 *pErrorCode=U_FILE_ACCESS_ERROR;
571 return FALSE;
572 }
573
574 readHeader(data, convFile, pErrorCode);
575 if(U_FAILURE(*pErrorCode)) {
576 return FALSE;
577 }
578
579 if(data->ucm->baseName[0]==0) {
580 dataIsBase=TRUE;
581 baseStates=&data->ucm->states;
582 ucm_processStates(baseStates, IGNORE_SISO_CHECK);
583 } else {
584 dataIsBase=FALSE;
585 baseStates=NULL;
586 }
587
588 /* read the base table */
589 ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
590 if(U_FAILURE(*pErrorCode)) {
591 return FALSE;
592 }
593
594 /* read an extension table if there is one */
595 while(T_FileStream_readLine(convFile, line, sizeof(line))) {
596 end=uprv_strchr(line, 0);
597 while(line<end &&
598 (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
599 --end;
600 }
601 *end=0;
602
603 if(line[0]=='#' || u_skipWhitespace(line)==end) {
604 continue; /* ignore empty and comment lines */
605 }
606
607 if(0==uprv_strcmp(line, "CHARMAP")) {
608 /* read the extension table */
609 ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
610 } else {
611 fprintf(stderr, "unexpected text after the base mapping table\n");
612 }
613 break;
614 }
615
616 T_FileStream_close(convFile);
617
618 if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
619 fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
620 *pErrorCode=U_INVALID_TABLE_FORMAT;
621 }
622
623 return dataIsBase;
624 }
625
626 static void
createConverter(ConvData * data,const char * converterName,UErrorCode * pErrorCode)627 createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
628 ConvData baseData;
629 UBool dataIsBase;
630
631 UConverterStaticData *staticData;
632 UCMStates *states, *baseStates;
633
634 if(U_FAILURE(*pErrorCode)) {
635 return;
636 }
637
638 initConvData(data);
639
640 dataIsBase=readFile(data, converterName, pErrorCode);
641 if(U_FAILURE(*pErrorCode)) {
642 return;
643 }
644
645 staticData=&data->staticData;
646 states=&data->ucm->states;
647
648 if(dataIsBase) {
649 /*
650 * Build a normal .cnv file with a base table
651 * and an optional extension table.
652 */
653 data->cnvData=MBCSOpen(data->ucm);
654 if(data->cnvData==NULL) {
655 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
656
657 } else if(!data->cnvData->isValid(data->cnvData,
658 staticData->subChar, staticData->subCharLen)
659 ) {
660 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
661 *pErrorCode=U_INVALID_TABLE_FORMAT;
662
663 } else if(staticData->subChar1!=0 &&
664 !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
665 ) {
666 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
667 *pErrorCode=U_INVALID_TABLE_FORMAT;
668
669 } else if(
670 data->ucm->ext->mappingsLength>0 &&
671 !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
672 ) {
673 *pErrorCode=U_INVALID_TABLE_FORMAT;
674 } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
675 /* sort the table so that it can be turned into UTF-8-friendly data */
676 ucm_sortTable(data->ucm->base);
677 }
678
679 if(U_SUCCESS(*pErrorCode)) {
680 if(
681 /* add the base table after ucm_checkBaseExt()! */
682 !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
683 ) {
684 *pErrorCode=U_INVALID_TABLE_FORMAT;
685 } else {
686 /*
687 * addTable() may have requested moving more mappings to the extension table
688 * if they fit into the base toUnicode table but not into the
689 * base fromUnicode table.
690 * (Especially for UTF-8-friendly fromUnicode tables.)
691 * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
692 * to be excluded from the extension toUnicode data.
693 * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
694 * the base fromUnicode table.
695 */
696 ucm_moveMappings(data->ucm->base, data->ucm->ext);
697 ucm_sortTable(data->ucm->ext);
698 if(data->ucm->ext->mappingsLength>0) {
699 /* prepare the extension table, if there is one */
700 data->extData=CnvExtOpen(data->ucm);
701 if(data->extData==NULL) {
702 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
703 } else if(
704 !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
705 ) {
706 *pErrorCode=U_INVALID_TABLE_FORMAT;
707 }
708 }
709 }
710 }
711 } else {
712 /* Build an extension-only .cnv file. */
713 char baseFilename[500];
714 char *basename;
715
716 initConvData(&baseData);
717
718 /* assemble a path/filename for data->ucm->baseName */
719 uprv_strcpy(baseFilename, converterName);
720 basename=(char *)findBasename(baseFilename);
721 uprv_strcpy(basename, data->ucm->baseName);
722 uprv_strcat(basename, ".ucm");
723
724 /* read the base table */
725 dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
726 if(U_FAILURE(*pErrorCode)) {
727 return;
728 } else if(!dataIsBase) {
729 fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
730 *pErrorCode=U_INVALID_TABLE_FORMAT;
731 } else {
732 /* prepare the extension table */
733 data->extData=CnvExtOpen(data->ucm);
734 if(data->extData==NULL) {
735 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
736 } else {
737 /* fill in gaps in extension file header fields */
738 UCMapping *m, *mLimit;
739 uint8_t fallbackFlags;
740
741 baseStates=&baseData.ucm->states;
742 if(states->conversionType==UCNV_DBCS) {
743 staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
744 } else if(states->minCharLength==0) {
745 staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
746 }
747 if(states->maxCharLength<states->minCharLength) {
748 staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
749 }
750
751 if(staticData->subCharLen==0) {
752 uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
753 staticData->subCharLen=baseData.staticData.subCharLen;
754 }
755 /*
756 * do not copy subChar1 -
757 * only use what is explicitly specified
758 * because it cannot be unset in the extension file header
759 */
760
761 /* get the fallback flags */
762 fallbackFlags=0;
763 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
764 m<mLimit && fallbackFlags!=3;
765 ++m
766 ) {
767 if(m->f==1) {
768 fallbackFlags|=1;
769 } else if(m->f==3) {
770 fallbackFlags|=2;
771 }
772 }
773
774 if(fallbackFlags&1) {
775 staticData->hasFromUnicodeFallback=TRUE;
776 }
777 if(fallbackFlags&2) {
778 staticData->hasToUnicodeFallback=TRUE;
779 }
780
781 if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
782 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
783 *pErrorCode=U_INVALID_TABLE_FORMAT;
784
785 } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
786 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
787 *pErrorCode=U_INVALID_TABLE_FORMAT;
788
789 } else if(
790 !ucm_checkValidity(data->ucm->ext, baseStates) ||
791 !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
792 ) {
793 *pErrorCode=U_INVALID_TABLE_FORMAT;
794 } else {
795 if(states->maxCharLength>1) {
796 /*
797 * When building a normal .cnv file with a base table
798 * for an MBCS (not SBCS) table with explicit precision flags,
799 * the MBCSAddTable() function marks some mappings for moving
800 * to the extension table.
801 * They fit into the base toUnicode table but not into the
802 * base fromUnicode table.
803 * (Note: We do have explicit precision flags because they are
804 * required for extension table generation, and
805 * ucm_checkBaseExt() verified it.)
806 *
807 * We do not call MBCSAddTable() here (we probably could)
808 * so we need to do the analysis before building the extension table.
809 * We assume that MBCSAddTable() will build a UTF-8-friendly table.
810 * Redundant mappings in the extension table are ok except they cost some size.
811 *
812 * Do this after ucm_checkBaseExt().
813 */
814 const MBCSData *mbcsData=MBCSGetDummy();
815 int32_t needsMove=0;
816 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
817 m<mLimit;
818 ++m
819 ) {
820 if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
821 m->f|=MBCS_FROM_U_EXT_FLAG;
822 m->moveFlag=UCM_MOVE_TO_EXT;
823 ++needsMove;
824 }
825 }
826
827 if(needsMove!=0) {
828 ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
829 ucm_sortTable(data->ucm->ext);
830 }
831 }
832 if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
833 *pErrorCode=U_INVALID_TABLE_FORMAT;
834 }
835 }
836 }
837 }
838
839 cleanupConvData(&baseData);
840 }
841 }
842
843 /*
844 * Hey, Emacs, please set the following:
845 *
846 * Local Variables:
847 * indent-tabs-mode: nil
848 * End:
849 *
850 */
851