1 /*
2 ********************************************************************************
3 *
4 * Copyright (C) 1998-2015, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ********************************************************************************
8 *
9 *
10 * makeconv.cpp:
11 * tool creating a binary (compressed) representation of the conversion mapping
12 * table (IBM NLTC ucmap format).
13 *
14 * 05/04/2000 helena Added fallback mapping into the picture...
15 * 06/29/2000 helena Major rewrite of the callback APIs.
16 */
17
18 #include <stdio.h>
19 #include "unicode/putil.h"
20 #include "unicode/ucnv_err.h"
21 #include "charstr.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_imp.h"
24 #include "ucnv_cnv.h"
25 #include "cstring.h"
26 #include "cmemory.h"
27 #include "uinvchar.h"
28 #include "filestrm.h"
29 #include "toolutil.h"
30 #include "uoptions.h"
31 #include "unicode/udata.h"
32 #include "unewdata.h"
33 #include "uparse.h"
34 #include "ucm.h"
35 #include "makeconv.h"
36 #include "genmbcs.h"
37
38 #define DEBUG 0
39
40 typedef struct ConvData {
41 UCMFile *ucm;
42 NewConverter *cnvData, *extData;
43 UConverterSharedData sharedData;
44 UConverterStaticData staticData;
45 } ConvData;
46
47 static void
initConvData(ConvData * data)48 initConvData(ConvData *data) {
49 uprv_memset(data, 0, sizeof(ConvData));
50 data->sharedData.structSize=sizeof(UConverterSharedData);
51 data->staticData.structSize=sizeof(UConverterStaticData);
52 data->sharedData.staticData=&data->staticData;
53 }
54
55 static void
cleanupConvData(ConvData * data)56 cleanupConvData(ConvData *data) {
57 if(data!=NULL) {
58 if(data->cnvData!=NULL) {
59 data->cnvData->close(data->cnvData);
60 data->cnvData=NULL;
61 }
62 if(data->extData!=NULL) {
63 data->extData->close(data->extData);
64 data->extData=NULL;
65 }
66 ucm_close(data->ucm);
67 data->ucm=NULL;
68 }
69 }
70
71 /*
72 * from ucnvstat.c - static prototypes of data-based converters
73 */
74 U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];
75
76 /*
77 * Global - verbosity
78 */
79 UBool VERBOSE = FALSE;
80 UBool QUIET = FALSE;
81 UBool SMALL = FALSE;
82 UBool IGNORE_SISO_CHECK = FALSE;
83
84 static void
85 createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
86
87 /*
88 * Set up the UNewData and write the converter..
89 */
90 static void
91 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
92
93 UBool haveCopyright=TRUE;
94
95 static UDataInfo dataInfo={
96 sizeof(UDataInfo),
97 0,
98
99 U_IS_BIG_ENDIAN,
100 U_CHARSET_FAMILY,
101 sizeof(UChar),
102 0,
103
104 {0x63, 0x6e, 0x76, 0x74}, /* dataFormat="cnvt" */
105 {6, 2, 0, 0}, /* formatVersion */
106 {0, 0, 0, 0} /* dataVersion (calculated at runtime) */
107 };
108
109 static void
writeConverterData(ConvData * data,const char * cnvName,const char * cnvDir,UErrorCode * status)110 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
111 {
112 UNewDataMemory *mem = NULL;
113 uint32_t sz2;
114 uint32_t size = 0;
115 int32_t tableType;
116
117 if(U_FAILURE(*status))
118 {
119 return;
120 }
121
122 tableType=TABLE_NONE;
123 if(data->cnvData!=NULL) {
124 tableType|=TABLE_BASE;
125 }
126 if(data->extData!=NULL) {
127 tableType|=TABLE_EXT;
128 }
129
130 mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
131
132 if(U_FAILURE(*status))
133 {
134 fprintf(stderr, "Couldn't create the udata %s.%s: %s\n",
135 cnvName,
136 "cnv",
137 u_errorName(*status));
138 return;
139 }
140
141 if(VERBOSE)
142 {
143 printf("- Opened udata %s.%s\n", cnvName, "cnv");
144 }
145
146
147 /* all read only, clean, platform independent data. Mmmm. :) */
148 udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
149 size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */
150 /* Now, write the table */
151 if(tableType&TABLE_BASE) {
152 size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
153 }
154 if(tableType&TABLE_EXT) {
155 size += data->extData->write(data->extData, &data->staticData, mem, tableType);
156 }
157
158 sz2 = udata_finish(mem, status);
159 if(size != sz2)
160 {
161 fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size);
162 *status=U_INTERNAL_PROGRAM_ERROR;
163 }
164 if(VERBOSE)
165 {
166 printf("- Wrote %u bytes to the udata.\n", (int)sz2);
167 }
168 }
169
170 enum {
171 OPT_HELP_H,
172 OPT_HELP_QUESTION_MARK,
173 OPT_COPYRIGHT,
174 OPT_VERSION,
175 OPT_DESTDIR,
176 OPT_VERBOSE,
177 OPT_SMALL,
178 OPT_IGNORE_SISO_CHECK,
179 OPT_QUIET,
180
181 OPT_COUNT
182 };
183
184 static UOption options[]={
185 UOPTION_HELP_H,
186 UOPTION_HELP_QUESTION_MARK,
187 UOPTION_COPYRIGHT,
188 UOPTION_VERSION,
189 UOPTION_DESTDIR,
190 UOPTION_VERBOSE,
191 { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
192 { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
193 UOPTION_QUIET,
194 };
195
main(int argc,char * argv[])196 int main(int argc, char* argv[])
197 {
198 ConvData data;
199 char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
200
201 U_MAIN_INIT_ARGS(argc, argv);
202
203 /* Set up the ICU version number */
204 UVersionInfo icuVersion;
205 u_getVersion(icuVersion);
206 uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
207
208 /* preset then read command line options */
209 options[OPT_DESTDIR].value=u_getDataDirectory();
210 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
211
212 /* error handling, printing usage message */
213 if(argc<0) {
214 fprintf(stderr,
215 "error in command line argument \"%s\"\n",
216 argv[-argc]);
217 } else if(argc<2) {
218 argc=-1;
219 }
220 if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
221 FILE *stdfile=argc<0 ? stderr : stdout;
222 fprintf(stdfile,
223 "usage: %s [-options] files...\n"
224 "\tread .ucm codepage mapping files and write .cnv files\n"
225 "options:\n"
226 "\t-h or -? or --help this usage text\n"
227 "\t-V or --version show a version message\n"
228 "\t-c or --copyright include a copyright notice\n"
229 "\t-d or --destdir destination directory, followed by the path\n"
230 "\t-v or --verbose Turn on verbose output\n"
231 "\t-q or --quiet do not display warnings and progress\n",
232 argv[0]);
233 fprintf(stdfile,
234 "\t --small Generate smaller .cnv files. They will be\n"
235 "\t significantly smaller but may not be compatible with\n"
236 "\t older versions of ICU and will require heap memory\n"
237 "\t allocation when loaded.\n"
238 "\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n");
239 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
240 }
241
242 if(options[OPT_VERSION].doesOccur) {
243 printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
244 dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
245 printf("%s\n", U_COPYRIGHT_STRING);
246 exit(0);
247 }
248
249 /* get the options values */
250 haveCopyright = options[OPT_COPYRIGHT].doesOccur;
251 const char *destdir = options[OPT_DESTDIR].value;
252 VERBOSE = options[OPT_VERBOSE].doesOccur;
253 QUIET = options[OPT_QUIET].doesOccur;
254 SMALL = options[OPT_SMALL].doesOccur;
255
256 if (options[OPT_IGNORE_SISO_CHECK].doesOccur) {
257 IGNORE_SISO_CHECK = TRUE;
258 }
259
260 icu::CharString outFileName;
261 UErrorCode err = U_ZERO_ERROR;
262 if (destdir != NULL && *destdir != 0) {
263 outFileName.append(destdir, err).ensureEndsWithFileSeparator(err);
264 if (U_FAILURE(err)) {
265 return err;
266 }
267 }
268 int32_t outBasenameStart = outFileName.length();
269
270 #if DEBUG
271 {
272 int i;
273 printf("makeconv: processing %d files...\n", argc - 1);
274 for(i=1; i<argc; ++i) {
275 printf("%s ", argv[i]);
276 }
277 printf("\n");
278 fflush(stdout);
279 }
280 #endif
281
282 UBool printFilename = (UBool) (argc > 2 || VERBOSE);
283 for (++argv; --argc; ++argv)
284 {
285 UErrorCode localError = U_ZERO_ERROR;
286 const char *arg = getLongPathname(*argv);
287
288 /*produces the right destination path for display*/
289 outFileName.truncate(outBasenameStart);
290 if (outBasenameStart != 0)
291 {
292 /* find the last file sepator */
293 const char *basename = findBasename(arg);
294 outFileName.append(basename, localError);
295 }
296 else
297 {
298 outFileName.append(arg, localError);
299 }
300 if (U_FAILURE(localError)) {
301 return localError;
302 }
303
304 /*removes the extension if any is found*/
305 int32_t lastDotIndex = outFileName.lastIndexOf('.');
306 if (lastDotIndex >= outBasenameStart) {
307 outFileName.truncate(lastDotIndex);
308 }
309
310 /* the basename without extension is the converter name */
311 if ((outFileName.length() - outBasenameStart) >= UPRV_LENGTHOF(cnvName)) {
312 fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart);
313 return U_BUFFER_OVERFLOW_ERROR;
314 }
315 uprv_strcpy(cnvName, outFileName.data() + outBasenameStart);
316
317 /*Adds the target extension*/
318 outFileName.append(CONVERTER_FILE_EXTENSION, localError);
319 if (U_FAILURE(localError)) {
320 return localError;
321 }
322
323 #if DEBUG
324 printf("makeconv: processing %s ...\n", arg);
325 fflush(stdout);
326 #endif
327 initConvData(&data);
328 createConverter(&data, arg, &localError);
329
330 if (U_FAILURE(localError))
331 {
332 /* if an error is found, print out an error msg and keep going */
333 fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n",
334 outFileName.data(), arg, u_errorName(localError));
335 if(U_SUCCESS(err)) {
336 err = localError;
337 }
338 }
339 else
340 {
341 /* Insure the static data name matches the file name */
342 /* Changed to ignore directory and only compare base name
343 LDH 1/2/08*/
344 char *p;
345 p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */
346
347 if(p == NULL) /* OK, try alternate */
348 {
349 p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
350 if(p == NULL)
351 {
352 p=cnvName; /* If no separators, no problem */
353 }
354 }
355 else
356 {
357 p++; /* If found separator, don't include it in compare */
358 }
359 if(uprv_stricmp(p,data.staticData.name) && !QUIET)
360 {
361 fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
362 cnvName, CONVERTER_FILE_EXTENSION,
363 data.staticData.name);
364 }
365
366 uprv_strcpy((char*)data.staticData.name, cnvName);
367
368 if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
369 fprintf(stderr,
370 "Error: A converter name must contain only invariant characters.\n"
371 "%s is not a valid converter name.\n",
372 data.staticData.name);
373 if(U_SUCCESS(err)) {
374 err = U_INVALID_TABLE_FORMAT;
375 }
376 }
377
378 localError = U_ZERO_ERROR;
379 writeConverterData(&data, cnvName, destdir, &localError);
380
381 if(U_FAILURE(localError))
382 {
383 /* if an error is found, print out an error msg and keep going*/
384 fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg,
385 u_errorName(localError));
386 if(U_SUCCESS(err)) {
387 err = localError;
388 }
389 }
390 else if (printFilename)
391 {
392 puts(outFileName.data() + outBasenameStart);
393 }
394 }
395 fflush(stdout);
396 fflush(stderr);
397
398 cleanupConvData(&data);
399 }
400
401 return err;
402 }
403
404 static void
getPlatformAndCCSIDFromName(const char * name,int8_t * pPlatform,int32_t * pCCSID)405 getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) {
406 if( (name[0]=='i' || name[0]=='I') &&
407 (name[1]=='b' || name[1]=='B') &&
408 (name[2]=='m' || name[2]=='M')
409 ) {
410 name+=3;
411 if(*name=='-') {
412 ++name;
413 }
414 *pPlatform=UCNV_IBM;
415 *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10);
416 } else {
417 *pPlatform=UCNV_UNKNOWN;
418 *pCCSID=0;
419 }
420 }
421
422 static void
readHeader(ConvData * data,FileStream * convFile,UErrorCode * pErrorCode)423 readHeader(ConvData *data,
424 FileStream* convFile,
425 UErrorCode *pErrorCode) {
426 char line[1024];
427 char *s, *key, *value;
428 const UConverterStaticData *prototype;
429 UConverterStaticData *staticData;
430
431 if(U_FAILURE(*pErrorCode)) {
432 return;
433 }
434
435 staticData=&data->staticData;
436 staticData->platform=UCNV_IBM;
437 staticData->subCharLen=0;
438
439 while(T_FileStream_readLine(convFile, line, sizeof(line))) {
440 /* basic parsing and handling of state-related items */
441 if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
442 continue;
443 }
444
445 /* stop at the beginning of the mapping section */
446 if(uprv_strcmp(line, "CHARMAP")==0) {
447 break;
448 }
449
450 /* collect the information from the header field, ignore unknown keys */
451 if(uprv_strcmp(key, "code_set_name")==0) {
452 if(*value!=0) {
453 uprv_strcpy((char *)staticData->name, value);
454 getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
455 }
456 } else if(uprv_strcmp(key, "subchar")==0) {
457 uint8_t bytes[UCNV_EXT_MAX_BYTES];
458 int8_t length;
459
460 s=value;
461 length=ucm_parseBytes(bytes, line, (const char **)&s);
462 if(1<=length && length<=4 && *s==0) {
463 staticData->subCharLen=length;
464 uprv_memcpy(staticData->subChar, bytes, length);
465 } else {
466 fprintf(stderr, "error: illegal <subchar> %s\n", value);
467 *pErrorCode=U_INVALID_TABLE_FORMAT;
468 return;
469 }
470 } else if(uprv_strcmp(key, "subchar1")==0) {
471 uint8_t bytes[UCNV_EXT_MAX_BYTES];
472
473 s=value;
474 if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) {
475 staticData->subChar1=bytes[0];
476 } else {
477 fprintf(stderr, "error: illegal <subchar1> %s\n", value);
478 *pErrorCode=U_INVALID_TABLE_FORMAT;
479 return;
480 }
481 }
482 }
483
484 /* copy values from the UCMFile to the static data */
485 staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
486 staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
487 staticData->conversionType=data->ucm->states.conversionType;
488
489 if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
490 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
491 *pErrorCode=U_INVALID_TABLE_FORMAT;
492 return;
493 }
494
495 /*
496 * Now that we know the type, copy any 'default' values from the table.
497 * We need not check the type any further because the parser only
498 * recognizes what we have prototypes for.
499 *
500 * For delta (extension-only) tables, copy values from the base file
501 * instead, see createConverter().
502 */
503 if(data->ucm->baseName[0]==0) {
504 prototype=ucnv_converterStaticData[staticData->conversionType];
505 if(prototype!=NULL) {
506 if(staticData->name[0]==0) {
507 uprv_strcpy((char *)staticData->name, prototype->name);
508 }
509
510 if(staticData->codepage==0) {
511 staticData->codepage=prototype->codepage;
512 }
513
514 if(staticData->platform==0) {
515 staticData->platform=prototype->platform;
516 }
517
518 if(staticData->minBytesPerChar==0) {
519 staticData->minBytesPerChar=prototype->minBytesPerChar;
520 }
521
522 if(staticData->maxBytesPerChar==0) {
523 staticData->maxBytesPerChar=prototype->maxBytesPerChar;
524 }
525
526 if(staticData->subCharLen==0) {
527 staticData->subCharLen=prototype->subCharLen;
528 if(prototype->subCharLen>0) {
529 uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
530 }
531 }
532 }
533 }
534
535 if(data->ucm->states.outputType<0) {
536 data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1;
537 }
538
539 if( staticData->subChar1!=0 &&
540 (staticData->minBytesPerChar>1 ||
541 (staticData->conversionType!=UCNV_MBCS &&
542 staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
543 ) {
544 fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
545 *pErrorCode=U_INVALID_TABLE_FORMAT;
546 }
547 }
548
549 /* return TRUE if a base table was read, FALSE for an extension table */
550 static UBool
readFile(ConvData * data,const char * converterName,UErrorCode * pErrorCode)551 readFile(ConvData *data, const char* converterName,
552 UErrorCode *pErrorCode) {
553 char line[1024];
554 char *end;
555 FileStream *convFile;
556
557 UCMStates *baseStates;
558 UBool dataIsBase;
559
560 if(U_FAILURE(*pErrorCode)) {
561 return FALSE;
562 }
563
564 data->ucm=ucm_open();
565
566 convFile=T_FileStream_open(converterName, "r");
567 if(convFile==NULL) {
568 *pErrorCode=U_FILE_ACCESS_ERROR;
569 return FALSE;
570 }
571
572 readHeader(data, convFile, pErrorCode);
573 if(U_FAILURE(*pErrorCode)) {
574 return FALSE;
575 }
576
577 if(data->ucm->baseName[0]==0) {
578 dataIsBase=TRUE;
579 baseStates=&data->ucm->states;
580 ucm_processStates(baseStates, IGNORE_SISO_CHECK);
581 } else {
582 dataIsBase=FALSE;
583 baseStates=NULL;
584 }
585
586 /* read the base table */
587 ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
588 if(U_FAILURE(*pErrorCode)) {
589 return FALSE;
590 }
591
592 /* read an extension table if there is one */
593 while(T_FileStream_readLine(convFile, line, sizeof(line))) {
594 end=uprv_strchr(line, 0);
595 while(line<end &&
596 (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
597 --end;
598 }
599 *end=0;
600
601 if(line[0]=='#' || u_skipWhitespace(line)==end) {
602 continue; /* ignore empty and comment lines */
603 }
604
605 if(0==uprv_strcmp(line, "CHARMAP")) {
606 /* read the extension table */
607 ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
608 } else {
609 fprintf(stderr, "unexpected text after the base mapping table\n");
610 }
611 break;
612 }
613
614 T_FileStream_close(convFile);
615
616 if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
617 fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
618 *pErrorCode=U_INVALID_TABLE_FORMAT;
619 }
620
621 return dataIsBase;
622 }
623
624 static void
createConverter(ConvData * data,const char * converterName,UErrorCode * pErrorCode)625 createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
626 ConvData baseData;
627 UBool dataIsBase;
628
629 UConverterStaticData *staticData;
630 UCMStates *states, *baseStates;
631
632 if(U_FAILURE(*pErrorCode)) {
633 return;
634 }
635
636 initConvData(data);
637
638 dataIsBase=readFile(data, converterName, pErrorCode);
639 if(U_FAILURE(*pErrorCode)) {
640 return;
641 }
642
643 staticData=&data->staticData;
644 states=&data->ucm->states;
645
646 if(dataIsBase) {
647 /*
648 * Build a normal .cnv file with a base table
649 * and an optional extension table.
650 */
651 data->cnvData=MBCSOpen(data->ucm);
652 if(data->cnvData==NULL) {
653 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
654
655 } else if(!data->cnvData->isValid(data->cnvData,
656 staticData->subChar, staticData->subCharLen)
657 ) {
658 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
659 *pErrorCode=U_INVALID_TABLE_FORMAT;
660
661 } else if(staticData->subChar1!=0 &&
662 !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
663 ) {
664 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
665 *pErrorCode=U_INVALID_TABLE_FORMAT;
666
667 } else if(
668 data->ucm->ext->mappingsLength>0 &&
669 !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
670 ) {
671 *pErrorCode=U_INVALID_TABLE_FORMAT;
672 } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
673 /* sort the table so that it can be turned into UTF-8-friendly data */
674 ucm_sortTable(data->ucm->base);
675 }
676
677 if(U_SUCCESS(*pErrorCode)) {
678 if(
679 /* add the base table after ucm_checkBaseExt()! */
680 !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
681 ) {
682 *pErrorCode=U_INVALID_TABLE_FORMAT;
683 } else {
684 /*
685 * addTable() may have requested moving more mappings to the extension table
686 * if they fit into the base toUnicode table but not into the
687 * base fromUnicode table.
688 * (Especially for UTF-8-friendly fromUnicode tables.)
689 * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
690 * to be excluded from the extension toUnicode data.
691 * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
692 * the base fromUnicode table.
693 */
694 ucm_moveMappings(data->ucm->base, data->ucm->ext);
695 ucm_sortTable(data->ucm->ext);
696 if(data->ucm->ext->mappingsLength>0) {
697 /* prepare the extension table, if there is one */
698 data->extData=CnvExtOpen(data->ucm);
699 if(data->extData==NULL) {
700 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
701 } else if(
702 !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
703 ) {
704 *pErrorCode=U_INVALID_TABLE_FORMAT;
705 }
706 }
707 }
708 }
709 } else {
710 /* Build an extension-only .cnv file. */
711 char baseFilename[500];
712 char *basename;
713
714 initConvData(&baseData);
715
716 /* assemble a path/filename for data->ucm->baseName */
717 uprv_strcpy(baseFilename, converterName);
718 basename=(char *)findBasename(baseFilename);
719 uprv_strcpy(basename, data->ucm->baseName);
720 uprv_strcat(basename, ".ucm");
721
722 /* read the base table */
723 dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
724 if(U_FAILURE(*pErrorCode)) {
725 return;
726 } else if(!dataIsBase) {
727 fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
728 *pErrorCode=U_INVALID_TABLE_FORMAT;
729 } else {
730 /* prepare the extension table */
731 data->extData=CnvExtOpen(data->ucm);
732 if(data->extData==NULL) {
733 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
734 } else {
735 /* fill in gaps in extension file header fields */
736 UCMapping *m, *mLimit;
737 uint8_t fallbackFlags;
738
739 baseStates=&baseData.ucm->states;
740 if(states->conversionType==UCNV_DBCS) {
741 staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
742 } else if(states->minCharLength==0) {
743 staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
744 }
745 if(states->maxCharLength<states->minCharLength) {
746 staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
747 }
748
749 if(staticData->subCharLen==0) {
750 uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
751 staticData->subCharLen=baseData.staticData.subCharLen;
752 }
753 /*
754 * do not copy subChar1 -
755 * only use what is explicitly specified
756 * because it cannot be unset in the extension file header
757 */
758
759 /* get the fallback flags */
760 fallbackFlags=0;
761 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
762 m<mLimit && fallbackFlags!=3;
763 ++m
764 ) {
765 if(m->f==1) {
766 fallbackFlags|=1;
767 } else if(m->f==3) {
768 fallbackFlags|=2;
769 }
770 }
771
772 if(fallbackFlags&1) {
773 staticData->hasFromUnicodeFallback=TRUE;
774 }
775 if(fallbackFlags&2) {
776 staticData->hasToUnicodeFallback=TRUE;
777 }
778
779 if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
780 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
781 *pErrorCode=U_INVALID_TABLE_FORMAT;
782
783 } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
784 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
785 *pErrorCode=U_INVALID_TABLE_FORMAT;
786
787 } else if(
788 !ucm_checkValidity(data->ucm->ext, baseStates) ||
789 !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
790 ) {
791 *pErrorCode=U_INVALID_TABLE_FORMAT;
792 } else {
793 if(states->maxCharLength>1) {
794 /*
795 * When building a normal .cnv file with a base table
796 * for an MBCS (not SBCS) table with explicit precision flags,
797 * the MBCSAddTable() function marks some mappings for moving
798 * to the extension table.
799 * They fit into the base toUnicode table but not into the
800 * base fromUnicode table.
801 * (Note: We do have explicit precision flags because they are
802 * required for extension table generation, and
803 * ucm_checkBaseExt() verified it.)
804 *
805 * We do not call MBCSAddTable() here (we probably could)
806 * so we need to do the analysis before building the extension table.
807 * We assume that MBCSAddTable() will build a UTF-8-friendly table.
808 * Redundant mappings in the extension table are ok except they cost some size.
809 *
810 * Do this after ucm_checkBaseExt().
811 */
812 const MBCSData *mbcsData=MBCSGetDummy();
813 int32_t needsMove=0;
814 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
815 m<mLimit;
816 ++m
817 ) {
818 if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
819 m->f|=MBCS_FROM_U_EXT_FLAG;
820 m->moveFlag=UCM_MOVE_TO_EXT;
821 ++needsMove;
822 }
823 }
824
825 if(needsMove!=0) {
826 ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
827 ucm_sortTable(data->ucm->ext);
828 }
829 }
830 if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
831 *pErrorCode=U_INVALID_TABLE_FORMAT;
832 }
833 }
834 }
835 }
836
837 cleanupConvData(&baseData);
838 }
839 }
840
841 /*
842 * Hey, Emacs, please set the following:
843 *
844 * Local Variables:
845 * indent-tabs-mode: nil
846 * End:
847 *
848 */
849