1 /*****************************************************************************
2 *
3 * Copyright (C) 1999-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ******************************************************************************/
7
8 /*
9 * uconv(1): an iconv(1)-like converter using ICU.
10 *
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
13 *
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
16 *
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
19 */
20
21 #include <unicode/utypes.h>
22 #include <unicode/putil.h>
23 #include <unicode/ucnv.h>
24 #include <unicode/uenum.h>
25 #include <unicode/unistr.h>
26 #include <unicode/translit.h>
27 #include <unicode/uset.h>
28 #include <unicode/uclean.h>
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include <string.h>
33 #include <stdlib.h>
34
35 #include "cmemory.h"
36 #include "cstring.h"
37 #include "ustrfmt.h"
38
39 #include "unicode/uwmsg.h"
40
41 U_NAMESPACE_USE
42
43 #if (defined(U_WINDOWS) || defined(U_CYGWIN)) && !defined(__STRICT_ANSI__)
44 #include <io.h>
45 #include <fcntl.h>
46 #if defined(U_WINDOWS)
47 #define USE_FILENO_BINARY_MODE 1
48 /* Windows likes to rename Unix-like functions */
49 #ifndef fileno
50 #define fileno _fileno
51 #endif
52 #ifndef setmode
53 #define setmode _setmode
54 #endif
55 #ifndef O_BINARY
56 #define O_BINARY _O_BINARY
57 #endif
58 #endif
59 #endif
60
61 #ifdef UCONVMSG_LINK
62 /* below from the README */
63 #include "unicode/utypes.h"
64 #include "unicode/udata.h"
65 U_CFUNC char uconvmsg_dat[];
66 #endif
67
68 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
69
70 #define DEFAULT_BUFSZ 4096
71 #define UCONVMSG "uconvmsg"
72
73 static UResourceBundle *gBundle = 0; /* Bundle containing messages. */
74
75 /*
76 * Initialize the message bundle so that message strings can be fetched
77 * by u_wmsg().
78 *
79 */
80
initMsg(const char * pname)81 static void initMsg(const char *pname) {
82 static int ps = 0;
83
84 if (!ps) {
85 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */
86 UErrorCode err = U_ZERO_ERROR;
87
88 ps = 1;
89
90 /* Set up our static data - if any */
91 #ifdef UCONVMSG_LINK
92 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
93 if (U_FAILURE(err)) {
94 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
95 pname, u_errorName(err));
96 err = U_ZERO_ERROR; /* It may still fail */
97 }
98 #endif
99
100 /* Get messages. */
101 gBundle = u_wmsg_setPath(UCONVMSG, &err);
102 if (U_FAILURE(err)) {
103 fprintf(stderr,
104 "%s: warning: couldn't open bundle %s: %s\n",
105 pname, UCONVMSG, u_errorName(err));
106 #ifdef UCONVMSG_LINK
107 fprintf(stderr,
108 "%s: setAppData was called, internal data %s failed to load\n",
109 pname, UCONVMSG);
110 #endif
111
112 err = U_ZERO_ERROR;
113 /* that was try #1, try again with a path */
114 uprv_strcpy(dataPath, u_getDataDirectory());
115 uprv_strcat(dataPath, U_FILE_SEP_STRING);
116 uprv_strcat(dataPath, UCONVMSG);
117
118 gBundle = u_wmsg_setPath(dataPath, &err);
119 if (U_FAILURE(err)) {
120 fprintf(stderr,
121 "%s: warning: still couldn't open bundle %s: %s\n",
122 pname, dataPath, u_errorName(err));
123 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
124 }
125 }
126 }
127 }
128
129 /* Mapping of callback names to the callbacks passed to the converter
130 API. */
131
132 static struct callback_ent {
133 const char *name;
134 UConverterFromUCallback fromu;
135 const void *fromuctxt;
136 UConverterToUCallback tou;
137 const void *touctxt;
138 } transcode_callbacks[] = {
139 { "substitute",
140 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
141 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
142 { "skip",
143 UCNV_FROM_U_CALLBACK_SKIP, 0,
144 UCNV_TO_U_CALLBACK_SKIP, 0 },
145 { "stop",
146 UCNV_FROM_U_CALLBACK_STOP, 0,
147 UCNV_TO_U_CALLBACK_STOP, 0 },
148 { "escape",
149 UCNV_FROM_U_CALLBACK_ESCAPE, 0,
150 UCNV_TO_U_CALLBACK_ESCAPE, 0},
151 { "escape-icu",
152 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
153 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
154 { "escape-java",
155 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
156 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
157 { "escape-c",
158 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
159 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
160 { "escape-xml",
161 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
162 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
163 { "escape-xml-hex",
164 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
165 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
166 { "escape-xml-dec",
167 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
168 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
169 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
170 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
171 };
172
173 /* Return a pointer to a callback record given its name. */
174
findCallback(const char * name)175 static const struct callback_ent *findCallback(const char *name) {
176 int i, count =
177 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
178
179 /* We'll do a linear search, there aren't many of them and bsearch()
180 may not be that portable. */
181
182 for (i = 0; i < count; ++i) {
183 if (!uprv_stricmp(name, transcode_callbacks[i].name)) {
184 return &transcode_callbacks[i];
185 }
186 }
187
188 return 0;
189 }
190
191 /* Print converter information. If lookfor is set, only that converter will
192 be printed, otherwise all converters will be printed. If canon is non
193 zero, tags and aliases for each converter are printed too, in the format
194 expected for convrters.txt(5). */
195
printConverters(const char * pname,const char * lookfor,UBool canon)196 static int printConverters(const char *pname, const char *lookfor,
197 UBool canon)
198 {
199 UErrorCode err = U_ZERO_ERROR;
200 int32_t num;
201 uint16_t num_stds;
202 const char **stds;
203
204 /* If there is a specified name, just handle that now. */
205
206 if (lookfor) {
207 if (!canon) {
208 printf("%s\n", lookfor);
209 return 0;
210 } else {
211 /* Because we are printing a canonical name, we need the
212 true converter name. We've done that already except for
213 the default name (because we want to print the exact
214 name one would get when calling ucnv_getDefaultName()
215 in non-canon mode). But since we do not know at this
216 point if we have the default name or something else, we
217 need to normalize again to the canonical converter
218 name. */
219
220 const char *truename = ucnv_getAlias(lookfor, 0, &err);
221 if (U_SUCCESS(err)) {
222 lookfor = truename;
223 } else {
224 err = U_ZERO_ERROR;
225 }
226 }
227 }
228
229 /* Print converter names. We come here for one of two reasons: we
230 are printing all the names (lookfor was null), or we have a
231 single converter to print but in canon mode, hence we need to
232 get to it in order to print everything. */
233
234 num = ucnv_countAvailable();
235 if (num <= 0) {
236 initMsg(pname);
237 u_wmsg(stderr, "cantGetNames");
238 return -1;
239 }
240 if (lookfor) {
241 num = 1; /* We know where we want to be. */
242 }
243
244 num_stds = ucnv_countStandards();
245 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
246 if (!stds) {
247 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
248 return -1;
249 } else {
250 uint16_t s;
251
252 if (canon) {
253 printf("{ ");
254 }
255 for (s = 0; s < num_stds; ++s) {
256 stds[s] = ucnv_getStandard(s, &err);
257 if (canon) {
258 printf("%s ", stds[s]);
259 }
260 if (U_FAILURE(err)) {
261 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
262 return -1;
263 }
264 }
265 if (canon) {
266 puts("}");
267 }
268 }
269
270 for (int32_t i = 0; i < num; i++) {
271 const char *name;
272 uint16_t num_aliases;
273
274 /* Set the name either to what we are looking for, or
275 to the current converter name. */
276
277 if (lookfor) {
278 name = lookfor;
279 } else {
280 name = ucnv_getAvailableName(i);
281 }
282
283 /* Get all the aliases associated to the name. */
284
285 err = U_ZERO_ERROR;
286 num_aliases = ucnv_countAliases(name, &err);
287 if (U_FAILURE(err)) {
288 printf("%s", name);
289
290 UnicodeString str(name, "");
291 putchar('\t');
292 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
293 u_wmsg_errorName(err));
294 return -1;
295 } else {
296 uint16_t a, s, t;
297
298 /* Write all the aliases and their tags. */
299
300 for (a = 0; a < num_aliases; ++a) {
301 const char *alias = ucnv_getAlias(name, a, &err);
302
303 if (U_FAILURE(err)) {
304 UnicodeString str(name, "");
305 putchar('\t');
306 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
307 u_wmsg_errorName(err));
308 return -1;
309 }
310
311 /* Print the current alias so that it looks right. */
312 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
313 alias,
314 (canon ? "" : " "));
315
316 /* Look (slowly, linear searching) for a tag. */
317
318 if (canon) {
319 /* -1 to skip the last standard */
320 for (s = t = 0; s < num_stds-1; ++s) {
321 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
322 if (U_SUCCESS(err)) {
323 /* List the standard tags */
324 const char *standardName;
325 UBool isFirst = TRUE;
326 UErrorCode enumError = U_ZERO_ERROR;
327 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
328 /* See if this alias is supported by this standard. */
329 if (!strcmp(standardName, alias)) {
330 if (!t) {
331 printf(" {");
332 t = 1;
333 }
334 /* Print a * after the default standard name */
335 printf(" %s%s", stds[s], (isFirst ? "*" : ""));
336 }
337 isFirst = FALSE;
338 }
339 }
340 }
341 if (t) {
342 printf(" }");
343 }
344 }
345 /* Terminate this entry. */
346 if (canon) {
347 puts("");
348 }
349
350 /* Move on. */
351 }
352 /* Terminate this entry. */
353 if (!canon) {
354 puts("");
355 }
356 }
357 }
358
359 /* Free temporary data. */
360
361 uprv_free(stds);
362
363 /* Success. */
364
365 return 0;
366 }
367
368 /* Print all available transliterators. If canon is non zero, print
369 one transliterator per line. */
370
printTransliterators(UBool canon)371 static int printTransliterators(UBool canon)
372 {
373 #if UCONFIG_NO_TRANSLITERATION
374 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
375 return 1;
376 #else
377 int32_t numtrans = utrans_countAvailableIDs(), i;
378 int buflen = 512;
379 char *buf = (char *) uprv_malloc(buflen);
380 char staticbuf[512];
381
382 char sepchar = canon ? '\n' : ' ';
383
384 if (!buf) {
385 buf = staticbuf;
386 buflen = sizeof(staticbuf);
387 }
388
389 for (i = 0; i < numtrans; ++i) {
390 int32_t len = utrans_getAvailableID(i, buf, buflen);
391 if (len >= buflen - 1) {
392 if (buf != staticbuf) {
393 buflen <<= 1;
394 if (buflen < len) {
395 buflen = len + 64;
396 }
397 buf = (char *) uprv_realloc(buf, buflen);
398 if (!buf) {
399 buf = staticbuf;
400 buflen = sizeof(staticbuf);
401 }
402 }
403 utrans_getAvailableID(i, buf, buflen);
404 if (len >= buflen) {
405 uprv_strcpy(buf + buflen - 4, "..."); /* Truncate the name. */
406 }
407 }
408
409 printf("%s", buf);
410 if (i < numtrans - 1) {
411 putchar(sepchar);
412 }
413 }
414
415 /* Add a terminating newline if needed. */
416
417 if (sepchar != '\n') {
418 putchar('\n');
419 }
420
421 /* Free temporary data. */
422
423 if (buf != staticbuf) {
424 uprv_free(buf);
425 }
426
427 /* Success. */
428
429 return 0;
430 #endif
431 }
432
// Unicode code points that get special treatment in this file.
enum {
    uLF  = 0x000a,  // line feed
    uCR  = 0x000d,  // carriage return
    uSP  = 0x0020,  // space
    uNL  = 0x0085,  // newline
    uLS  = 0x2028,  // line separator
    uPS  = 0x2029,  // paragraph separator
    uSig = 0xfeff   // signature/BOM character
};
442
443 static inline int32_t
getChunkLimit(const UnicodeString & prev,const UnicodeString & s)444 getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
445 // find one of
446 // CR, LF, CRLF, NL, LS, PS
447 // for paragraph ends (see UAX #13/Unicode 4)
448 // and include it in the chunk
449 // all of these characters are on the BMP
450 // do not include FF or VT in case they are part of a paragraph
451 // (important for bidi contexts)
452 static const UChar paraEnds[] = {
453 0xd, 0xa, 0x85, 0x2028, 0x2029
454 };
455 enum {
456 iCR, iLF, iNL, iLS, iPS, iCount
457 };
458
459 // first, see if there is a CRLF split between prev and s
460 if (prev.endsWith(paraEnds + iCR, 1)) {
461 if (s.startsWith(paraEnds + iLF, 1)) {
462 return 1; // split CRLF, include the LF
463 } else if (!s.isEmpty()) {
464 return 0; // complete the last chunk
465 } else {
466 return -1; // wait for actual further contents to arrive
467 }
468 }
469
470 const UChar *u = s.getBuffer(), *limit = u + s.length();
471 UChar c;
472
473 while (u < limit) {
474 c = *u++;
475 if (
476 ((c < uSP) && (c == uCR || c == uLF)) ||
477 (c == uNL) ||
478 ((c & uLS) == uLS)
479 ) {
480 if (c == uCR) {
481 // check for CRLF
482 if (u == limit) {
483 return -1; // LF may be in the next chunk
484 } else if (*u == uLF) {
485 ++u; // include the LF in this chunk
486 }
487 }
488 return (int32_t)(u - s.getBuffer());
489 }
490 }
491
492 return -1; // continue collecting the chunk
493 }
494
// How a converter deals with the U+FEFF Unicode signature character (BOM);
// these are the values returned by cnvSigType().
enum {
    CNV_NO_FEFF   = 0,  // cannot convert the U+FEFF Unicode signature character (BOM)
    CNV_WITH_FEFF = 1,  // can convert the U+FEFF signature character
    CNV_ADDS_FEFF = 2   // automatically adds/detects the U+FEFF signature character
};
500
501 static inline UChar
nibbleToHex(uint8_t n)502 nibbleToHex(uint8_t n) {
503 n &= 0xf;
504 return
505 n <= 9 ?
506 (UChar)(0x30 + n) :
507 (UChar)((0x61 - 10) + n);
508 }
509
510 // check the converter's Unicode signature properties;
511 // the fromUnicode side of the converter must be in its initial state
512 // and will be reset again if it was used
513 static int32_t
cnvSigType(UConverter * cnv)514 cnvSigType(UConverter *cnv) {
515 UErrorCode err;
516 int32_t result;
517
518 // test if the output charset can convert U+FEFF
519 USet *set = uset_open(1, 0);
520 err = U_ZERO_ERROR;
521 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
522 if (U_SUCCESS(err) && uset_contains(set, uSig)) {
523 result = CNV_WITH_FEFF;
524 } else {
525 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
526 }
527 uset_close(set);
528
529 if (result == CNV_WITH_FEFF) {
530 // test if the output charset emits a signature anyway
531 const UChar a[1] = { 0x61 }; // "a"
532 const UChar *in;
533
534 char buffer[20];
535 char *out;
536
537 in = a;
538 out = buffer;
539 err = U_ZERO_ERROR;
540 ucnv_fromUnicode(cnv,
541 &out, buffer + sizeof(buffer),
542 &in, a + 1,
543 NULL, TRUE, &err);
544 ucnv_resetFromUnicode(cnv);
545
546 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
547 U_SUCCESS(err)
548 ) {
549 result = CNV_ADDS_FEFF;
550 }
551 }
552
553 return result;
554 }
555
556 class ConvertFile {
557 public:
ConvertFile()558 ConvertFile() :
559 buf(NULL), outbuf(NULL), fromoffsets(NULL),
560 bufsz(0), signature(0) {}
561
562 void
setBufferSize(size_t bufferSize)563 setBufferSize(size_t bufferSize) {
564 bufsz = bufferSize;
565
566 buf = new char[2 * bufsz];
567 outbuf = buf + bufsz;
568
569 // +1 for an added U+FEFF in the intermediate Unicode buffer
570 fromoffsets = new int32_t[bufsz + 1];
571 }
572
~ConvertFile()573 ~ConvertFile() {
574 delete [] buf;
575 delete [] fromoffsets;
576 }
577
578 UBool convertFile(const char *pname,
579 const char *fromcpage,
580 UConverterToUCallback toucallback,
581 const void *touctxt,
582 const char *tocpage,
583 UConverterFromUCallback fromucallback,
584 const void *fromuctxt,
585 UBool fallback,
586 const char *translit,
587 const char *infilestr,
588 FILE * outfile, int verbose);
589 private:
590 friend int main(int argc, char **argv);
591
592 char *buf, *outbuf;
593 int32_t *fromoffsets;
594
595 size_t bufsz;
596 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
597 };
598
599 // Convert a file from one encoding to another
600 UBool
convertFile(const char * pname,const char * fromcpage,UConverterToUCallback toucallback,const void * touctxt,const char * tocpage,UConverterFromUCallback fromucallback,const void * fromuctxt,UBool fallback,const char * translit,const char * infilestr,FILE * outfile,int verbose)601 ConvertFile::convertFile(const char *pname,
602 const char *fromcpage,
603 UConverterToUCallback toucallback,
604 const void *touctxt,
605 const char *tocpage,
606 UConverterFromUCallback fromucallback,
607 const void *fromuctxt,
608 UBool fallback,
609 const char *translit,
610 const char *infilestr,
611 FILE * outfile, int verbose)
612 {
613 FILE *infile;
614 UBool ret = TRUE;
615 UConverter *convfrom = 0;
616 UConverter *convto = 0;
617 UErrorCode err = U_ZERO_ERROR;
618 UBool flush;
619 const char *cbufp, *prevbufp;
620 char *bufp;
621
622 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */
623
624 const UChar *unibuf, *unibufbp;
625 UChar *unibufp;
626
627 size_t rd, wr;
628
629 #if !UCONFIG_NO_TRANSLITERATION
630 Transliterator *t = 0; // Transliterator acting on Unicode data.
631 UnicodeString chunk; // One chunk of the text being collected for transformation.
632 #endif
633 UnicodeString u; // String to do the transliteration.
634 int32_t ulen;
635
636 // use conversion offsets for error messages
637 // unless a transliterator is used -
638 // a text transformation will reorder characters in unpredictable ways
639 UBool useOffsets = TRUE;
640
641 // Open the correct input file or connect to stdin for reading input
642
643 if (infilestr != 0 && strcmp(infilestr, "-")) {
644 infile = fopen(infilestr, "rb");
645 if (infile == 0) {
646 UnicodeString str1(infilestr, "");
647 str1.append((UChar32) 0);
648 UnicodeString str2(strerror(errno), "");
649 str2.append((UChar32) 0);
650 initMsg(pname);
651 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
652 return FALSE;
653 }
654 } else {
655 infilestr = "-";
656 infile = stdin;
657 #ifdef USE_FILENO_BINARY_MODE
658 if (setmode(fileno(stdin), O_BINARY) == -1) {
659 initMsg(pname);
660 u_wmsg(stderr, "cantSetInBinMode");
661 return FALSE;
662 }
663 #endif
664 }
665
666 if (verbose) {
667 fprintf(stderr, "%s:\n", infilestr);
668 }
669
670 #if !UCONFIG_NO_TRANSLITERATION
671 // Create transliterator as needed.
672
673 if (translit != NULL && *translit) {
674 UParseError parse;
675 UnicodeString str(translit), pestr;
676
677 /* Create from rules or by ID as needed. */
678
679 parse.line = -1;
680
681 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
682 t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
683 } else {
684 t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
685 }
686
687 if (U_FAILURE(err)) {
688 str.append((UChar32) 0);
689 initMsg(pname);
690
691 if (parse.line >= 0) {
692 UChar linebuf[20], offsetbuf[20];
693 uprv_itou(linebuf, 20, parse.line, 10, 0);
694 uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
695 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
696 u_wmsg_errorName(err), linebuf, offsetbuf);
697 } else {
698 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
699 u_wmsg_errorName(err));
700 }
701
702 if (t) {
703 delete t;
704 t = 0;
705 }
706 goto error_exit;
707 }
708
709 useOffsets = FALSE;
710 }
711 #endif
712
713 // Create codepage converter. If the codepage or its aliases weren't
714 // available, it returns NULL and a failure code. We also set the
715 // callbacks, and return errors in the same way.
716
717 convfrom = ucnv_open(fromcpage, &err);
718 if (U_FAILURE(err)) {
719 UnicodeString str(fromcpage, "");
720 initMsg(pname);
721 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
722 u_wmsg_errorName(err));
723 goto error_exit;
724 }
725 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
726 if (U_FAILURE(err)) {
727 initMsg(pname);
728 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
729 goto error_exit;
730 }
731
732 convto = ucnv_open(tocpage, &err);
733 if (U_FAILURE(err)) {
734 UnicodeString str(tocpage, "");
735 initMsg(pname);
736 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
737 u_wmsg_errorName(err));
738 goto error_exit;
739 }
740 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
741 if (U_FAILURE(err)) {
742 initMsg(pname);
743 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
744 goto error_exit;
745 }
746 ucnv_setFallback(convto, fallback);
747
748 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
749 int8_t sig;
750
751 // OK, we can convert now.
752 sig = signature;
753 rd = 0;
754
755 do {
756 willexit = FALSE;
757
758 // input file offset at the beginning of the next buffer
759 infoffset += rd;
760
761 rd = fread(buf, 1, bufsz, infile);
762 if (ferror(infile) != 0) {
763 UnicodeString str(strerror(errno));
764 initMsg(pname);
765 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
766 goto error_exit;
767 }
768
769 // Convert the read buffer into the new encoding via Unicode.
770 // After the call 'unibufp' will be placed behind the last
771 // character that was converted in the 'unibuf'.
772 // Also the 'cbufp' is positioned behind the last converted
773 // character.
774 // At the last conversion in the file, flush should be set to
775 // true so that we get all characters converted.
776 //
777 // The converter must be flushed at the end of conversion so
778 // that characters on hold also will be written.
779
780 cbufp = buf;
781 flush = (UBool)(rd != bufsz);
782
783 // convert until the input is consumed
784 do {
785 // remember the start of the current byte-to-Unicode conversion
786 prevbufp = cbufp;
787
788 unibuf = unibufp = u.getBuffer((int32_t)bufsz);
789
790 // Use bufsz instead of u.getCapacity() for the targetLimit
791 // so that we don't overflow fromoffsets[].
792 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
793 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
794
795 ulen = (int32_t)(unibufp - unibuf);
796 u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
797
798 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
799 // converting all of the input bytes.
800 // It works like this because ucnv_toUnicode() returns only under the
801 // following conditions:
802 // - an error occurred during conversion (an error code is set)
803 // - the target buffer is filled (the error code indicates an overflow)
804 // - the source is consumed
805 // That is, if the error code does not indicate a failure,
806 // not even an overflow, then the source must be consumed entirely.
807 fromSawEndOfBytes = (UBool)U_SUCCESS(err);
808
809 if (err == U_BUFFER_OVERFLOW_ERROR) {
810 err = U_ZERO_ERROR;
811 } else if (U_FAILURE(err)) {
812 char pos[32], errorBytes[32];
813 int8_t i, length, errorLength;
814
815 UErrorCode localError = U_ZERO_ERROR;
816 errorLength = (int8_t)sizeof(errorBytes);
817 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
818 if (U_FAILURE(localError) || errorLength == 0) {
819 errorLength = 1;
820 }
821
822 // print the input file offset of the start of the error bytes:
823 // input file offset of the current byte buffer +
824 // length of the just consumed bytes -
825 // length of the error bytes
826 length =
827 (int8_t)sprintf(pos, "%d",
828 (int)(infoffset + (cbufp - buf) - errorLength));
829
830 // output the bytes that caused the error
831 UnicodeString str;
832 for (i = 0; i < errorLength; ++i) {
833 if (i > 0) {
834 str.append((UChar)uSP);
835 }
836 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
837 str.append(nibbleToHex((uint8_t)errorBytes[i]));
838 }
839
840 initMsg(pname);
841 u_wmsg(stderr, "problemCvtToU",
842 UnicodeString(pos, length, "").getTerminatedBuffer(),
843 str.getTerminatedBuffer(),
844 u_wmsg_errorName(err));
845
846 willexit = TRUE;
847 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
848 }
849
850 // Replaced a check for whether the input was consumed by
851 // looping until it is; message key "premEndInput" now obsolete.
852
853 if (ulen == 0) {
854 continue;
855 }
856
857 // remove a U+FEFF Unicode signature character if requested
858 if (sig < 0) {
859 if (u.charAt(0) == uSig) {
860 u.remove(0, 1);
861
862 // account for the removed UChar and offset
863 --ulen;
864
865 if (useOffsets) {
866 // remove an offset from fromoffsets[] as well
867 // to keep the array parallel with the UChars
868 memmove(fromoffsets, fromoffsets + 1, ulen * 4);
869 }
870
871 }
872 sig = 0;
873 }
874
875 #if !UCONFIG_NO_TRANSLITERATION
876 // Transliterate/transform if needed.
877
878 // For transformation, we use chunking code -
879 // collect Unicode input until, for example, an end-of-line,
880 // then transform and output-convert that and continue collecting.
881 // This makes the transformation result independent of the buffer size
882 // while avoiding the slower keyboard mode.
883 // The end-of-chunk characters are completely included in the
884 // transformed string in case they are to be transformed themselves.
885 if (t != NULL) {
886 UnicodeString out;
887 int32_t chunkLimit;
888
889 do {
890 chunkLimit = getChunkLimit(chunk, u);
891 if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
892 // use all of the rest at the end of the text
893 chunkLimit = u.length();
894 }
895 if (chunkLimit >= 0) {
896 // complete the chunk and transform it
897 chunk.append(u, 0, chunkLimit);
898 u.remove(0, chunkLimit);
899 t->transliterate(chunk);
900
901 // append the transformation result to the result and empty the chunk
902 out.append(chunk);
903 chunk.remove();
904 } else {
905 // continue collecting the chunk
906 chunk.append(u);
907 break;
908 }
909 } while (!u.isEmpty());
910
911 u = out;
912 ulen = u.length();
913 }
914 #endif
915
916 // add a U+FEFF Unicode signature character if requested
917 // and possible/necessary
918 if (sig > 0) {
919 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
920 u.insert(0, (UChar)uSig);
921
922 if (useOffsets) {
923 // insert a pseudo-offset into fromoffsets[] as well
924 // to keep the array parallel with the UChars
925 memmove(fromoffsets + 1, fromoffsets, ulen * 4);
926 fromoffsets[0] = -1;
927 }
928
929 // account for the additional UChar and offset
930 ++ulen;
931 }
932 sig = 0;
933 }
934
935 // Convert the Unicode buffer into the destination codepage
936 // Again 'bufp' will be placed behind the last converted character
937 // And 'unibufp' will be placed behind the last converted unicode character
938 // At the last conversion flush should be set to true to ensure that
939 // all characters left get converted
940
941 unibuf = unibufbp = u.getBuffer();
942
943 do {
944 bufp = outbuf;
945
946 // Use fromSawEndOfBytes in addition to the flush flag -
947 // it indicates whether the intermediate Unicode string
948 // contains the very last UChars for the very last input bytes.
949 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
950 &unibufbp,
951 unibuf + ulen,
952 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
953
954 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
955 // converting all of the intermediate UChars.
956 // See comment for fromSawEndOfBytes.
957 toSawEndOfUnicode = (UBool)U_SUCCESS(err);
958
959 if (err == U_BUFFER_OVERFLOW_ERROR) {
960 err = U_ZERO_ERROR;
961 } else if (U_FAILURE(err)) {
962 UChar errorUChars[4];
963 const char *errtag;
964 char pos[32];
965 UChar32 c;
966 int8_t i, length, errorLength;
967
968 UErrorCode localError = U_ZERO_ERROR;
969 errorLength = (int8_t)LENGTHOF(errorUChars);
970 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
971 if (U_FAILURE(localError) || errorLength == 0) {
972 // need at least 1 so that we don't access beyond the length of fromoffsets[]
973 errorLength = 1;
974 }
975
976 int32_t ferroffset;
977
978 if (useOffsets) {
979 // Unicode buffer offset of the start of the error UChars
980 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
981 if (ferroffset < 0) {
982 // approximation - the character started in the previous Unicode buffer
983 ferroffset = 0;
984 }
985
986 // get the corresponding byte offset out of fromoffsets[]
987 // go back if the offset is not known for some of the UChars
988 int32_t fromoffset;
989 do {
990 fromoffset = fromoffsets[ferroffset];
991 } while (fromoffset < 0 && --ferroffset >= 0);
992
993 // total input file offset =
994 // input file offset of the current byte buffer +
995 // byte buffer offset of where the current Unicode buffer is converted from +
996 // fromoffsets[Unicode offset]
997 ferroffset = infoffset + (prevbufp - buf) + fromoffset;
998 errtag = "problemCvtFromU";
999 } else {
1000 // Do not use fromoffsets if (t != NULL) because the Unicode text may
1001 // be different from what the offsets refer to.
1002
1003 // output file offset
1004 ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
1005 errtag = "problemCvtFromUOut";
1006 }
1007
1008 length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
1009
1010 // output the code points that caused the error
1011 UnicodeString str;
1012 for (i = 0; i < errorLength;) {
1013 if (i > 0) {
1014 str.append((UChar)uSP);
1015 }
1016 U16_NEXT(errorUChars, i, errorLength, c);
1017 if (c >= 0x100000) {
1018 str.append(nibbleToHex((uint8_t)(c >> 20)));
1019 }
1020 if (c >= 0x10000) {
1021 str.append(nibbleToHex((uint8_t)(c >> 16)));
1022 }
1023 str.append(nibbleToHex((uint8_t)(c >> 12)));
1024 str.append(nibbleToHex((uint8_t)(c >> 8)));
1025 str.append(nibbleToHex((uint8_t)(c >> 4)));
1026 str.append(nibbleToHex((uint8_t)c));
1027 }
1028
1029 initMsg(pname);
1030 u_wmsg(stderr, errtag,
1031 UnicodeString(pos, length, "").getTerminatedBuffer(),
1032 str.getTerminatedBuffer(),
1033 u_wmsg_errorName(err));
1034 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
1035
1036 willexit = TRUE;
1037 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
1038 }
1039
1040 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1041 // looping until they are; message key "premEnd" now obsolete.
1042
1043 // Finally, write the converted buffer to the output file
1044 size_t outlen = (size_t) (bufp - outbuf);
1045 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
1046 if (wr != outlen) {
1047 UnicodeString str(strerror(errno));
1048 initMsg(pname);
1049 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
1050 willexit = TRUE;
1051 }
1052
1053 if (willexit) {
1054 goto error_exit;
1055 }
1056 } while (!toSawEndOfUnicode);
1057 } while (!fromSawEndOfBytes);
1058 } while (!flush); // Stop when we have flushed the
1059 // converters (this means that it's
1060 // the end of output)
1061
1062 goto normal_exit;
1063
1064 error_exit:
1065 ret = FALSE;
1066
1067 normal_exit:
1068 // Cleanup.
1069
1070 ucnv_close(convfrom);
1071 ucnv_close(convto);
1072
1073 #if !UCONFIG_NO_TRANSLITERATION
1074 delete t;
1075 #endif
1076
1077 if (infile != stdin) {
1078 fclose(infile);
1079 }
1080
1081 return ret;
1082 }
1083
usage(const char * pname,int ecode)1084 static void usage(const char *pname, int ecode) {
1085 const UChar *msg;
1086 int32_t msgLen;
1087 UErrorCode err = U_ZERO_ERROR;
1088 FILE *fp = ecode ? stderr : stdout;
1089 int res;
1090
1091 initMsg(pname);
1092 msg =
1093 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord",
1094 &msgLen, &err);
1095 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1));
1096 UnicodeString mname(msg, msgLen + 1);
1097
1098 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer());
1099 if (!ecode) {
1100 if (!res) {
1101 fputc('\n', fp);
1102 }
1103 if (!u_wmsg(fp, "help")) {
1104 /* Now dump callbacks and finish. */
1105
1106 int i, count =
1107 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
1108 for (i = 0; i < count; ++i) {
1109 fprintf(fp, " %s", transcode_callbacks[i].name);
1110 }
1111 fputc('\n', fp);
1112 }
1113 }
1114
1115 exit(ecode);
1116 }
1117
1118 extern int
main(int argc,char ** argv)1119 main(int argc, char **argv)
1120 {
1121 FILE *outfile;
1122 int ret = 0;
1123
1124 size_t bufsz = DEFAULT_BUFSZ;
1125
1126 const char *fromcpage = 0;
1127 const char *tocpage = 0;
1128 const char *translit = 0;
1129 const char *outfilestr = 0;
1130 UBool fallback = FALSE;
1131
1132 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
1133 const void *fromuctxt = 0;
1134 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
1135 const void *touctxt = 0;
1136
1137 char **iter, **remainArgv, **remainArgvLimit;
1138 char **end = argv + argc;
1139
1140 const char *pname;
1141
1142 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
1143 const char *printName = 0;
1144
1145 UBool verbose = FALSE;
1146 UErrorCode status = U_ZERO_ERROR;
1147
1148 ConvertFile cf;
1149
1150 /* Initialize ICU */
1151 u_init(&status);
1152 if (U_FAILURE(status)) {
1153 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
1154 argv[0], u_errorName(status));
1155 exit(1);
1156 }
1157
1158 // Get and prettify pname.
1159 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
1160 #ifdef U_WINDOWS
1161 if (!pname) {
1162 pname = uprv_strrchr(*argv, '/');
1163 }
1164 #endif
1165 if (!pname) {
1166 pname = *argv;
1167 } else {
1168 ++pname;
1169 }
1170
1171 // First, get the arguments from command-line
1172 // to know the codepages to convert between
1173
1174 remainArgv = remainArgvLimit = argv + 1;
1175 for (iter = argv + 1; iter != end; iter++) {
1176 // Check for from charset
1177 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
1178 iter++;
1179 if (iter != end)
1180 fromcpage = *iter;
1181 else
1182 usage(pname, 1);
1183 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
1184 iter++;
1185 if (iter != end)
1186 tocpage = *iter;
1187 else
1188 usage(pname, 1);
1189 } else if (strcmp("-x", *iter) == 0) {
1190 iter++;
1191 if (iter != end)
1192 translit = *iter;
1193 else
1194 usage(pname, 1);
1195 } else if (!strcmp("--fallback", *iter)) {
1196 fallback = TRUE;
1197 } else if (!strcmp("--no-fallback", *iter)) {
1198 fallback = FALSE;
1199 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
1200 iter++;
1201 if (iter != end) {
1202 bufsz = atoi(*iter);
1203 if ((int) bufsz <= 0) {
1204 initMsg(pname);
1205 UnicodeString str(*iter);
1206 initMsg(pname);
1207 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
1208 return 3;
1209 }
1210 } else {
1211 usage(pname, 1);
1212 }
1213 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
1214 if (printTranslits) {
1215 usage(pname, 1);
1216 }
1217 printConvs = TRUE;
1218 } else if (strcmp("--default-code", *iter) == 0) {
1219 if (printTranslits) {
1220 usage(pname, 1);
1221 }
1222 printName = ucnv_getDefaultName();
1223 } else if (strcmp("--list-code", *iter) == 0) {
1224 if (printTranslits) {
1225 usage(pname, 1);
1226 }
1227
1228 iter++;
1229 if (iter != end) {
1230 UErrorCode e = U_ZERO_ERROR;
1231 printName = ucnv_getAlias(*iter, 0, &e);
1232 if (U_FAILURE(e) || !printName) {
1233 UnicodeString str(*iter);
1234 initMsg(pname);
1235 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
1236 return 2;
1237 }
1238 } else
1239 usage(pname, 1);
1240 } else if (strcmp("--canon", *iter) == 0) {
1241 printCanon = TRUE;
1242 } else if (strcmp("-L", *iter) == 0
1243 || !strcmp("--list-transliterators", *iter)) {
1244 if (printConvs) {
1245 usage(pname, 1);
1246 }
1247 printTranslits = TRUE;
1248 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
1249 || !strcmp("--help", *iter)) {
1250 usage(pname, 0);
1251 } else if (!strcmp("-c", *iter)) {
1252 fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
1253 } else if (!strcmp("--to-callback", *iter)) {
1254 iter++;
1255 if (iter != end) {
1256 const struct callback_ent *cbe = findCallback(*iter);
1257 if (cbe) {
1258 fromucallback = cbe->fromu;
1259 fromuctxt = cbe->fromuctxt;
1260 } else {
1261 UnicodeString str(*iter);
1262 initMsg(pname);
1263 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1264 return 4;
1265 }
1266 } else {
1267 usage(pname, 1);
1268 }
1269 } else if (!strcmp("--from-callback", *iter)) {
1270 iter++;
1271 if (iter != end) {
1272 const struct callback_ent *cbe = findCallback(*iter);
1273 if (cbe) {
1274 toucallback = cbe->tou;
1275 touctxt = cbe->touctxt;
1276 } else {
1277 UnicodeString str(*iter);
1278 initMsg(pname);
1279 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1280 return 4;
1281 }
1282 } else {
1283 usage(pname, 1);
1284 }
1285 } else if (!strcmp("-i", *iter)) {
1286 toucallback = UCNV_TO_U_CALLBACK_SKIP;
1287 } else if (!strcmp("--callback", *iter)) {
1288 iter++;
1289 if (iter != end) {
1290 const struct callback_ent *cbe = findCallback(*iter);
1291 if (cbe) {
1292 fromucallback = cbe->fromu;
1293 fromuctxt = cbe->fromuctxt;
1294 toucallback = cbe->tou;
1295 touctxt = cbe->touctxt;
1296 } else {
1297 UnicodeString str(*iter);
1298 initMsg(pname);
1299 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1300 return 4;
1301 }
1302 } else {
1303 usage(pname, 1);
1304 }
1305 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
1306 verbose = FALSE;
1307 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
1308 verbose = TRUE;
1309 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
1310 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname);
1311 return 0;
1312 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
1313 ++iter;
1314 if (iter != end && !outfilestr) {
1315 outfilestr = *iter;
1316 } else {
1317 usage(pname, 1);
1318 }
1319 } else if (0 == strcmp("--add-signature", *iter)) {
1320 cf.signature = 1;
1321 } else if (0 == strcmp("--remove-signature", *iter)) {
1322 cf.signature = -1;
1323 } else if (**iter == '-' && (*iter)[1]) {
1324 usage(pname, 1);
1325 } else {
1326 // move a non-option up in argv[]
1327 *remainArgvLimit++ = *iter;
1328 }
1329 }
1330
1331 if (printConvs || printName) {
1332 return printConverters(pname, printName, printCanon) ? 2 : 0;
1333 } else if (printTranslits) {
1334 return printTransliterators(printCanon) ? 3 : 0;
1335 }
1336
1337 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
1338 fromcpage = ucnv_getDefaultName();
1339 }
1340 if (!tocpage || !uprv_strcmp(tocpage, "-")) {
1341 tocpage = ucnv_getDefaultName();
1342 }
1343
1344 // Open the correct output file or connect to stdout for reading input
1345 if (outfilestr != 0 && strcmp(outfilestr, "-")) {
1346 outfile = fopen(outfilestr, "wb");
1347 if (outfile == 0) {
1348 UnicodeString str1(outfilestr, "");
1349 UnicodeString str2(strerror(errno), "");
1350 initMsg(pname);
1351 u_wmsg(stderr, "cantCreateOutputF",
1352 str1.getBuffer(), str2.getBuffer());
1353 return 1;
1354 }
1355 } else {
1356 outfilestr = "-";
1357 outfile = stdout;
1358 #ifdef USE_FILENO_BINARY_MODE
1359 if (setmode(fileno(outfile), O_BINARY) == -1) {
1360 u_wmsg(stderr, "cantSetOutBinMode");
1361 exit(-1);
1362 }
1363 #endif
1364 }
1365
1366 /* Loop again on the arguments to find all the input files, and
1367 convert them. */
1368
1369 cf.setBufferSize(bufsz);
1370
1371 if(remainArgv < remainArgvLimit) {
1372 for (iter = remainArgv; iter != remainArgvLimit; iter++) {
1373 if (!cf.convertFile(
1374 pname, fromcpage, toucallback, touctxt, tocpage,
1375 fromucallback, fromuctxt, fallback, translit, *iter,
1376 outfile, verbose)
1377 ) {
1378 goto error_exit;
1379 }
1380 }
1381 } else {
1382 if (!cf.convertFile(
1383 pname, fromcpage, toucallback, touctxt, tocpage,
1384 fromucallback, fromuctxt, fallback, translit, 0,
1385 outfile, verbose)
1386 ) {
1387 goto error_exit;
1388 }
1389 }
1390
1391 goto normal_exit;
1392 error_exit:
1393 ret = 1;
1394 normal_exit:
1395
1396 if (outfile != stdout) {
1397 fclose(outfile);
1398 }
1399
1400 return ret;
1401 }
1402
1403
1404 /*
1405 * Hey, Emacs, please set the following:
1406 *
1407 * Local Variables:
1408 * indent-tabs-mode: nil
1409 * End:
1410 *
1411 */
1412