1 /*****************************************************************************
2 *
3 * Copyright (C) 1999-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ******************************************************************************/
7
8 /*
9 * uconv(1): an iconv(1)-like converter using ICU.
10 *
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
13 *
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
16 *
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
19 */
20
21 #include <unicode/utypes.h>
22 #include <unicode/putil.h>
23 #include <unicode/ucnv.h>
24 #include <unicode/uenum.h>
25 #include <unicode/unistr.h>
26 #include <unicode/translit.h>
27 #include <unicode/uset.h>
28 #include <unicode/uclean.h>
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include <string.h>
33 #include <stdlib.h>
34
35 #include "cmemory.h"
36 #include "cstring.h"
37 #include "ustrfmt.h"
38
39 #include "unicode/uwmsg.h"
40
41 U_NAMESPACE_USE
42
43 #if (defined(U_WINDOWS) || defined(U_CYGWIN)) && !defined(__STRICT_ANSI__)
44 #include <io.h>
45 #include <fcntl.h>
46 #if defined(U_WINDOWS)
47 #define USE_FILENO_BINARY_MODE 1
48 /* Windows likes to rename Unix-like functions */
49 #ifndef fileno
50 #define fileno _fileno
51 #endif
52 #ifndef setmode
53 #define setmode _setmode
54 #endif
55 #ifndef O_BINARY
56 #define O_BINARY _O_BINARY
57 #endif
58 #endif
59 #endif
60
61 #ifdef UCONVMSG_LINK
62 /* below from the README */
63 #include "unicode/utypes.h"
64 #include "unicode/udata.h"
65 U_CFUNC char uconvmsg_dat[];
66 #endif
67
/* Number of elements in a true array (not a pointer), as an int32_t.
   The whole expansion is parenthesized so that it behaves as a single
   primary expression wherever it is used. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* Default size, in bytes, of the conversion buffers. */
#define DEFAULT_BUFSZ 4096
/* Name of the resource bundle that holds the tool's messages. */
#define UCONVMSG "uconvmsg"
72
73 static UResourceBundle *gBundle = 0; /* Bundle containing messages. */
74
75 /*
76 * Initialize the message bundle so that message strings can be fetched
77 * by u_wmsg().
78 *
79 */
80
initMsg(const char * pname)81 static void initMsg(const char *pname) {
82 static int ps = 0;
83
84 if (!ps) {
85 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */
86 UErrorCode err = U_ZERO_ERROR;
87
88 ps = 1;
89
90 /* Set up our static data - if any */
91 #ifdef UCONVMSG_LINK
92 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
93 if (U_FAILURE(err)) {
94 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
95 pname, u_errorName(err));
96 err = U_ZERO_ERROR; /* It may still fail */
97 }
98 #endif
99
100 /* Get messages. */
101 gBundle = u_wmsg_setPath(UCONVMSG, &err);
102 if (U_FAILURE(err)) {
103 fprintf(stderr,
104 "%s: warning: couldn't open bundle %s: %s\n",
105 pname, UCONVMSG, u_errorName(err));
106 #ifdef UCONVMSG_LINK
107 fprintf(stderr,
108 "%s: setAppData was called, internal data %s failed to load\n",
109 pname, UCONVMSG);
110 #endif
111
112 err = U_ZERO_ERROR;
113 /* that was try #1, try again with a path */
114 uprv_strcpy(dataPath, u_getDataDirectory());
115 uprv_strcat(dataPath, U_FILE_SEP_STRING);
116 uprv_strcat(dataPath, UCONVMSG);
117
118 gBundle = u_wmsg_setPath(dataPath, &err);
119 if (U_FAILURE(err)) {
120 fprintf(stderr,
121 "%s: warning: still couldn't open bundle %s: %s\n",
122 pname, dataPath, u_errorName(err));
123 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
124 }
125 }
126 }
127 }
128
129 /* Mapping of callback names to the callbacks passed to the converter
130 API. */
131
132 static struct callback_ent {
133 const char *name;
134 UConverterFromUCallback fromu;
135 const void *fromuctxt;
136 UConverterToUCallback tou;
137 const void *touctxt;
138 } transcode_callbacks[] = {
139 { "substitute",
140 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
141 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
142 { "skip",
143 UCNV_FROM_U_CALLBACK_SKIP, 0,
144 UCNV_TO_U_CALLBACK_SKIP, 0 },
145 { "stop",
146 UCNV_FROM_U_CALLBACK_STOP, 0,
147 UCNV_TO_U_CALLBACK_STOP, 0 },
148 { "escape",
149 UCNV_FROM_U_CALLBACK_ESCAPE, 0,
150 UCNV_TO_U_CALLBACK_ESCAPE, 0},
151 { "escape-icu",
152 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
153 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
154 { "escape-java",
155 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
156 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
157 { "escape-c",
158 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
159 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
160 { "escape-xml",
161 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
162 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
163 { "escape-xml-hex",
164 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
165 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
166 { "escape-xml-dec",
167 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
168 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
169 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
170 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
171 };
172
173 /* Return a pointer to a callback record given its name. */
174
findCallback(const char * name)175 static const struct callback_ent *findCallback(const char *name) {
176 int i, count =
177 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
178
179 /* We'll do a linear search, there aren't many of them and bsearch()
180 may not be that portable. */
181
182 for (i = 0; i < count; ++i) {
183 if (!uprv_stricmp(name, transcode_callbacks[i].name)) {
184 return &transcode_callbacks[i];
185 }
186 }
187
188 return 0;
189 }
190
191 /* Print converter information. If lookfor is set, only that converter will
192 be printed, otherwise all converters will be printed. If canon is non
193 zero, tags and aliases for each converter are printed too, in the format
194 expected for convrters.txt(5). */
195
printConverters(const char * pname,const char * lookfor,UBool canon)196 static int printConverters(const char *pname, const char *lookfor,
197 UBool canon)
198 {
199 UErrorCode err = U_ZERO_ERROR;
200 int32_t num;
201 uint16_t num_stds;
202 const char **stds;
203
204 /* If there is a specified name, just handle that now. */
205
206 if (lookfor) {
207 if (!canon) {
208 printf("%s\n", lookfor);
209 return 0;
210 } else {
211 /* Because we are printing a canonical name, we need the
212 true converter name. We've done that already except for
213 the default name (because we want to print the exact
214 name one would get when calling ucnv_getDefaultName()
215 in non-canon mode). But since we do not know at this
216 point if we have the default name or something else, we
217 need to normalize again to the canonical converter
218 name. */
219
220 const char *truename = ucnv_getAlias(lookfor, 0, &err);
221 if (U_SUCCESS(err)) {
222 lookfor = truename;
223 } else {
224 err = U_ZERO_ERROR;
225 }
226 }
227 }
228
229 /* Print converter names. We come here for one of two reasons: we
230 are printing all the names (lookfor was null), or we have a
231 single converter to print but in canon mode, hence we need to
232 get to it in order to print everything. */
233
234 num = ucnv_countAvailable();
235 if (num <= 0) {
236 initMsg(pname);
237 u_wmsg(stderr, "cantGetNames");
238 return -1;
239 }
240 if (lookfor) {
241 num = 1; /* We know where we want to be. */
242 }
243
244 num_stds = ucnv_countStandards();
245 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
246 if (!stds) {
247 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
248 return -1;
249 } else {
250 uint16_t s;
251
252 if (canon) {
253 printf("{ ");
254 }
255 for (s = 0; s < num_stds; ++s) {
256 stds[s] = ucnv_getStandard(s, &err);
257 if (canon) {
258 printf("%s ", stds[s]);
259 }
260 if (U_FAILURE(err)) {
261 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
262 goto error_cleanup;
263 }
264 }
265 if (canon) {
266 puts("}");
267 }
268 }
269
270 for (int32_t i = 0; i < num; i++) {
271 const char *name;
272 uint16_t num_aliases;
273
274 /* Set the name either to what we are looking for, or
275 to the current converter name. */
276
277 if (lookfor) {
278 name = lookfor;
279 } else {
280 name = ucnv_getAvailableName(i);
281 }
282
283 /* Get all the aliases associated to the name. */
284
285 err = U_ZERO_ERROR;
286 num_aliases = ucnv_countAliases(name, &err);
287 if (U_FAILURE(err)) {
288 printf("%s", name);
289
290 UnicodeString str(name, "");
291 putchar('\t');
292 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
293 u_wmsg_errorName(err));
294 goto error_cleanup;
295 } else {
296 uint16_t a, s, t;
297
298 /* Write all the aliases and their tags. */
299
300 for (a = 0; a < num_aliases; ++a) {
301 const char *alias = ucnv_getAlias(name, a, &err);
302
303 if (U_FAILURE(err)) {
304 UnicodeString str(name, "");
305 putchar('\t');
306 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
307 u_wmsg_errorName(err));
308 goto error_cleanup;
309 }
310
311 /* Print the current alias so that it looks right. */
312 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
313 alias,
314 (canon ? "" : " "));
315
316 /* Look (slowly, linear searching) for a tag. */
317
318 if (canon) {
319 /* -1 to skip the last standard */
320 for (s = t = 0; s < num_stds-1; ++s) {
321 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
322 if (U_SUCCESS(err)) {
323 /* List the standard tags */
324 const char *standardName;
325 UBool isFirst = TRUE;
326 UErrorCode enumError = U_ZERO_ERROR;
327 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
328 /* See if this alias is supported by this standard. */
329 if (!strcmp(standardName, alias)) {
330 if (!t) {
331 printf(" {");
332 t = 1;
333 }
334 /* Print a * after the default standard name */
335 printf(" %s%s", stds[s], (isFirst ? "*" : ""));
336 }
337 isFirst = FALSE;
338 }
339 }
340 }
341 if (t) {
342 printf(" }");
343 }
344 }
345 /* Terminate this entry. */
346 if (canon) {
347 puts("");
348 }
349
350 /* Move on. */
351 }
352 /* Terminate this entry. */
353 if (!canon) {
354 puts("");
355 }
356 }
357 }
358
359 /* Free temporary data. */
360
361 uprv_free(stds);
362
363 /* Success. */
364
365 return 0;
366 error_cleanup:
367 uprv_free(stds);
368 return -1;
369 }
370
371 /* Print all available transliterators. If canon is non zero, print
372 one transliterator per line. */
373
printTransliterators(UBool canon)374 static int printTransliterators(UBool canon)
375 {
376 #if UCONFIG_NO_TRANSLITERATION
377 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
378 return 1;
379 #else
380 UErrorCode status = U_ZERO_ERROR;
381 UEnumeration *ids = utrans_openIDs(&status);
382 int32_t i, numtrans = uenum_count(ids, &status);
383
384 char sepchar = canon ? '\n' : ' ';
385
386 for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) {
387 int32_t len;
388 const char *nextTrans = uenum_next(ids, &len, &status);
389
390 printf("%s", nextTrans);
391 if (i < numtrans - 1) {
392 putchar(sepchar);
393 }
394 }
395
396 uenum_close(ids);
397
398 /* Add a terminating newline if needed. */
399
400 if (sepchar != '\n') {
401 putchar('\n');
402 }
403
404 /* Success. */
405
406 return 0;
407 #endif
408 }
409
// Code points that the converter loop and the chunking code treat specially.
enum {
    uSP  = 0x0020,  // space
    uCR  = 0x000d,  // carriage return
    uLF  = 0x000a,  // line feed
    uNL  = 0x0085,  // newline
    uLS  = 0x2028,  // line separator
    uPS  = 0x2029,  // paragraph separator
    uSig = 0xfeff   // signature/BOM character
};
419
420 static inline int32_t
getChunkLimit(const UnicodeString & prev,const UnicodeString & s)421 getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
422 // find one of
423 // CR, LF, CRLF, NL, LS, PS
424 // for paragraph ends (see UAX #13/Unicode 4)
425 // and include it in the chunk
426 // all of these characters are on the BMP
427 // do not include FF or VT in case they are part of a paragraph
428 // (important for bidi contexts)
429 static const UChar paraEnds[] = {
430 0xd, 0xa, 0x85, 0x2028, 0x2029
431 };
432 enum {
433 iCR, iLF, iNL, iLS, iPS, iCount
434 };
435
436 // first, see if there is a CRLF split between prev and s
437 if (prev.endsWith(paraEnds + iCR, 1)) {
438 if (s.startsWith(paraEnds + iLF, 1)) {
439 return 1; // split CRLF, include the LF
440 } else if (!s.isEmpty()) {
441 return 0; // complete the last chunk
442 } else {
443 return -1; // wait for actual further contents to arrive
444 }
445 }
446
447 const UChar *u = s.getBuffer(), *limit = u + s.length();
448 UChar c;
449
450 while (u < limit) {
451 c = *u++;
452 if (
453 ((c < uSP) && (c == uCR || c == uLF)) ||
454 (c == uNL) ||
455 ((c & uLS) == uLS)
456 ) {
457 if (c == uCR) {
458 // check for CRLF
459 if (u == limit) {
460 return -1; // LF may be in the next chunk
461 } else if (*u == uLF) {
462 ++u; // include the LF in this chunk
463 }
464 }
465 return (int32_t)(u - s.getBuffer());
466 }
467 }
468
469 return -1; // continue collecting the chunk
470 }
471
// How a converter deals with the U+FEFF Unicode signature character (BOM);
// these are the values returned by cnvSigType().
enum {
    // the converter cannot convert U+FEFF
    CNV_NO_FEFF,
    // the converter can convert U+FEFF
    CNV_WITH_FEFF,
    // the converter automatically adds/detects U+FEFF
    CNV_ADDS_FEFF
};
477
478 static inline UChar
nibbleToHex(uint8_t n)479 nibbleToHex(uint8_t n) {
480 n &= 0xf;
481 return
482 n <= 9 ?
483 (UChar)(0x30 + n) :
484 (UChar)((0x61 - 10) + n);
485 }
486
487 // check the converter's Unicode signature properties;
488 // the fromUnicode side of the converter must be in its initial state
489 // and will be reset again if it was used
490 static int32_t
cnvSigType(UConverter * cnv)491 cnvSigType(UConverter *cnv) {
492 UErrorCode err;
493 int32_t result;
494
495 // test if the output charset can convert U+FEFF
496 USet *set = uset_open(1, 0);
497 err = U_ZERO_ERROR;
498 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
499 if (U_SUCCESS(err) && uset_contains(set, uSig)) {
500 result = CNV_WITH_FEFF;
501 } else {
502 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
503 }
504 uset_close(set);
505
506 if (result == CNV_WITH_FEFF) {
507 // test if the output charset emits a signature anyway
508 const UChar a[1] = { 0x61 }; // "a"
509 const UChar *in;
510
511 char buffer[20];
512 char *out;
513
514 in = a;
515 out = buffer;
516 err = U_ZERO_ERROR;
517 ucnv_fromUnicode(cnv,
518 &out, buffer + sizeof(buffer),
519 &in, a + 1,
520 NULL, TRUE, &err);
521 ucnv_resetFromUnicode(cnv);
522
523 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
524 U_SUCCESS(err)
525 ) {
526 result = CNV_ADDS_FEFF;
527 }
528 }
529
530 return result;
531 }
532
533 class ConvertFile {
534 public:
ConvertFile()535 ConvertFile() :
536 buf(NULL), outbuf(NULL), fromoffsets(NULL),
537 bufsz(0), signature(0) {}
538
539 void
setBufferSize(size_t bufferSize)540 setBufferSize(size_t bufferSize) {
541 bufsz = bufferSize;
542
543 buf = new char[2 * bufsz];
544 outbuf = buf + bufsz;
545
546 // +1 for an added U+FEFF in the intermediate Unicode buffer
547 fromoffsets = new int32_t[bufsz + 1];
548 }
549
~ConvertFile()550 ~ConvertFile() {
551 delete [] buf;
552 delete [] fromoffsets;
553 }
554
555 UBool convertFile(const char *pname,
556 const char *fromcpage,
557 UConverterToUCallback toucallback,
558 const void *touctxt,
559 const char *tocpage,
560 UConverterFromUCallback fromucallback,
561 const void *fromuctxt,
562 UBool fallback,
563 const char *translit,
564 const char *infilestr,
565 FILE * outfile, int verbose);
566 private:
567 friend int main(int argc, char **argv);
568
569 char *buf, *outbuf;
570 int32_t *fromoffsets;
571
572 size_t bufsz;
573 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
574 };
575
576 // Convert a file from one encoding to another
577 UBool
convertFile(const char * pname,const char * fromcpage,UConverterToUCallback toucallback,const void * touctxt,const char * tocpage,UConverterFromUCallback fromucallback,const void * fromuctxt,UBool fallback,const char * translit,const char * infilestr,FILE * outfile,int verbose)578 ConvertFile::convertFile(const char *pname,
579 const char *fromcpage,
580 UConverterToUCallback toucallback,
581 const void *touctxt,
582 const char *tocpage,
583 UConverterFromUCallback fromucallback,
584 const void *fromuctxt,
585 UBool fallback,
586 const char *translit,
587 const char *infilestr,
588 FILE * outfile, int verbose)
589 {
590 FILE *infile;
591 UBool ret = TRUE;
592 UConverter *convfrom = 0;
593 UConverter *convto = 0;
594 UErrorCode err = U_ZERO_ERROR;
595 UBool flush;
596 const char *cbufp, *prevbufp;
597 char *bufp;
598
599 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */
600
601 const UChar *unibuf, *unibufbp;
602 UChar *unibufp;
603
604 size_t rd, wr;
605
606 #if !UCONFIG_NO_TRANSLITERATION
607 Transliterator *t = 0; // Transliterator acting on Unicode data.
608 UnicodeString chunk; // One chunk of the text being collected for transformation.
609 #endif
610 UnicodeString u; // String to do the transliteration.
611 int32_t ulen;
612
613 // use conversion offsets for error messages
614 // unless a transliterator is used -
615 // a text transformation will reorder characters in unpredictable ways
616 UBool useOffsets = TRUE;
617
618 // Open the correct input file or connect to stdin for reading input
619
620 if (infilestr != 0 && strcmp(infilestr, "-")) {
621 infile = fopen(infilestr, "rb");
622 if (infile == 0) {
623 UnicodeString str1(infilestr, "");
624 str1.append((UChar32) 0);
625 UnicodeString str2(strerror(errno), "");
626 str2.append((UChar32) 0);
627 initMsg(pname);
628 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
629 return FALSE;
630 }
631 } else {
632 infilestr = "-";
633 infile = stdin;
634 #ifdef USE_FILENO_BINARY_MODE
635 if (setmode(fileno(stdin), O_BINARY) == -1) {
636 initMsg(pname);
637 u_wmsg(stderr, "cantSetInBinMode");
638 return FALSE;
639 }
640 #endif
641 }
642
643 if (verbose) {
644 fprintf(stderr, "%s:\n", infilestr);
645 }
646
647 #if !UCONFIG_NO_TRANSLITERATION
648 // Create transliterator as needed.
649
650 if (translit != NULL && *translit) {
651 UParseError parse;
652 UnicodeString str(translit), pestr;
653
654 /* Create from rules or by ID as needed. */
655
656 parse.line = -1;
657
658 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
659 t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
660 } else {
661 t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
662 }
663
664 if (U_FAILURE(err)) {
665 str.append((UChar32) 0);
666 initMsg(pname);
667
668 if (parse.line >= 0) {
669 UChar linebuf[20], offsetbuf[20];
670 uprv_itou(linebuf, 20, parse.line, 10, 0);
671 uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
672 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
673 u_wmsg_errorName(err), linebuf, offsetbuf);
674 } else {
675 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
676 u_wmsg_errorName(err));
677 }
678
679 if (t) {
680 delete t;
681 t = 0;
682 }
683 goto error_exit;
684 }
685
686 useOffsets = FALSE;
687 }
688 #endif
689
690 // Create codepage converter. If the codepage or its aliases weren't
691 // available, it returns NULL and a failure code. We also set the
692 // callbacks, and return errors in the same way.
693
694 convfrom = ucnv_open(fromcpage, &err);
695 if (U_FAILURE(err)) {
696 UnicodeString str(fromcpage, "");
697 initMsg(pname);
698 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
699 u_wmsg_errorName(err));
700 goto error_exit;
701 }
702 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
703 if (U_FAILURE(err)) {
704 initMsg(pname);
705 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
706 goto error_exit;
707 }
708
709 convto = ucnv_open(tocpage, &err);
710 if (U_FAILURE(err)) {
711 UnicodeString str(tocpage, "");
712 initMsg(pname);
713 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
714 u_wmsg_errorName(err));
715 goto error_exit;
716 }
717 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
718 if (U_FAILURE(err)) {
719 initMsg(pname);
720 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
721 goto error_exit;
722 }
723 ucnv_setFallback(convto, fallback);
724
725 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
726 int8_t sig;
727
728 // OK, we can convert now.
729 sig = signature;
730 rd = 0;
731
732 do {
733 willexit = FALSE;
734
735 // input file offset at the beginning of the next buffer
736 infoffset += rd;
737
738 rd = fread(buf, 1, bufsz, infile);
739 if (ferror(infile) != 0) {
740 UnicodeString str(strerror(errno));
741 initMsg(pname);
742 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
743 goto error_exit;
744 }
745
746 // Convert the read buffer into the new encoding via Unicode.
747 // After the call 'unibufp' will be placed behind the last
748 // character that was converted in the 'unibuf'.
749 // Also the 'cbufp' is positioned behind the last converted
750 // character.
751 // At the last conversion in the file, flush should be set to
752 // true so that we get all characters converted.
753 //
754 // The converter must be flushed at the end of conversion so
755 // that characters on hold also will be written.
756
757 cbufp = buf;
758 flush = (UBool)(rd != bufsz);
759
760 // convert until the input is consumed
761 do {
762 // remember the start of the current byte-to-Unicode conversion
763 prevbufp = cbufp;
764
765 unibuf = unibufp = u.getBuffer((int32_t)bufsz);
766
767 // Use bufsz instead of u.getCapacity() for the targetLimit
768 // so that we don't overflow fromoffsets[].
769 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
770 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
771
772 ulen = (int32_t)(unibufp - unibuf);
773 u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
774
775 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
776 // converting all of the input bytes.
777 // It works like this because ucnv_toUnicode() returns only under the
778 // following conditions:
779 // - an error occurred during conversion (an error code is set)
780 // - the target buffer is filled (the error code indicates an overflow)
781 // - the source is consumed
782 // That is, if the error code does not indicate a failure,
783 // not even an overflow, then the source must be consumed entirely.
784 fromSawEndOfBytes = (UBool)U_SUCCESS(err);
785
786 if (err == U_BUFFER_OVERFLOW_ERROR) {
787 err = U_ZERO_ERROR;
788 } else if (U_FAILURE(err)) {
789 char pos[32], errorBytes[32];
790 int8_t i, length, errorLength;
791
792 UErrorCode localError = U_ZERO_ERROR;
793 errorLength = (int8_t)sizeof(errorBytes);
794 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
795 if (U_FAILURE(localError) || errorLength == 0) {
796 errorLength = 1;
797 }
798
799 // print the input file offset of the start of the error bytes:
800 // input file offset of the current byte buffer +
801 // length of the just consumed bytes -
802 // length of the error bytes
803 length =
804 (int8_t)sprintf(pos, "%d",
805 (int)(infoffset + (cbufp - buf) - errorLength));
806
807 // output the bytes that caused the error
808 UnicodeString str;
809 for (i = 0; i < errorLength; ++i) {
810 if (i > 0) {
811 str.append((UChar)uSP);
812 }
813 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
814 str.append(nibbleToHex((uint8_t)errorBytes[i]));
815 }
816
817 initMsg(pname);
818 u_wmsg(stderr, "problemCvtToU",
819 UnicodeString(pos, length, "").getTerminatedBuffer(),
820 str.getTerminatedBuffer(),
821 u_wmsg_errorName(err));
822
823 willexit = TRUE;
824 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
825 }
826
827 // Replaced a check for whether the input was consumed by
828 // looping until it is; message key "premEndInput" now obsolete.
829
830 if (ulen == 0) {
831 continue;
832 }
833
834 // remove a U+FEFF Unicode signature character if requested
835 if (sig < 0) {
836 if (u.charAt(0) == uSig) {
837 u.remove(0, 1);
838
839 // account for the removed UChar and offset
840 --ulen;
841
842 if (useOffsets) {
843 // remove an offset from fromoffsets[] as well
844 // to keep the array parallel with the UChars
845 memmove(fromoffsets, fromoffsets + 1, ulen * 4);
846 }
847
848 }
849 sig = 0;
850 }
851
852 #if !UCONFIG_NO_TRANSLITERATION
853 // Transliterate/transform if needed.
854
855 // For transformation, we use chunking code -
856 // collect Unicode input until, for example, an end-of-line,
857 // then transform and output-convert that and continue collecting.
858 // This makes the transformation result independent of the buffer size
859 // while avoiding the slower keyboard mode.
860 // The end-of-chunk characters are completely included in the
861 // transformed string in case they are to be transformed themselves.
862 if (t != NULL) {
863 UnicodeString out;
864 int32_t chunkLimit;
865
866 do {
867 chunkLimit = getChunkLimit(chunk, u);
868 if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
869 // use all of the rest at the end of the text
870 chunkLimit = u.length();
871 }
872 if (chunkLimit >= 0) {
873 // complete the chunk and transform it
874 chunk.append(u, 0, chunkLimit);
875 u.remove(0, chunkLimit);
876 t->transliterate(chunk);
877
878 // append the transformation result to the result and empty the chunk
879 out.append(chunk);
880 chunk.remove();
881 } else {
882 // continue collecting the chunk
883 chunk.append(u);
884 break;
885 }
886 } while (!u.isEmpty());
887
888 u = out;
889 ulen = u.length();
890 }
891 #endif
892
893 // add a U+FEFF Unicode signature character if requested
894 // and possible/necessary
895 if (sig > 0) {
896 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
897 u.insert(0, (UChar)uSig);
898
899 if (useOffsets) {
900 // insert a pseudo-offset into fromoffsets[] as well
901 // to keep the array parallel with the UChars
902 memmove(fromoffsets + 1, fromoffsets, ulen * 4);
903 fromoffsets[0] = -1;
904 }
905
906 // account for the additional UChar and offset
907 ++ulen;
908 }
909 sig = 0;
910 }
911
912 // Convert the Unicode buffer into the destination codepage
913 // Again 'bufp' will be placed behind the last converted character
914 // And 'unibufp' will be placed behind the last converted unicode character
915 // At the last conversion flush should be set to true to ensure that
916 // all characters left get converted
917
918 unibuf = unibufbp = u.getBuffer();
919
920 do {
921 bufp = outbuf;
922
923 // Use fromSawEndOfBytes in addition to the flush flag -
924 // it indicates whether the intermediate Unicode string
925 // contains the very last UChars for the very last input bytes.
926 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
927 &unibufbp,
928 unibuf + ulen,
929 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
930
931 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
932 // converting all of the intermediate UChars.
933 // See comment for fromSawEndOfBytes.
934 toSawEndOfUnicode = (UBool)U_SUCCESS(err);
935
936 if (err == U_BUFFER_OVERFLOW_ERROR) {
937 err = U_ZERO_ERROR;
938 } else if (U_FAILURE(err)) {
939 UChar errorUChars[4];
940 const char *errtag;
941 char pos[32];
942 UChar32 c;
943 int8_t i, length, errorLength;
944
945 UErrorCode localError = U_ZERO_ERROR;
946 errorLength = (int8_t)LENGTHOF(errorUChars);
947 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
948 if (U_FAILURE(localError) || errorLength == 0) {
949 // need at least 1 so that we don't access beyond the length of fromoffsets[]
950 errorLength = 1;
951 }
952
953 int32_t ferroffset;
954
955 if (useOffsets) {
956 // Unicode buffer offset of the start of the error UChars
957 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
958 if (ferroffset < 0) {
959 // approximation - the character started in the previous Unicode buffer
960 ferroffset = 0;
961 }
962
963 // get the corresponding byte offset out of fromoffsets[]
964 // go back if the offset is not known for some of the UChars
965 int32_t fromoffset;
966 do {
967 fromoffset = fromoffsets[ferroffset];
968 } while (fromoffset < 0 && --ferroffset >= 0);
969
970 // total input file offset =
971 // input file offset of the current byte buffer +
972 // byte buffer offset of where the current Unicode buffer is converted from +
973 // fromoffsets[Unicode offset]
974 ferroffset = infoffset + (prevbufp - buf) + fromoffset;
975 errtag = "problemCvtFromU";
976 } else {
977 // Do not use fromoffsets if (t != NULL) because the Unicode text may
978 // be different from what the offsets refer to.
979
980 // output file offset
981 ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
982 errtag = "problemCvtFromUOut";
983 }
984
985 length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
986
987 // output the code points that caused the error
988 UnicodeString str;
989 for (i = 0; i < errorLength;) {
990 if (i > 0) {
991 str.append((UChar)uSP);
992 }
993 U16_NEXT(errorUChars, i, errorLength, c);
994 if (c >= 0x100000) {
995 str.append(nibbleToHex((uint8_t)(c >> 20)));
996 }
997 if (c >= 0x10000) {
998 str.append(nibbleToHex((uint8_t)(c >> 16)));
999 }
1000 str.append(nibbleToHex((uint8_t)(c >> 12)));
1001 str.append(nibbleToHex((uint8_t)(c >> 8)));
1002 str.append(nibbleToHex((uint8_t)(c >> 4)));
1003 str.append(nibbleToHex((uint8_t)c));
1004 }
1005
1006 initMsg(pname);
1007 u_wmsg(stderr, errtag,
1008 UnicodeString(pos, length, "").getTerminatedBuffer(),
1009 str.getTerminatedBuffer(),
1010 u_wmsg_errorName(err));
1011 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
1012
1013 willexit = TRUE;
1014 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
1015 }
1016
1017 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1018 // looping until they are; message key "premEnd" now obsolete.
1019
1020 // Finally, write the converted buffer to the output file
1021 size_t outlen = (size_t) (bufp - outbuf);
1022 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
1023 if (wr != outlen) {
1024 UnicodeString str(strerror(errno));
1025 initMsg(pname);
1026 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
1027 willexit = TRUE;
1028 }
1029
1030 if (willexit) {
1031 goto error_exit;
1032 }
1033 } while (!toSawEndOfUnicode);
1034 } while (!fromSawEndOfBytes);
1035 } while (!flush); // Stop when we have flushed the
1036 // converters (this means that it's
1037 // the end of output)
1038
1039 goto normal_exit;
1040
1041 error_exit:
1042 ret = FALSE;
1043
1044 normal_exit:
1045 // Cleanup.
1046
1047 ucnv_close(convfrom);
1048 ucnv_close(convto);
1049
1050 #if !UCONFIG_NO_TRANSLITERATION
1051 delete t;
1052 #endif
1053
1054 if (infile != stdin) {
1055 fclose(infile);
1056 }
1057
1058 return ret;
1059 }
1060
usage(const char * pname,int ecode)1061 static void usage(const char *pname, int ecode) {
1062 const UChar *msg;
1063 int32_t msgLen;
1064 UErrorCode err = U_ZERO_ERROR;
1065 FILE *fp = ecode ? stderr : stdout;
1066 int res;
1067
1068 initMsg(pname);
1069 msg =
1070 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord",
1071 &msgLen, &err);
1072 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1));
1073 UnicodeString mname(msg, msgLen + 1);
1074
1075 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer());
1076 if (!ecode) {
1077 if (!res) {
1078 fputc('\n', fp);
1079 }
1080 if (!u_wmsg(fp, "help")) {
1081 /* Now dump callbacks and finish. */
1082
1083 int i, count =
1084 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
1085 for (i = 0; i < count; ++i) {
1086 fprintf(fp, " %s", transcode_callbacks[i].name);
1087 }
1088 fputc('\n', fp);
1089 }
1090 }
1091
1092 exit(ecode);
1093 }
1094
1095 extern int
main(int argc,char ** argv)1096 main(int argc, char **argv)
1097 {
1098 FILE *outfile;
1099 int ret = 0;
1100
1101 size_t bufsz = DEFAULT_BUFSZ;
1102
1103 const char *fromcpage = 0;
1104 const char *tocpage = 0;
1105 const char *translit = 0;
1106 const char *outfilestr = 0;
1107 UBool fallback = FALSE;
1108
1109 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
1110 const void *fromuctxt = 0;
1111 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
1112 const void *touctxt = 0;
1113
1114 char **iter, **remainArgv, **remainArgvLimit;
1115 char **end = argv + argc;
1116
1117 const char *pname;
1118
1119 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
1120 const char *printName = 0;
1121
1122 UBool verbose = FALSE;
1123 UErrorCode status = U_ZERO_ERROR;
1124
1125 ConvertFile cf;
1126
1127 /* Initialize ICU */
1128 u_init(&status);
1129 if (U_FAILURE(status)) {
1130 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
1131 argv[0], u_errorName(status));
1132 exit(1);
1133 }
1134
1135 // Get and prettify pname.
1136 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
1137 #ifdef U_WINDOWS
1138 if (!pname) {
1139 pname = uprv_strrchr(*argv, '/');
1140 }
1141 #endif
1142 if (!pname) {
1143 pname = *argv;
1144 } else {
1145 ++pname;
1146 }
1147
1148 // First, get the arguments from command-line
1149 // to know the codepages to convert between
1150
1151 remainArgv = remainArgvLimit = argv + 1;
1152 for (iter = argv + 1; iter != end; iter++) {
1153 // Check for from charset
1154 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
1155 iter++;
1156 if (iter != end)
1157 fromcpage = *iter;
1158 else
1159 usage(pname, 1);
1160 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
1161 iter++;
1162 if (iter != end)
1163 tocpage = *iter;
1164 else
1165 usage(pname, 1);
1166 } else if (strcmp("-x", *iter) == 0) {
1167 iter++;
1168 if (iter != end)
1169 translit = *iter;
1170 else
1171 usage(pname, 1);
1172 } else if (!strcmp("--fallback", *iter)) {
1173 fallback = TRUE;
1174 } else if (!strcmp("--no-fallback", *iter)) {
1175 fallback = FALSE;
1176 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
1177 iter++;
1178 if (iter != end) {
1179 bufsz = atoi(*iter);
1180 if ((int) bufsz <= 0) {
1181 initMsg(pname);
1182 UnicodeString str(*iter);
1183 initMsg(pname);
1184 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
1185 return 3;
1186 }
1187 } else {
1188 usage(pname, 1);
1189 }
1190 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
1191 if (printTranslits) {
1192 usage(pname, 1);
1193 }
1194 printConvs = TRUE;
1195 } else if (strcmp("--default-code", *iter) == 0) {
1196 if (printTranslits) {
1197 usage(pname, 1);
1198 }
1199 printName = ucnv_getDefaultName();
1200 } else if (strcmp("--list-code", *iter) == 0) {
1201 if (printTranslits) {
1202 usage(pname, 1);
1203 }
1204
1205 iter++;
1206 if (iter != end) {
1207 UErrorCode e = U_ZERO_ERROR;
1208 printName = ucnv_getAlias(*iter, 0, &e);
1209 if (U_FAILURE(e) || !printName) {
1210 UnicodeString str(*iter);
1211 initMsg(pname);
1212 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
1213 return 2;
1214 }
1215 } else
1216 usage(pname, 1);
1217 } else if (strcmp("--canon", *iter) == 0) {
1218 printCanon = TRUE;
1219 } else if (strcmp("-L", *iter) == 0
1220 || !strcmp("--list-transliterators", *iter)) {
1221 if (printConvs) {
1222 usage(pname, 1);
1223 }
1224 printTranslits = TRUE;
1225 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
1226 || !strcmp("--help", *iter)) {
1227 usage(pname, 0);
1228 } else if (!strcmp("-c", *iter)) {
1229 fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
1230 } else if (!strcmp("--to-callback", *iter)) {
1231 iter++;
1232 if (iter != end) {
1233 const struct callback_ent *cbe = findCallback(*iter);
1234 if (cbe) {
1235 fromucallback = cbe->fromu;
1236 fromuctxt = cbe->fromuctxt;
1237 } else {
1238 UnicodeString str(*iter);
1239 initMsg(pname);
1240 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1241 return 4;
1242 }
1243 } else {
1244 usage(pname, 1);
1245 }
1246 } else if (!strcmp("--from-callback", *iter)) {
1247 iter++;
1248 if (iter != end) {
1249 const struct callback_ent *cbe = findCallback(*iter);
1250 if (cbe) {
1251 toucallback = cbe->tou;
1252 touctxt = cbe->touctxt;
1253 } else {
1254 UnicodeString str(*iter);
1255 initMsg(pname);
1256 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1257 return 4;
1258 }
1259 } else {
1260 usage(pname, 1);
1261 }
1262 } else if (!strcmp("-i", *iter)) {
1263 toucallback = UCNV_TO_U_CALLBACK_SKIP;
1264 } else if (!strcmp("--callback", *iter)) {
1265 iter++;
1266 if (iter != end) {
1267 const struct callback_ent *cbe = findCallback(*iter);
1268 if (cbe) {
1269 fromucallback = cbe->fromu;
1270 fromuctxt = cbe->fromuctxt;
1271 toucallback = cbe->tou;
1272 touctxt = cbe->touctxt;
1273 } else {
1274 UnicodeString str(*iter);
1275 initMsg(pname);
1276 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1277 return 4;
1278 }
1279 } else {
1280 usage(pname, 1);
1281 }
1282 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
1283 verbose = FALSE;
1284 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
1285 verbose = TRUE;
1286 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
1287 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname);
1288 return 0;
1289 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
1290 ++iter;
1291 if (iter != end && !outfilestr) {
1292 outfilestr = *iter;
1293 } else {
1294 usage(pname, 1);
1295 }
1296 } else if (0 == strcmp("--add-signature", *iter)) {
1297 cf.signature = 1;
1298 } else if (0 == strcmp("--remove-signature", *iter)) {
1299 cf.signature = -1;
1300 } else if (**iter == '-' && (*iter)[1]) {
1301 usage(pname, 1);
1302 } else {
1303 // move a non-option up in argv[]
1304 *remainArgvLimit++ = *iter;
1305 }
1306 }
1307
1308 if (printConvs || printName) {
1309 return printConverters(pname, printName, printCanon) ? 2 : 0;
1310 } else if (printTranslits) {
1311 return printTransliterators(printCanon) ? 3 : 0;
1312 }
1313
1314 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
1315 fromcpage = ucnv_getDefaultName();
1316 }
1317 if (!tocpage || !uprv_strcmp(tocpage, "-")) {
1318 tocpage = ucnv_getDefaultName();
1319 }
1320
1321 // Open the correct output file or connect to stdout for reading input
1322 if (outfilestr != 0 && strcmp(outfilestr, "-")) {
1323 outfile = fopen(outfilestr, "wb");
1324 if (outfile == 0) {
1325 UnicodeString str1(outfilestr, "");
1326 UnicodeString str2(strerror(errno), "");
1327 initMsg(pname);
1328 u_wmsg(stderr, "cantCreateOutputF",
1329 str1.getBuffer(), str2.getBuffer());
1330 return 1;
1331 }
1332 } else {
1333 outfilestr = "-";
1334 outfile = stdout;
1335 #ifdef USE_FILENO_BINARY_MODE
1336 if (setmode(fileno(outfile), O_BINARY) == -1) {
1337 u_wmsg(stderr, "cantSetOutBinMode");
1338 exit(-1);
1339 }
1340 #endif
1341 }
1342
1343 /* Loop again on the arguments to find all the input files, and
1344 convert them. */
1345
1346 cf.setBufferSize(bufsz);
1347
1348 if(remainArgv < remainArgvLimit) {
1349 for (iter = remainArgv; iter != remainArgvLimit; iter++) {
1350 if (!cf.convertFile(
1351 pname, fromcpage, toucallback, touctxt, tocpage,
1352 fromucallback, fromuctxt, fallback, translit, *iter,
1353 outfile, verbose)
1354 ) {
1355 goto error_exit;
1356 }
1357 }
1358 } else {
1359 if (!cf.convertFile(
1360 pname, fromcpage, toucallback, touctxt, tocpage,
1361 fromucallback, fromuctxt, fallback, translit, 0,
1362 outfile, verbose)
1363 ) {
1364 goto error_exit;
1365 }
1366 }
1367
1368 goto normal_exit;
1369 error_exit:
1370 #if !UCONFIG_NO_LEGACY_CONVERSION
1371 ret = 1;
1372 #else
1373 fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
1374 #endif
1375 normal_exit:
1376
1377 if (outfile != stdout) {
1378 fclose(outfile);
1379 }
1380
1381 return ret;
1382 }
1383
1384
1385 /*
1386 * Hey, Emacs, please set the following:
1387 *
1388 * Local Variables:
1389 * indent-tabs-mode: nil
1390 * End:
1391 *
1392 */
1393